wgit 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ae86258e3aac086f2215d1fb3e3b871cd4f4839884eb7c359ac9863148f1a307
4
- data.tar.gz: 2eafa3a2b7b6d6ff99aaf00466ccd5f9214f049a9f5836b45aaf6d17bfbe226b
3
+ metadata.gz: c4dc572e7d48a95d423e175ad2d6a791be52bf56f6c391e9152c075f45672ee8
4
+ data.tar.gz: fd6a9c9d1e38906f500543ae92169f7bcb9e64de1567f4f29ec24f7ca74c60d8
5
5
  SHA512:
6
- metadata.gz: 5735051c62d3d22db75a42c7d33cd8b1f78d4500b27c0e136980382ffb0e6830ee8d355a0f9151c4d844dd0eb91dc860cd5bd1855ec68099985c34d55ba1a3aa
7
- data.tar.gz: 0b5ab8f7f60e69f791fd4b51995f5dbf2f38a77954b164062b64b982847b7fd3844b016fbfaff186a7178f68d2fedc8736fd60b8c16ae82bc11c7a84a5892e42
6
+ metadata.gz: f4fc425aa1b25254dba343151794893ff26a3682e58dd08bdc918c180da89ecb08cf0f1013837e41527f564d36bb0784c6ecd204e53c1276cb5b32401c88ffab
7
+ data.tar.gz: 2ce1250ad7312257bc021e7414f164ec174e07e469a6956478cccaa7b05f159981a7a69ef6b713db5454233c8a03ea8487f14184c1e26dcab28ed8e81250507d
@@ -106,7 +106,8 @@ module Wgit
106
106
  doc = crawl_url(base_url, &block)
107
107
  return nil if doc.nil?
108
108
 
109
- crawled_urls = []
109
+ path = base_url.path.empty? ? '/' : base_url.path
110
+ crawled_urls = [path]
110
111
  external_urls = doc.external_links
111
112
  internal_urls = doc.internal_links
112
113
 
@@ -149,7 +150,10 @@ module Wgit
149
150
  def fetch(url)
150
151
  response = resolve(url)
151
152
  response.body.empty? ? nil : response.body
152
- rescue
153
+ rescue Exception => ex
154
+ Wgit.logger.debug(
155
+ "Wgit::Crawler#fetch('#{url}') exception: #{ex.message}"
156
+ )
153
157
  nil
154
158
  end
155
159
 
@@ -158,12 +162,16 @@ module Wgit
158
162
  # an exception. Redirects can be disabled by setting `redirect_limit: 0`.
159
163
  # The Net::HTTPResponse will be returned.
160
164
  def resolve(url, redirect_limit: 5)
161
- redirect_count = 0
165
+ redirect_count = -1
162
166
  begin
163
167
  raise "Too many redirects" if redirect_count >= redirect_limit
164
- response = Net::HTTP.get_response(URI.parse(url))
165
- url = response['location']
166
168
  redirect_count += 1
169
+
170
+ response = Net::HTTP.get_response(URI(url))
171
+ location = Wgit::Url.new(response.fetch('location', ''))
172
+ if not location.empty?
173
+ url = location.is_relative? ? url.to_base.concat(location) : location
174
+ end
167
175
  end while response.is_a?(Net::HTTPRedirection)
168
176
  response
169
177
  end
@@ -62,6 +62,8 @@ module Wgit
62
62
  @html = html ||= ""
63
63
  @doc = init_nokogiri
64
64
  @score = 0.0
65
+
66
+ process_url_and_html
65
67
 
66
68
  # Dynamically run the init_*_from_html methods.
67
69
  Document.private_instance_methods(false).each do |method|
@@ -80,7 +82,9 @@ module Wgit
80
82
  @html = obj.fetch("html", "")
81
83
  @doc = init_nokogiri
82
84
  @score = obj.fetch("score", 0.0)
83
-
85
+
86
+ process_url_and_html
87
+
84
88
  # Dynamically run the init_*_from_object methods.
85
89
  Document.private_instance_methods(false).each do |method|
86
90
  if method.to_s.start_with?("init_") &&
@@ -174,7 +178,7 @@ module Wgit
174
178
  # @return [Boolean] True if @html is nil/empty, false otherwise.
175
179
  def empty?
176
180
  return true if @html.nil?
177
- @html.strip.empty?
181
+ @html.empty?
178
182
  end
179
183
 
180
184
  # Uses Nokogiri's xpath method to search the doc's html and return the
@@ -194,47 +198,54 @@ module Wgit
194
198
  def css(selector)
195
199
  @doc.css(selector)
196
200
  end
197
-
198
- # Get all internal links of this Document.
201
+
202
+ # Get all internal links of this Document in relative form. Internal
203
+ # meaning a link to another page on this website. Also see
204
+ # Wgit::Document#internal_full_links.
199
205
  #
200
206
  # @return [Array<Wgit::Url>] self's internal/relative URL's.
201
207
  def internal_links
202
208
  return [] if @links.empty?
203
- @links.reject do |link|
204
- begin
205
- not link.relative_link?
209
+
210
+ links = @links.
211
+ reject do |link|
212
+ not link.relative_link?(base: @url.to_base)
206
213
  rescue
207
214
  true
208
- end
209
- end
215
+ end.
216
+ map(&:to_path)
217
+
218
+ process_arr(links)
210
219
  end
211
220
 
212
221
  # Get all internal links of this Document and append them to this
213
- # Document's base URL.
222
+ # Document's base URL making them absolute. Also see
223
+ # Wgit::Document#internal_links.
214
224
  #
215
225
  # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
216
226
  # form.
217
227
  def internal_full_links
218
228
  in_links = internal_links
219
229
  return [] if in_links.empty?
220
- in_links.map do |link|
221
- link.replace("/" + link) unless link.start_with?("/")
222
- Wgit::Url.new(@url.to_base + link)
223
- end
230
+ in_links.map { |link| @url.to_base.concat(link) }
224
231
  end
225
232
 
226
- # Get all external links of this Document.
233
+ # Get all external links of this Document. External meaning a link to
234
+ # another website.
227
235
  #
228
236
  # @return [Array<Wgit::Url>] self's external/absolute URL's.
229
237
  def external_links
230
238
  return [] if @links.empty?
231
- @links.reject do |link|
232
- begin
233
- link.relative_link?
239
+
240
+ links = @links.
241
+ reject do |link|
242
+ link.relative_link?(base: @url.to_base)
234
243
  rescue
235
244
  true
236
- end
237
- end
245
+ end.
246
+ map { |link| link.end_with?('/') ? link.chop : link }
247
+
248
+ process_arr(links)
238
249
  end
239
250
 
240
251
  # Searches against the @text for the given search query.
@@ -253,8 +264,8 @@ module Wgit
253
264
  # sentence.
254
265
  # @return [Array<String>] Representing the search results.
255
266
  def search(query, sentence_limit = 80)
256
- raise "A search value must be provided" if query.empty?
257
- raise "The sentence length value must be even" if sentence_limit.odd?
267
+ raise "A search query must be provided" if query.empty?
268
+ raise "The sentence_limit value must be even" if sentence_limit.odd?
258
269
 
259
270
  results = {}
260
271
  regex = Regexp.new(query, Regexp::IGNORECASE)
@@ -469,29 +480,16 @@ module Wgit
469
480
  if array.is_a?(Array)
470
481
  array.map! { |str| process_str(str) }
471
482
  array.reject! { |str| str.is_a?(String) ? str.empty? : false }
483
+ array.compact!
472
484
  array.uniq!
473
485
  end
474
486
  array
475
487
  end
476
-
477
- # Modifies internal links by removing this doc's base or host URL, if
478
- # present. http://www.google.co.uk/about.html (with or without the
479
- # protocol prefix) will become about.html meaning it'll appear within
480
- # Document#internal_links.
481
- def process_internal_links(links)
482
- links.map! do |link|
483
- host_or_base = if link.start_with?("http")
484
- @url.base
485
- else
486
- @url.host
487
- end
488
- if link.start_with?(host_or_base)
489
- link.sub!(host_or_base, "")
490
- link.replace(link[1..-1]) if link.start_with?("/")
491
- link.strip!
492
- end
493
- link
494
- end
488
+
489
+ # Ensure the @url and @html Strings are correctly encoded etc.
490
+ def process_url_and_html
491
+ @url = process_str(@url)
492
+ @html = process_str(@html)
495
493
  end
496
494
 
497
495
  ### Default init_* (Document extension) methods. ###
@@ -547,7 +545,6 @@ module Wgit
547
545
  xpath = "//a/@href"
548
546
  result = find_in_html(xpath, singleton: false) do |links|
549
547
  if links
550
- links.reject! { |link| link == "/" }
551
548
  links.map! do |link|
552
549
  begin
553
550
  Wgit::Url.new(link)
@@ -555,8 +552,7 @@ module Wgit
555
552
  nil
556
553
  end
557
554
  end
558
- links.reject! { |link| link.nil? }
559
- process_internal_links(links)
555
+ links.compact!
560
556
  end
561
557
  links
562
558
  end
@@ -95,21 +95,29 @@ module Wgit
95
95
  url
96
96
  end
97
97
 
98
- # Returns if link is a relative or absolute Url. How it works:
99
- # URI.split("http://www.google.co.uk/about.html") returns the following:
100
- # array[2]: "www.google.co.uk", array[5]: "/about.html".
101
- # This means that all external links in a page are expected to have a
102
- # protocol prefix e.g. "http://", otherwise the link is treated as an
103
- # internal link (regardless of whether it is valid or not).
98
+ # Returns whether link is a relative Url (true) or an absolute Url (false).
99
+ # All external links in a page are expected to have a protocol prefix e.g.
100
+ # "http://", otherwise the link is treated as an internal link (regardless
101
+ # of whether it is valid or not). The only exception is if base is provided
102
+ # and link is a page within that site; then the link is relative.
104
103
  #
105
104
  # @param link [Wgit::Url, String] The url to test if relative or not.
105
+ # @param base [String] The Url base e.g. http://www.google.co.uk.
106
106
  # @return [Boolean] True if relative, false if absolute.
107
107
  # @raise [RuntimeError] If the link is invalid.
108
- def self.relative_link?(link)
109
- link_segs = URI.split(link)
110
- if not link_segs[2].nil? and not link_segs[2].empty?
111
- false
112
- elsif not link_segs[5].nil? and not link_segs[5].empty?
108
+ def self.relative_link?(link, base: nil)
109
+ if base and URI(base).host.nil?
110
+ raise "Invalid base, must contain protocol prefix: #{base}"
111
+ end
112
+
113
+ uri = URI(link)
114
+ if not uri.host.nil? and not uri.host.empty?
115
+ if base
116
+ uri.host == URI(base).host
117
+ else
118
+ false
119
+ end
120
+ elsif not uri.path.nil? and not uri.path.empty?
113
121
  true
114
122
  else
115
123
  raise "Invalid link: #{link}"
@@ -128,11 +136,14 @@ module Wgit
128
136
  Wgit::Url.new(url + "/" + link)
129
137
  end
130
138
 
131
- # Returns if self is a relative or absolute Url.
139
+ # Returns whether self is a relative or absolute Url. If base is provided and
140
+ # self is a page within that site then the link is relative.
141
+ # See Wgit.relative_link? for more information.
142
+ #
132
143
  # @return [Boolean] True if relative, false if absolute.
133
144
  # @raise [RuntimeError] If the link is invalid.
134
- def relative_link?
135
- Wgit::Url.relative_link?(self)
145
+ def relative_link?(base: nil)
146
+ Wgit::Url.relative_link?(self, base: base)
136
147
  end
137
148
 
138
149
  # Determines if self is a valid Url or not.
@@ -142,7 +153,7 @@ module Wgit
142
153
  Wgit::Url.valid?(self)
143
154
  end
144
155
 
145
- # Concats self (Url) and the link.
156
+ # Concats self and the link.
146
157
  #
147
158
  # @param link [Wgit::Url, String] The link to concat with self.
148
159
  # @return [Wgit::Url] self + "/" + link
@@ -172,6 +183,14 @@ module Wgit
172
183
  def to_url
173
184
  self
174
185
  end
186
+
187
+ # Returns a new Wgit::Url containing just the scheme/protocol of this URL
188
+ # e.g. Given http://www.google.co.uk, http is returned.
189
+ #
190
+ # @return [Wgit::Url] Containing just the scheme/protocol.
191
+ def to_scheme
192
+ Wgit::Url.new(@uri.scheme)
193
+ end
175
194
 
176
195
  # Returns a new Wgit::Url containing just the host of this URL e.g.
177
196
  # Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
@@ -181,24 +200,54 @@ module Wgit
181
200
  Wgit::Url.new(@uri.host)
182
201
  end
183
202
 
184
- # Returns the base of this URL e.g. the protocol and host combined.
185
- # How it works:
186
- # URI.split("http://www.google.co.uk/about.html") returns the following:
187
- # array[0]: "http://", array[2]: "www.google.co.uk", which we use.
203
+ # Returns only the base of this URL e.g. the protocol and host combined.
188
204
  #
189
- # @return [Wgit::Url] Base of self (Url) e.g. http://www.google.co.uk.
205
+ # @return [Wgit::Url] Base of self e.g. http://www.google.co.uk.
190
206
  def to_base
191
207
  if Wgit::Url.relative_link?(self)
192
208
  raise "A relative link doesn't have a base URL: #{self}"
193
209
  end
194
- url_segs = URI.split(self)
195
- if url_segs[0].nil? or url_segs[2].nil? or url_segs[2].empty?
210
+ if @uri.scheme.nil? or @uri.host.nil? or @uri.host.empty?
196
211
  raise "Both a protocol and host are needed: #{self}"
197
212
  end
198
- base = "#{url_segs[0]}://#{url_segs[2]}"
213
+ base = "#{@uri.scheme}://#{@uri.host}"
199
214
  Wgit::Url.new(base)
200
215
  end
201
-
216
+
217
+ # Returns the path of this URL e.g. the bit after the host without its leading/trailing slash.
218
+ # For example:
219
+ # Wgit::Url.new("http://www.google.co.uk/about.html/").to_path returns
220
+ # "about.html". See Wgit::Url#to_endpoint if you want the slashes.
221
+ #
222
+ # @return [Wgit::Url] Path of self e.g. about.html.
223
+ def to_path
224
+ path = @uri.path
225
+ return Wgit::Url.new(path) if path == '/'
226
+ path = path[1..-1] if path.start_with?('/')
227
+ path.chop! if path.end_with?('/')
228
+ Wgit::Url.new(path)
229
+ end
230
+
231
+ # Returns the endpoint of this URL e.g. the bit after the host with any
232
+ # slashes included. For example:
233
+ # Wgit::Url.new("http://www.google.co.uk/about.html/").to_endpoint returns
234
+ # "/about.html/". See Wgit::Url#to_path if you don't want the slashes.
235
+ #
236
+ # @return [Wgit::Url] Endpoint of self e.g. /about.html/.
237
+ def to_endpoint
238
+ endpoint = @uri.path
239
+ endpoint = '/' + endpoint unless endpoint.start_with?('/')
240
+ Wgit::Url.new(endpoint)
241
+ end
242
+
243
+ # Returns a new Wgit::Url containing just the query string of this URL
244
+ # e.g. Given http://google.com?q=ruby, q=ruby is returned.
245
+ #
246
+ # @return [Wgit::Url] Containing just the query string.
247
+ def to_query_string
248
+ Wgit::Url.new(@uri.query)
249
+ end
250
+
202
251
  # Returns a Hash containing this Url's instance vars excluding @uri.
203
252
  # Used when storing the URL in a Database e.g. MongoDB etc.
204
253
  #
@@ -210,9 +259,20 @@ module Wgit
210
259
  end
211
260
 
212
261
  alias :to_hash :to_h
262
+ alias :uri :to_uri
263
+ alias :url :to_url
264
+ alias :scheme :to_scheme
265
+ alias :to_protocol :to_scheme
266
+ alias :protocol :to_scheme
213
267
  alias :host :to_host
214
268
  alias :base :to_base
269
+ alias :path :to_path
270
+ alias :endpoint :to_endpoint
271
+ alias :query_string :to_query_string
272
+ alias :query :to_query_string
215
273
  alias :internal_link? :relative_link?
274
+ alias :is_relative? :relative_link?
275
+ alias :is_internal? :relative_link?
216
276
  alias :crawled? :crawled
217
277
  end
218
278
  end
@@ -3,5 +3,5 @@
3
3
  # @author Michael Telford
4
4
  module Wgit
5
5
  # The current gem version of Wgit.
6
- VERSION = "0.0.9".freeze
6
+ VERSION = "0.0.10".freeze
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -28,16 +28,16 @@ dependencies:
28
28
  name: yard
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '0.9'
33
+ version: 0.9.20
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '0.9'
40
+ version: 0.9.20
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: byebug
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -108,6 +108,34 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '1.3'
111
+ - !ruby/object:Gem::Dependency
112
+ name: webmock
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '3.6'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '3.6'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rack
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '2.0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '2.0'
111
139
  - !ruby/object:Gem::Dependency
112
140
  name: nokogiri
113
141
  requirement: !ruby/object:Gem::Requirement
@@ -128,21 +156,21 @@ dependencies:
128
156
  requirements:
129
157
  - - "~>"
130
158
  - !ruby/object:Gem::Version
131
- version: '2.6'
159
+ version: 2.8.0
132
160
  type: :runtime
133
161
  prerelease: false
134
162
  version_requirements: !ruby/object:Gem::Requirement
135
163
  requirements:
136
164
  - - "~>"
137
165
  - !ruby/object:Gem::Version
138
- version: '2.6'
139
- description: Wgit is a WWW indexer/scraper which crawls URL's, retrieves and serialises
140
- their page contents for later use. You can use Wgit to copy entire websites if required.
141
- Wgit also provides a means to search indexed documents stored in a database. Therefore,
142
- this library provides the main components of a WWW search engine. The Wgit API is
143
- easily extended allowing you to pull out the parts of a webpage that are important
144
- to you, the code snippets or images for example. As Wgit is a library, it's very
145
- useful in many different application types.
166
+ version: 2.8.0
167
+ description: Fundamentally, Wgit is a WWW indexer/scraper which crawls URL's, retrieves
168
+ and serialises their page contents for later use. You can use Wgit to copy entire
169
+ websites if required. Wgit also provides a means to search indexed documents stored
170
+ in a database. Therefore, this library provides the main components of a WWW search
171
+ engine. The Wgit API is easily extended allowing you to pull out the parts of a
172
+ webpage that are important to you, the code snippets or images for example. As Wgit
173
+ is a library, it has uses in many different application types.
146
174
  email: michael.telford@live.com
147
175
  executables: []
148
176
  extensions: []
@@ -166,6 +194,7 @@ licenses:
166
194
  - MIT
167
195
  metadata:
168
196
  source_code_uri: https://github.com/michaeltelford/wgit
197
+ yard.run: yri
169
198
  allowed_push_host: https://rubygems.org
170
199
  post_install_message:
171
200
  rdoc_options: []
@@ -183,9 +212,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
183
212
  version: '0'
184
213
  requirements: []
185
214
  rubyforge_project:
186
- rubygems_version: 2.7.8
215
+ rubygems_version: 2.7.6
187
216
  signing_key:
188
217
  specification_version: 4
189
- summary: Wgit is the ruby version of GNU's wget with an easy to use API for programmatic
190
- web scraping, indexing and searching.
218
+ summary: Wgit is a Ruby gem similar in nature to GNU's `wget`. It provides an easy
219
+ to use API for programmatic web scraping, indexing and searching.
191
220
  test_files: []