wgit 0.0.18 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 26e6a29fbf72b0ecbbc487c8aba9ec243a260b4761805c6c7923f2af82fa94f5
-  data.tar.gz: 9e15ad14991418fc3b4b2c0dafacac617b32197e825ad72887d91182c8ddf652
+  metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
+  data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
 SHA512:
-  metadata.gz: 4b17b8467abf13b186e88fb63fe8630163612bc685d7d521122fdc4c693e7d9229c59888afa1191189b3838317fa29e028c90757b880177c1e7a8f81a0a38047
-  data.tar.gz: 6fb7bb518ca3b9e520e1edbf25b4c265018686b5c61e134d623c38efa1bdf5073affb5205e47aee3a32a4502b56205080ded00a79bd6f138cf9178b019a2b32d
+  metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
+  data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990
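These digests cover the two archives packed inside the published .gem file (a .gem is a tar archive containing metadata.gz and data.tar.gz). A minimal verification sketch, assuming you have already run `gem fetch wgit -v 0.2.0` in the current directory:

    require 'digest'
    require 'rubygems/package'

    # Read each entry of the .gem tar and digest the two archives that
    # checksums.yaml records; compare the output against the values above.
    File.open('wgit-0.2.0.gem', 'rb') do |file|
      tar = Gem::Package::TarReader.new(file)
      tar.each do |entry|
        next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)

        puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
      end
    end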
lib/wgit.rb CHANGED
@@ -8,7 +8,6 @@ require_relative 'wgit/url'
 require_relative 'wgit/document'
 require_relative 'wgit/document_extensions'
 require_relative 'wgit/crawler'
-require_relative 'wgit/database/connection_details'
 require_relative 'wgit/database/model'
 require_relative 'wgit/database/database'
 require_relative 'wgit/indexer'
lib/wgit/assertable.rb CHANGED
@@ -1,8 +1,7 @@
 # frozen_string_literal: true
 
 module Wgit
-  # Module containing assert methods including type checking which can be used
-  # for asserting the integrity of method definitions etc.
+  # Module containing assertion methods including type checking and duck typing.
   module Assertable
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
@@ -11,21 +10,23 @@ module Wgit
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
-    DEFAULT_REQUIRED_KEYS_MSG = 'Some or all of the required keys are not present: %s'
+    DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \
+present: %s"
 
-    # Tests if the obj is of a given type.
+    # Tests if the obj is_a? given type; raises an Exception if not.
     #
     # @param obj [Object] The Object to test.
     # @param type_or_types [Type, Array<Type>] The type/types that obj must
     #   belong to or an exception is thrown.
-    # @param msg [String] The raised RuntimeError message, if provided.
+    # @param msg [String] The raised StandardError message, if provided.
+    # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj on successful assertion.
     def assert_types(obj, type_or_types, msg = nil)
      msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
      match = if type_or_types.respond_to?(:any?)
-               type_or_types.any? { |type| obj.instance_of?(type) }
+               type_or_types.any? { |type| obj.is_a?(type) }
              else
-               obj.instance_of?(type_or_types)
+               obj.is_a?(type_or_types)
              end
      raise msg unless match
 
@@ -33,36 +34,36 @@ module Wgit
     end
 
     # Each object within arr must match one of the types listed in
-    # type_or_types or an exception is raised using msg, if provided.
+    # type_or_types; or an exception is raised using msg, if provided.
     #
     # @param arr [Enumerable#each] Enumerable of objects to type check.
     # @param type_or_types [Type, Array<Type>] The allowed type(s).
-    # @param msg [String] The raised RuntimeError message, if provided.
+    # @param msg [String] The raised StandardError message, if provided.
+    # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
       raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
 
-      arr.each do |obj|
-        assert_types(obj, type_or_types, msg)
-      end
+      arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
 
     # The obj_or_objs must respond_to? all of the given methods or an
     # Exception is raised using msg, if provided.
     #
-    # @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
+    # @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check.
     # @param methods [Array<Symbol>] The methods to :respond_to?.
-    # @param msg [String] The raised RuntimeError message, if provided.
+    # @param msg [String] The raised StandardError message, if provided.
+    # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
       methods = [methods] unless methods.respond_to?(:all?)
+
       if obj_or_objs.respond_to?(:each)
-        obj_or_objs.each do |obj|
-          _assert_respond_to(obj, methods, msg)
-        end
+        obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
       else
         _assert_respond_to(obj_or_objs, methods, msg)
       end
+
       obj_or_objs
     end
 
@@ -71,6 +72,7 @@ module Wgit
     # @param hash [Hash] The hash which should include the required keys.
     # @param keys [Array<String, Symbol>] The keys whose presence to assert.
     # @param msg [String] The raised KeyError message, if provided.
+    # @raise [KeyError] If the assertion fails.
     # @return [Hash] The given hash on successful assertion.
     def assert_required_keys(hash, keys, msg = nil)
       msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(', '))
@@ -93,12 +95,7 @@ module Wgit
       obj
     end
 
-    alias assert_type assert_types
-    alias type assert_types
-    alias types assert_types
+    alias assert_type     assert_types
     alias assert_arr_type assert_arr_types
-    alias arr_type assert_arr_types
-    alias arr_types assert_arr_types
-    alias respond_to assert_respond_to
   end
 end
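As a usage sketch (the including class and values here are hypothetical): the switch from instance_of? to is_a? means instances of subclasses now satisfy a type assertion, e.g. a Wgit::Url passes a String check because Wgit::Url subclasses String:

    require 'wgit'

    class Scraper
      include Wgit::Assertable
    end

    scraper = Scraper.new
    scraper.assert_types('hello', String)                # => "hello"
    scraper.assert_types(5, [Integer, Float])            # => 5

    # is_a? (unlike instance_of?) matches subclasses, so a Wgit::Url
    # now passes a String type assertion:
    scraper.assert_types(Wgit::Url.new('http://example.com'), String)

    scraper.assert_arr_types([1, 2.0], [Integer, Float]) # => [1, 2.0]
    scraper.assert_respond_to('abc', %i[upcase strip])   # => "abc"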
lib/wgit/core_ext.rb CHANGED
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 # Script which extends Ruby's core functionality when parsed.
-# Needs to be required separately using `require 'wgit/core_ext'`.
+# Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`.
 
 require_relative 'url'
 
@@ -22,19 +22,15 @@ module Enumerable
   #
   # @return [Array<Wgit::Url>] The converted URL's.
   def to_urls
-    map do |element|
-      process_url_element(element)
-    end
+    map { |element| process_url_element(element) }
   end
 
-  # Converts each String instance into a Wgit::Url object and returns the
-  # updated array. Modifies the receiver.
+  # Converts each String instance into a Wgit::Url object and returns self
+  # having modified the receiver.
   #
   # @return [Array<Wgit::Url>] Self containing the converted URL's.
   def to_urls!
-    map! do |element|
-      process_url_element(element)
-    end
+    map! { |element| process_url_element(element) }
   end
 end
 
@@ -42,9 +38,5 @@ private
 
 # Converts the element to a Wgit::Url if the element is a String.
 def process_url_element(element)
-  if element.is_a? String
-    element.to_url
-  else
-    element
-  end
+  element.is_a?(String) ? element.to_url : element
 end
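A quick sketch of these extensions in use (the URLs are placeholders); as the updated comment notes, 'wgit/core_ext' must be required in addition to 'wgit' itself:

    require 'wgit'
    require 'wgit/core_ext'

    'http://example.com'.to_url # => a Wgit::Url

    ['http://a.example', 'http://b.example'].to_urls
    # => [Wgit::Url, Wgit::Url]; non-String elements are left untouched.

    urls = ['http://c.example', Wgit::Url.new('http://d.example')]
    urls.to_urls! # Converts in place and returns the modified receiver.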
lib/wgit/crawler.rb CHANGED
@@ -7,142 +7,24 @@ require_relative 'assertable'
 require 'net/http' # Requires 'uri'.
 
 module Wgit
-  # The Crawler class provides a means of crawling web based Wgit::Url's, turning
-  # their HTML into Wgit::Document instances.
+  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
+  # serialising their HTML into Wgit::Document instances.
   class Crawler
     include Assertable
 
-    # The default maximum amount of allowed URL redirects.
-    @default_redirect_limit = 5
-
-    class << self
-      # Class level instance accessor methods for @default_redirect_limit.
-      # Call using Wgit::Crawler.default_redirect_limit etc.
-      attr_accessor :default_redirect_limit
-    end
-
-    # The urls to crawl.
-    attr_reader :urls
-
-    # The docs of the crawled @urls.
-    attr_reader :docs
+    # The amount of allowed redirects before raising an error. Set to 0 to
+    # disable redirects completely.
+    attr_accessor :redirect_limit
 
     # The Net::HTTPResponse of the most recently crawled URL or nil.
     attr_reader :last_response
 
-    # Initializes the Crawler and sets the @urls and @docs.
-    #
-    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
-    #   Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
-    #   will NOT update if they happen to redirect when crawled. If in doubt,
-    #   pass the url(s) directly to the crawl_* method instead of to the new
-    #   method.
-    def initialize(*urls)
-      self.[](*urls)
-      @docs = []
-    end
-
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
-    #   crawl_url or crawl_site. Note that the urls passed here will NOT update
-    #   if they happen to redirect when crawled. If in doubt, pass the url(s)
-    #   directly to the crawl_* method instead of to the new method.
-    def urls=(urls)
-      @urls = []
-      Wgit::Utils.each(urls) { |url| add_url(url) }
-    end
-
-    # Sets this Crawler's @urls.
-    #
-    # @param urls [*Wgit::Url] The URL's to crawl in the future using either
-    #   crawl_url or crawl_site. Note that the urls passed here will NOT update
-    #   if they happen to redirect when crawled. If in doubt, pass the url(s)
-    #   directly to the crawl_* method instead of to the new method.
-    def [](*urls)
-      # If urls is nil then add_url (when called later) will set @urls = []
-      # so we do nothing here.
-      unless urls.nil?
-        # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
-        # outer array is bogus so we use the inner one only.
-        if urls.is_a?(Enumerable) &&
-           urls.length == 1 &&
-           urls.first.is_a?(Enumerable)
-          urls = urls.first
-        end
-
-        # Here we call urls= method using self because the param name is also
-        # urls which conflicts.
-        self.urls = urls
-      end
-    end
-
-    # Adds the url to this Crawler's @urls.
+    # Initializes and returns a Wgit::Crawler instance.
     #
-    # @param url [Wgit::Url] A URL to crawl later by calling a crawl_* method.
-    #   Note that the url added here will NOT update if it happens to
-    #   redirect when crawled. If in doubt, pass the url directly to the
-    #   crawl_* method instead of to the new method.
-    def <<(url)
-      add_url(url)
-    end
-
-    # Crawls one or more individual urls using Wgit::Crawler#crawl_url
-    # underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
-    # that any external redirects are followed. Use Wgit::Crawler#crawl_url if
-    # this isn't desirable.
-    #
-    # @param urls [Array<Wgit::Url>] The URLs to crawl.
-    # @yield [Wgit::Document] If provided, the block is given each crawled
-    #   Document. Otherwise each doc is added to @docs which can be accessed
-    #   by Crawler#docs after this method returns.
-    # @return [Wgit::Document] The last Document crawled.
-    def crawl_urls(urls = @urls, &block)
-      raise 'No urls to crawl' unless urls
-
-      @docs = []
-      doc = nil
-      Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
-      doc || @docs.last
-    end
-
-    # Crawl the url returning the response Wgit::Document or nil if an error
-    # occurs.
-    #
-    # @param url [Wgit::Url] The URL to crawl.
-    # @param follow_external_redirects [Boolean] Whether or not to follow
-    #   an external redirect. False will return nil for such a crawl. If false,
-    #   you must also provide a `host:` parameter.
-    # @param host [Wgit::Url, String] Specify the host by which
-    #   an absolute redirect is determined to be internal or not. Must be
-    #   absolute and contain a protocol prefix. For example, a `host:` of
-    #   'http://www.example.com' will only allow redirects for Urls with a
-    #   `to_host` value of 'www.example.com'.
-    # @yield [Wgit::Document] The crawled HTML Document regardless if the
-    #   crawl was successful or not. Therefore, the Document#url can be used.
-    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
-    #   crawl was unsuccessful.
-    def crawl_url(
-      url = @urls.first,
-      follow_external_redirects: true,
-      host: nil
-    )
-      assert_type(url, Wgit::Url)
-      if !follow_external_redirects && host.nil?
-        raise 'host cannot be nil if follow_external_redirects is false'
-      end
-
-      html = fetch(
-        url,
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      )
-      url.crawled = true
-
-      doc = Wgit::Document.new(url, html)
-      yield(doc) if block_given?
-
-      doc.empty? ? nil : doc
+    # @param redirect_limit [Integer] The amount of allowed redirects before
+    #   raising an error. Set to 0 to disable redirects completely.
+    def initialize(redirect_limit: 5)
+      @redirect_limit = redirect_limit
     end
 
     # Crawls an entire website's HTML pages by recursively going through
@@ -159,18 +41,16 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @yield [Wgit::Document] Given each crawled Document/page of the site.
+    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the url could not be
     #   crawled successfully.
-    def crawl_site(url = @urls.first, &block)
-      assert_type(url, Wgit::Url)
-
+    def crawl_site(url, &block)
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-      host = url.to_base
+      opts = { follow_external_redirects: false, host: url.to_base }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
       crawled = [url, alt_url]
       externals = doc.external_links
@@ -187,9 +67,7 @@
 
       links.each do |link|
         orig_link = link.dup
-        doc = crawl_url(
-          link, follow_external_redirects: false, host: host, &block
-        )
+        doc = crawl_url(link, opts, &block)
 
         crawled.push(orig_link, link) # Push both in case of redirects.
         next if doc.nil?
@@ -202,6 +80,66 @@ module Wgit
       externals.uniq
     end
 
+    # Crawls one or more individual urls using Wgit::Crawler#crawl_url
+    # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
+    #
+    # @param urls [*Wgit::Url] The Url's to crawl.
+    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
+    #   way to interact with them.
+    # @raise [StandardError] If no urls are provided.
+    # @return [Wgit::Document] The last Document crawled.
+    def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
+      raise 'You must provide at least one Url' if urls.empty?
+
+      opts = {
+        follow_external_redirects: follow_external_redirects,
+        host: host
+      }
+      doc = nil
+
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+
+      doc
+    end
+
+    # Crawl the url returning the response Wgit::Document or nil if an error
+    # occurs.
+    #
+    # @param url [Wgit::Url] The Url to crawl.
+    # @param follow_external_redirects [Boolean] Whether or not to follow
+    #   an external redirect. External meaning to a different host. False will
+    #   return nil for such a crawl. If false, you must also provide a `host:`
+    #   parameter.
+    # @param host [Wgit::Url, String] Specify the host by which
+    #   an absolute redirect is determined to be internal or not. Must be
+    #   absolute and contain a protocol prefix. For example, a `host:` of
+    #   'http://www.example.com' will only allow redirects for Url's with a
+    #   `to_host` value of 'www.example.com'.
+    # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
+    #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
+    #   crawl was unsuccessful.
+    def crawl_url(url, follow_external_redirects: true, host: nil)
+      # A String url isn't allowed because it's passed by value not reference,
+      # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
+      assert_type(url, Wgit::Url)
+      if !follow_external_redirects && host.nil?
+        raise 'host cannot be nil if follow_external_redirects is false'
+      end
+
+      html = fetch(
+        url,
+        follow_external_redirects: follow_external_redirects,
+        host: host
+      )
+      url.crawled = true
+
+      doc = Wgit::Document.new(url, html)
+      yield(doc) if block_given?
+
+      doc.empty? ? nil : doc
+    end
+
     protected
 
     # This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
@@ -227,22 +165,19 @@ module Wgit
         host: host
       )
       @last_response = response
+
       response.body.empty? ? nil : response.body
     rescue StandardError => e
-      Wgit.logger.debug(
-        "Wgit::Crawler#fetch('#{url}') exception: #{e.message}"
-      )
+      Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
      @last_response = nil
+
      nil
    end
 
     # The resolve method performs a HTTP GET to obtain the HTML response. The
-    # Net::HTTPResponse will be returned or an error raised. Redirects can be
-    # disabled by setting `redirect_limit: 0`.
+    # Net::HTTPResponse will be returned or an error raised.
     #
     # @param url [Wgit::Url] The URL to fetch the HTML from.
-    # @param redirect_limit [Integer] The number of redirect hops to allow
-    #   before raising an error.
     # @param follow_external_redirects [Boolean] Whether or not to follow
     #   an external redirect. If false, you must also provide a `host:`
     #   parameter.
@@ -254,12 +189,7 @@ module Wgit
     # @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
     #   allowed.
     # @return [Net::HTTPResponse] The HTTP response of the GET request.
-    def resolve(
-      url,
-      redirect_limit: Wgit::Crawler.default_redirect_limit,
-      follow_external_redirects: true,
-      host: nil
-    )
+    def resolve(url, follow_external_redirects: true, host: nil)
       raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
 
       redirect_count = 0
@@ -267,25 +197,25 @@ module Wgit
 
       loop do
         response = Net::HTTP.get_response(url.to_uri)
+        break unless response.is_a?(Net::HTTPRedirection)
+
         location = Wgit::Url.new(response.fetch('location', ''))
+        raise 'Encountered redirect without Location header' if location.empty?
 
-        break unless response.is_a?(Net::HTTPRedirection)
         yield(url, response, location) if block_given?
 
-        unless location.empty?
-          if !follow_external_redirects &&
-             !location.is_relative?(host: host)
-            raise "External redirect not allowed - Redirected to: \
+        if !follow_external_redirects && !location.is_relative?(host: host)
+          raise "External redirect not allowed - Redirected to: \
'#{location}', which is outside of host: '#{host}'"
-          end
+        end
 
-          raise 'Too many redirects' if redirect_count >= redirect_limit
+        raise "Too many redirects: #{redirect_count}" \
+          if redirect_count >= @redirect_limit
 
-          redirect_count += 1
+        redirect_count += 1
 
-          location = url.to_base.concat(location) if location.is_relative?
-          url.replace(location)
-        end
+        location = url.to_base.concat(location) if location.is_relative?
+        url.replace(location) # Update the url on redirect.
       end
 
       response
@@ -300,7 +230,7 @@ module Wgit
     #   internal page links.
     # @return [Array<Wgit::Url>] The internal page links from doc.
     def get_internal_links(doc)
-      doc.internal_full_links
+      doc.internal_absolute_links
         .map(&:without_anchor) # Because anchors don't change page content.
         .uniq
         .reject do |link|
@@ -309,28 +239,9 @@ module Wgit
       end
     end
 
-    private
-
-    # Add the document to the @docs array for later processing or let the block
-    # process it here and now.
-    def handle_crawl_block(url, &block)
-      if block_given?
-        crawl_url(url, &block)
-      else
-        @docs << crawl_url(url)
-        nil
-      end
-    end
-
-    # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
-    def add_url(url)
-      @urls = [] if @urls.nil?
-      @urls << Wgit::Url.new(url)
-    end
-
-    alias crawl crawl_urls
+    alias crawl       crawl_urls
     alias crawl_pages crawl_urls
-    alias crawl_page crawl_url
-    alias crawl_r crawl_site
+    alias crawl_page  crawl_url
+    alias crawl_r     crawl_site
   end
 end
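Pulling the Crawler changes together, a minimal usage sketch (example.com is a placeholder): the crawler no longer stores @urls/@docs, the redirect limit moves from a class-level default onto each instance, and urls are passed directly to the crawl_* methods.

    require 'wgit'

    # redirect_limit: 0 disables redirects entirely.
    crawler = Wgit::Crawler.new(redirect_limit: 3)

    # crawl_url requires a Wgit::Url (not a String) so a redirect can
    # mutate the url in place.
    url = Wgit::Url.new('http://example.com')
    doc = crawler.crawl_url(url) { |d| puts d.title }

    # Crawl a whole site, returning the unique external links found:
    externals = crawler.crawl_site(Wgit::Url.new('http://example.com')) do |d|
      puts d.url
    end

    crawler.last_response # => Net::HTTPResponse of the most recent crawl, or nil.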