wgit 0.0.18 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 26e6a29fbf72b0ecbbc487c8aba9ec243a260b4761805c6c7923f2af82fa94f5
4
- data.tar.gz: 9e15ad14991418fc3b4b2c0dafacac617b32197e825ad72887d91182c8ddf652
3
+ metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
4
+ data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
5
5
  SHA512:
6
- metadata.gz: 4b17b8467abf13b186e88fb63fe8630163612bc685d7d521122fdc4c693e7d9229c59888afa1191189b3838317fa29e028c90757b880177c1e7a8f81a0a38047
7
- data.tar.gz: 6fb7bb518ca3b9e520e1edbf25b4c265018686b5c61e134d623c38efa1bdf5073affb5205e47aee3a32a4502b56205080ded00a79bd6f138cf9178b019a2b32d
6
+ metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
7
+ data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990
@@ -8,7 +8,6 @@ require_relative 'wgit/url'
8
8
  require_relative 'wgit/document'
9
9
  require_relative 'wgit/document_extensions'
10
10
  require_relative 'wgit/crawler'
11
- require_relative 'wgit/database/connection_details'
12
11
  require_relative 'wgit/database/model'
13
12
  require_relative 'wgit/database/database'
14
13
  require_relative 'wgit/indexer'
@@ -1,8 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wgit
4
- # Module containing assert methods including type checking which can be used
5
- # for asserting the integrity of method definitions etc.
4
+ # Module containing assertion methods including type checking and duck typing.
6
5
  module Assertable
7
6
  # Default type fail message.
8
7
  DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
@@ -11,21 +10,23 @@ module Wgit
11
10
  # Default duck fail message.
12
11
  DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
13
12
  # Default required keys message.
14
- DEFAULT_REQUIRED_KEYS_MSG = 'Some or all of the required keys are not present: %s'
13
+ DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \
14
+ present: %s"
15
15
 
16
- # Tests if the obj is of a given type.
16
+ # Tests if the obj is_a? given type; raises an Exception if not.
17
17
  #
18
18
  # @param obj [Object] The Object to test.
19
19
  # @param type_or_types [Type, Array<Type>] The type/types that obj must
20
20
  # belong to or an exception is thrown.
21
- # @param msg [String] The raised RuntimeError message, if provided.
21
+ # @param msg [String] The raised StandardError message, if provided.
22
+ # @raise [StandardError] If the assertion fails.
22
23
  # @return [Object] The given obj on successful assertion.
23
24
  def assert_types(obj, type_or_types, msg = nil)
24
25
  msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
25
26
  match = if type_or_types.respond_to?(:any?)
26
- type_or_types.any? { |type| obj.instance_of?(type) }
27
+ type_or_types.any? { |type| obj.is_a?(type) }
27
28
  else
28
- obj.instance_of?(type_or_types)
29
+ obj.is_a?(type_or_types)
29
30
  end
30
31
  raise msg unless match
31
32
 
@@ -33,36 +34,36 @@ module Wgit
33
34
  end
34
35
 
35
36
  # Each object within arr must match one of the types listed in
36
- # type_or_types or an exception is raised using msg, if provided.
37
+ # type_or_types; or an exception is raised using msg, if provided.
37
38
  #
38
39
  # @param arr [Enumerable#each] Enumerable of objects to type check.
39
40
  # @param type_or_types [Type, Array<Type>] The allowed type(s).
40
- # @param msg [String] The raised RuntimeError message, if provided.
41
+ # @param msg [String] The raised StandardError message, if provided.
42
+ # @raise [StandardError] If the assertion fails.
41
43
  # @return [Object] The given arr on successful assertion.
42
44
  def assert_arr_types(arr, type_or_types, msg = nil)
43
45
  raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
44
46
 
45
- arr.each do |obj|
46
- assert_types(obj, type_or_types, msg)
47
- end
47
+ arr.each { |obj| assert_types(obj, type_or_types, msg) }
48
48
  end
49
49
 
50
50
  # The obj_or_objs must respond_to? all of the given methods or an
51
51
  # Exception is raised using msg, if provided.
52
52
  #
53
- # @param obj_or_objs [Object, Enumerable#each] The objects to duck check.
53
+ # @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check.
54
54
  # @param methods [Array<Symbol>] The methods to :respond_to?.
55
- # @param msg [String] The raised RuntimeError message, if provided.
55
+ # @param msg [String] The raised StandardError message, if provided.
56
+ # @raise [StandardError] If the assertion fails.
56
57
  # @return [Object] The given obj_or_objs on successful assertion.
57
58
  def assert_respond_to(obj_or_objs, methods, msg = nil)
58
59
  methods = [methods] unless methods.respond_to?(:all?)
60
+
59
61
  if obj_or_objs.respond_to?(:each)
60
- obj_or_objs.each do |obj|
61
- _assert_respond_to(obj, methods, msg)
62
- end
62
+ obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
63
63
  else
64
64
  _assert_respond_to(obj_or_objs, methods, msg)
65
65
  end
66
+
66
67
  obj_or_objs
67
68
  end
68
69
 
@@ -71,6 +72,7 @@ module Wgit
71
72
  # @param hash [Hash] The hash which should include the required keys.
72
73
  # @param keys [Array<String, Symbol>] The keys whose presence to assert.
73
74
  # @param msg [String] The raised KeyError message, if provided.
75
+ # @raise [KeyError] If the assertion fails.
74
76
  # @return [Hash] The given hash on successful assertion.
75
77
  def assert_required_keys(hash, keys, msg = nil)
76
78
  msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(', '))
@@ -93,12 +95,7 @@ module Wgit
93
95
  obj
94
96
  end
95
97
 
96
- alias assert_type assert_types
97
- alias type assert_types
98
- alias types assert_types
98
+ alias assert_type assert_types
99
99
  alias assert_arr_type assert_arr_types
100
- alias arr_type assert_arr_types
101
- alias arr_types assert_arr_types
102
- alias respond_to assert_respond_to
103
100
  end
104
101
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  # Script which extends Ruby's core functionality when parsed.
4
- # Needs to be required separately using `require 'wgit/core_ext'`.
4
+ # Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`.
5
5
 
6
6
  require_relative 'url'
7
7
 
@@ -22,19 +22,15 @@ module Enumerable
22
22
  #
23
23
  # @return [Array<Wgit::Url>] The converted URL's.
24
24
  def to_urls
25
- map do |element|
26
- process_url_element(element)
27
- end
25
+ map { |element| process_url_element(element) }
28
26
  end
29
27
 
30
- # Converts each String instance into a Wgit::Url object and returns the
31
- # updated array. Modifies the receiver.
28
+ # Converts each String instance into a Wgit::Url object and returns self
29
+ # having modified the receiver.
32
30
  #
33
31
  # @return [Array<Wgit::Url>] Self containing the converted URL's.
34
32
  def to_urls!
35
- map! do |element|
36
- process_url_element(element)
37
- end
33
+ map! { |element| process_url_element(element) }
38
34
  end
39
35
  end
40
36
 
@@ -42,9 +38,5 @@ private
42
38
 
43
39
  # Converts the element to a Wgit::Url if the element is a String.
44
40
  def process_url_element(element)
45
- if element.is_a? String
46
- element.to_url
47
- else
48
- element
49
- end
41
+ element.is_a?(String) ? element.to_url : element
50
42
  end
@@ -7,142 +7,24 @@ require_relative 'assertable'
7
7
  require 'net/http' # Requires 'uri'.
8
8
 
9
9
  module Wgit
10
- # The Crawler class provides a means of crawling web based Wgit::Url's, turning
11
- # their HTML into Wgit::Document instances.
10
+ # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
11
+ # serialising their HTML into Wgit::Document instances.
12
12
  class Crawler
13
13
  include Assertable
14
14
 
15
- # The default maximum amount of allowed URL redirects.
16
- @default_redirect_limit = 5
17
-
18
- class << self
19
- # Class level instance accessor methods for @default_redirect_limit.
20
- # Call using Wgit::Crawler.default_redirect_limit etc.
21
- attr_accessor :default_redirect_limit
22
- end
23
-
24
- # The urls to crawl.
25
- attr_reader :urls
26
-
27
- # The docs of the crawled @urls.
28
- attr_reader :docs
15
+ # The amount of allowed redirects before raising an error. Set to 0 to
16
+ # disable redirects completely.
17
+ attr_accessor :redirect_limit
29
18
 
30
19
  # The Net::HTTPResponse of the most recently crawled URL or nil.
31
20
  attr_reader :last_response
32
21
 
33
- # Initializes the Crawler and sets the @urls and @docs.
34
- #
35
- # @param urls [*Wgit::Url] The URL's to crawl in the future using either
36
- # Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
37
- # will NOT update if they happen to redirect when crawled. If in doubt,
38
- # pass the url(s) directly to the crawl_* method instead of to the new
39
- # method.
40
- def initialize(*urls)
41
- self.[](*urls)
42
- @docs = []
43
- end
44
-
45
- # Sets this Crawler's @urls.
46
- #
47
- # @param urls [*Wgit::Url] The URL's to crawl in the future using either
48
- # crawl_url or crawl_site. Note that the urls passed here will NOT update
49
- # if they happen to redirect when crawled. If in doubt, pass the url(s)
50
- # directly to the crawl_* method instead of to the new method.
51
- def urls=(urls)
52
- @urls = []
53
- Wgit::Utils.each(urls) { |url| add_url(url) }
54
- end
55
-
56
- # Sets this Crawler's @urls.
57
- #
58
- # @param urls [*Wgit::Url] The URL's to crawl in the future using either
59
- # crawl_url or crawl_site. Note that the urls passed here will NOT update
60
- # if they happen to redirect when crawled. If in doubt, pass the url(s)
61
- # directly to the crawl_* method instead of to the new method.
62
- def [](*urls)
63
- # If urls is nil then add_url (when called later) will set @urls = []
64
- # so we do nothing here.
65
- unless urls.nil?
66
- # Due to *urls you can end up with [[url1,url2,url3]] etc. where the
67
- # outer array is bogus so we use the inner one only.
68
- if urls.is_a?(Enumerable) &&
69
- urls.length == 1 &&
70
- urls.first.is_a?(Enumerable)
71
- urls = urls.first
72
- end
73
-
74
- # Here we call urls= method using self because the param name is also
75
- # urls which conflicts.
76
- self.urls = urls
77
- end
78
- end
79
-
80
- # Adds the url to this Crawler's @urls.
22
+ # Initializes and returns a Wgit::Crawler instance.
81
23
  #
82
- # @param url [Wgit::Url] A URL to crawl later by calling a crawl_* method.
83
- # Note that the url added here will NOT update if it happens to
84
- # redirect when crawled. If in doubt, pass the url directly to the
85
- # crawl_* method instead of to the new method.
86
- def <<(url)
87
- add_url(url)
88
- end
89
-
90
- # Crawls one or more individual urls using Wgit::Crawler#crawl_url
91
- # underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
92
- # that any external redirects are followed. Use Wgit::Crawler#crawl_url if
93
- # this isn't desirable.
94
- #
95
- # @param urls [Array<Wgit::Url>] The URLs to crawl.
96
- # @yield [Wgit::Document] If provided, the block is given each crawled
97
- # Document. Otherwise each doc is added to @docs which can be accessed
98
- # by Crawler#docs after this method returns.
99
- # @return [Wgit::Document] The last Document crawled.
100
- def crawl_urls(urls = @urls, &block)
101
- raise 'No urls to crawl' unless urls
102
-
103
- @docs = []
104
- doc = nil
105
- Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
106
- doc || @docs.last
107
- end
108
-
109
- # Crawl the url returning the response Wgit::Document or nil if an error
110
- # occurs.
111
- #
112
- # @param url [Wgit::Url] The URL to crawl.
113
- # @param follow_external_redirects [Boolean] Whether or not to follow
114
- # an external redirect. False will return nil for such a crawl. If false,
115
- # you must also provide a `host:` parameter.
116
- # @param host [Wgit::Url, String] Specify the host by which
117
- # an absolute redirect is determined to be internal or not. Must be
118
- # absolute and contain a protocol prefix. For example, a `host:` of
119
- # 'http://www.example.com' will only allow redirects for Urls with a
120
- # `to_host` value of 'www.example.com'.
121
- # @yield [Wgit::Document] The crawled HTML Document regardless if the
122
- # crawl was successful or not. Therefore, the Document#url can be used.
123
- # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
124
- # crawl was unsuccessful.
125
- def crawl_url(
126
- url = @urls.first,
127
- follow_external_redirects: true,
128
- host: nil
129
- )
130
- assert_type(url, Wgit::Url)
131
- if !follow_external_redirects && host.nil?
132
- raise 'host cannot be nil if follow_external_redirects is false'
133
- end
134
-
135
- html = fetch(
136
- url,
137
- follow_external_redirects: follow_external_redirects,
138
- host: host
139
- )
140
- url.crawled = true
141
-
142
- doc = Wgit::Document.new(url, html)
143
- yield(doc) if block_given?
144
-
145
- doc.empty? ? nil : doc
24
+ # @param redirect_limit [Integer] The amount of allowed redirects before
25
+ # raising an error. Set to 0 to disable redirects completely.
26
+ def initialize(redirect_limit: 5)
27
+ @redirect_limit = redirect_limit
146
28
  end
147
29
 
148
30
  # Crawls an entire website's HTML pages by recursively going through
@@ -159,18 +41,16 @@ module Wgit
159
41
  # @param url [Wgit::Url] The base URL of the website to be crawled.
160
42
  # It is recommended that this URL be the index page of the site to give a
161
43
  # greater chance of finding all pages within that site/host.
162
- # @yield [Wgit::Document] Given each crawled Document/page of the site.
44
+ # @yield [doc] Given each crawled page (Wgit::Document) of the site.
163
45
  # A block is the only way to interact with each crawled Document.
164
46
  # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
165
47
  # from all of the site's pages or nil if the url could not be
166
48
  # crawled successfully.
167
- def crawl_site(url = @urls.first, &block)
168
- assert_type(url, Wgit::Url)
169
-
49
+ def crawl_site(url, &block)
170
50
  doc = crawl_url(url, &block)
171
51
  return nil if doc.nil?
172
52
 
173
- host = url.to_base
53
+ opts = { follow_external_redirects: false, host: url.to_base }
174
54
  alt_url = url.end_with?('/') ? url.chop : url + '/'
175
55
  crawled = [url, alt_url]
176
56
  externals = doc.external_links
@@ -187,9 +67,7 @@ module Wgit
187
67
 
188
68
  links.each do |link|
189
69
  orig_link = link.dup
190
- doc = crawl_url(
191
- link, follow_external_redirects: false, host: host, &block
192
- )
70
+ doc = crawl_url(link, opts, &block)
193
71
 
194
72
  crawled.push(orig_link, link) # Push both in case of redirects.
195
73
  next if doc.nil?
@@ -202,6 +80,66 @@ module Wgit
202
80
  externals.uniq
203
81
  end
204
82
 
83
+ # Crawls one or more individual urls using Wgit::Crawler#crawl_url
84
+ # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
85
+ #
86
+ # @param urls [*Wgit::Url] The Url's to crawl.
87
+ # @yield [doc] Given each crawled page (Wgit::Document); this is the only
88
+ # way to interact with them.
89
+ # @raise [StandardError] If no urls are provided.
90
+ # @return [Wgit::Document] The last Document crawled.
91
+ def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
92
+ raise 'You must provide at least one Url' if urls.empty?
93
+
94
+ opts = {
95
+ follow_external_redirects: follow_external_redirects,
96
+ host: host
97
+ }
98
+ doc = nil
99
+
100
+ Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
101
+
102
+ doc
103
+ end
104
+
105
+ # Crawl the url returning the response Wgit::Document or nil if an error
106
+ # occurs.
107
+ #
108
+ # @param url [Wgit::Url] The Url to crawl.
109
+ # @param follow_external_redirects [Boolean] Whether or not to follow
110
+ # an external redirect. External meaning to a different host. False will
111
+ # return nil for such a crawl. If false, you must also provide a `host:`
112
+ # parameter.
113
+ # @param host [Wgit::Url, String] Specify the host by which
114
+ # an absolute redirect is determined to be internal or not. Must be
115
+ # absolute and contain a protocol prefix. For example, a `host:` of
116
+ # 'http://www.example.com' will only allow redirects for Url's with a
117
+ # `to_host` value of 'www.example.com'.
118
+ # @yield [doc] The crawled HTML page (Wgit::Document) regardless of whether the
119
+ # crawl was successful or not. Therefore, Document#url etc. can be used.
120
+ # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
121
+ # crawl was unsuccessful.
122
+ def crawl_url(url, follow_external_redirects: true, host: nil)
123
+ # A String url isn't allowed because it's passed by value not reference,
124
+ # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
125
+ assert_type(url, Wgit::Url)
126
+ if !follow_external_redirects && host.nil?
127
+ raise 'host cannot be nil if follow_external_redirects is false'
128
+ end
129
+
130
+ html = fetch(
131
+ url,
132
+ follow_external_redirects: follow_external_redirects,
133
+ host: host
134
+ )
135
+ url.crawled = true
136
+
137
+ doc = Wgit::Document.new(url, html)
138
+ yield(doc) if block_given?
139
+
140
+ doc.empty? ? nil : doc
141
+ end
142
+
205
143
  protected
206
144
 
207
145
  # This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
@@ -227,22 +165,19 @@ module Wgit
227
165
  host: host
228
166
  )
229
167
  @last_response = response
168
+
230
169
  response.body.empty? ? nil : response.body
231
170
  rescue StandardError => e
232
- Wgit.logger.debug(
233
- "Wgit::Crawler#fetch('#{url}') exception: #{e.message}"
234
- )
171
+ Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
235
172
  @last_response = nil
173
+
236
174
  nil
237
175
  end
238
176
 
239
177
  # The resolve method performs an HTTP GET to obtain the HTML response. The
240
- # Net::HTTPResponse will be returned or an error raised. Redirects can be
241
- # disabled by setting `redirect_limit: 0`.
178
+ # Net::HTTPResponse will be returned or an error raised.
242
179
  #
243
180
  # @param url [Wgit::Url] The URL to fetch the HTML from.
244
- # @param redirect_limit [Integer] The number of redirect hops to allow
245
- # before raising an error.
246
181
  # @param follow_external_redirects [Boolean] Whether or not to follow
247
182
  # an external redirect. If false, you must also provide a `host:`
248
183
  # parameter.
@@ -254,12 +189,7 @@ module Wgit
254
189
  # @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
255
190
  # allowed.
256
191
  # @return [Net::HTTPResponse] The HTTP response of the GET request.
257
- def resolve(
258
- url,
259
- redirect_limit: Wgit::Crawler.default_redirect_limit,
260
- follow_external_redirects: true,
261
- host: nil
262
- )
192
+ def resolve(url, follow_external_redirects: true, host: nil)
263
193
  raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
264
194
 
265
195
  redirect_count = 0
@@ -267,25 +197,25 @@ module Wgit
267
197
 
268
198
  loop do
269
199
  response = Net::HTTP.get_response(url.to_uri)
200
+ break unless response.is_a?(Net::HTTPRedirection)
201
+
270
202
  location = Wgit::Url.new(response.fetch('location', ''))
203
+ raise 'Encountered redirect without Location header' if location.empty?
271
204
 
272
- break unless response.is_a?(Net::HTTPRedirection)
273
205
  yield(url, response, location) if block_given?
274
206
 
275
- unless location.empty?
276
- if !follow_external_redirects &&
277
- !location.is_relative?(host: host)
278
- raise "External redirect not allowed - Redirected to: \
207
+ if !follow_external_redirects && !location.is_relative?(host: host)
208
+ raise "External redirect not allowed - Redirected to: \
279
209
  '#{location}', which is outside of host: '#{host}'"
280
- end
210
+ end
281
211
 
282
- raise 'Too many redirects' if redirect_count >= redirect_limit
212
+ raise "Too many redirects: #{redirect_count}" \
213
+ if redirect_count >= @redirect_limit
283
214
 
284
- redirect_count += 1
215
+ redirect_count += 1
285
216
 
286
- location = url.to_base.concat(location) if location.is_relative?
287
- url.replace(location)
288
- end
217
+ location = url.to_base.concat(location) if location.is_relative?
218
+ url.replace(location) # Update the url on redirect.
289
219
  end
290
220
 
291
221
  response
@@ -300,7 +230,7 @@ module Wgit
300
230
  # internal page links.
301
231
  # @return [Array<Wgit::Url>] The internal page links from doc.
302
232
  def get_internal_links(doc)
303
- doc.internal_full_links
233
+ doc.internal_absolute_links
304
234
  .map(&:without_anchor) # Because anchors don't change page content.
305
235
  .uniq
306
236
  .reject do |link|
@@ -309,28 +239,9 @@ module Wgit
309
239
  end
310
240
  end
311
241
 
312
- private
313
-
314
- # Add the document to the @docs array for later processing or let the block
315
- # process it here and now.
316
- def handle_crawl_block(url, &block)
317
- if block_given?
318
- crawl_url(url, &block)
319
- else
320
- @docs << crawl_url(url)
321
- nil
322
- end
323
- end
324
-
325
- # Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
326
- def add_url(url)
327
- @urls = [] if @urls.nil?
328
- @urls << Wgit::Url.new(url)
329
- end
330
-
331
- alias crawl crawl_urls
242
+ alias crawl crawl_urls
332
243
  alias crawl_pages crawl_urls
333
- alias crawl_page crawl_url
334
- alias crawl_r crawl_site
244
+ alias crawl_page crawl_url
245
+ alias crawl_r crawl_site
335
246
  end
336
247
  end