wgit 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d9d045d2dd7f570db1811bafab1ac244103cc359033efd9279323c795a67bb9f
4
- data.tar.gz: 996801763a6576ede812e2edd7d201ceb34b2135548a365b748f953e7df40db9
3
+ metadata.gz: 0db518346d1a939c13e689856a4ae9946ead17f188185529bce7fc1f97b84fba
4
+ data.tar.gz: aee59fe627767c45736e5ae518fe46c46ca9de13e1cd6bed23c0dc5b313aa1e6
5
5
  SHA512:
6
- metadata.gz: e0dfe907c599c320377464aec927b24700d0e9e17d7bb37b4903715af63cbf031dc5983cd6749b1d90353cbcffc0d71e76eb2a0f8c0ba77b3b03f2d51ca9634f
7
- data.tar.gz: bade693ab5b32bf8a16747233356307fe489798855133b8e16b3a907d38f8fd9ecfadadab0273d1c0767106bcf85027c64ebb6f86dc38660240200d5fef07377
6
+ metadata.gz: 5c29a7dc084742132241ec17118eb8dccf39ae57c57aa3364a01c948c4ed1a3304dd1510324b2f3b5d7c9d06ea04802524f6cd9eae00e2f68127e456b8cd9d17
7
+ data.tar.gz: 80cae11e47d806bcb14cc77e2628d533e9b466d7fc43d7e6e4fd79b3a65c51df5cbcd6886b62eab88ceb6176d8fdaaf91e45bddd5de3dabae00b245d2cb53968
@@ -4,8 +4,7 @@ require_relative 'url'
4
4
  require_relative 'document'
5
5
  require_relative 'utils'
6
6
  require_relative 'assertable'
7
- require 'net/http' # Requires 'uri'.
8
- require 'benchmark'
7
+ require 'typhoeus'
9
8
 
10
9
  module Wgit
11
10
  # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
@@ -17,15 +16,24 @@ module Wgit
17
16
  # disable redirects completely.
18
17
  attr_accessor :redirect_limit
19
18
 
20
- # The Net::HTTPResponse of the most recently crawled URL or nil.
19
+ # The maximum amount of time (in seconds) a crawl request has to complete
20
+ # before raising an error. Set to 0 to disable time outs completely.
21
+ attr_accessor :time_out
22
+
23
+ # The Typhoeus::Response of the most recently crawled URL or nil.
24
+ # See https://rubydoc.info/gems/typhoeus/Typhoeus/Response for more info.
21
25
  attr_reader :last_response
22
26
 
23
27
  # Initializes and returns a Wgit::Crawler instance.
24
28
  #
25
29
  # @param redirect_limit [Integer] The amount of allowed redirects before
26
30
  # raising an error. Set to 0 to disable redirects completely.
27
- def initialize(redirect_limit: 5)
31
+ # @param time_out [Integer, Float] The maximum amount of time (in seconds)
32
+ # a crawl request has to complete before raising an error. Set to 0 to
33
+ # disable time outs completely.
34
+ def initialize(redirect_limit: 5, time_out: 5)
28
35
  @redirect_limit = redirect_limit
36
+ @time_out = time_out
29
37
  end
30
38
 
31
39
  # Crawls an entire website's HTML pages by recursively going through
@@ -103,7 +111,7 @@ module Wgit
103
111
  doc
104
112
  end
105
113
 
106
- # Crawl the url returning the response Wgit::Document or nil if an error
114
+ # Crawl the url, returning the response Wgit::Document or nil if an error
107
115
  # occurs.
108
116
  #
109
117
  # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
@@ -141,13 +149,12 @@ module Wgit
141
149
 
142
150
  protected
143
151
 
144
- # This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
145
- # any errors that arise and setting the @last_response. Errors or any
146
- # HTTP response that doesn't return a HTML body will be ignored and nil
147
- # will be returned; otherwise, the HTML String is returned.
152
+ # Fetches the url HTML String or nil. Handles any errors that arise
153
+ # and sets the @last_response. Errors or any HTTP response that doesn't
154
+ # return a HTML body will be ignored, returning nil.
148
155
  #
149
- # @param url [Wgit::Url] The URL to fetch the HTML for. This Url object
150
- # will likely be modified as a result of the fetch/crawl.
156
+ # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
157
+ # reference and gets modified as a result of the fetch/crawl.
151
158
  # @param follow_external_redirects [Boolean] Whether or not to follow
152
159
  # an external redirect. False will return nil for such a crawl. If false,
153
160
  # you must also provide a `host:` parameter.
@@ -159,16 +166,15 @@ module Wgit
159
166
  # @return [String, nil] The crawled HTML or nil if the crawl was
160
167
  # unsuccessful.
161
168
  def fetch(url, follow_external_redirects: true, host: nil)
162
- crawl_duration = nil
163
169
  response = nil
170
+ crawl_duration = nil
164
171
 
165
- crawl_duration = Benchmark.measure do
166
- response = resolve(
167
- url,
168
- follow_external_redirects: follow_external_redirects,
169
- host: host
170
- )
171
- end.real
172
+ response = resolve(
173
+ url,
174
+ follow_external_redirects: follow_external_redirects,
175
+ host: host
176
+ )
177
+ crawl_duration = response.total_time
172
178
 
173
179
  response.body.empty? ? nil : response.body
174
180
  rescue StandardError => e
@@ -181,10 +187,10 @@ module Wgit
181
187
  @last_response = response
182
188
  end
183
189
 
184
- # The resolve method performs a HTTP GET to obtain the HTML response. The
185
- # Net::HTTPResponse will be returned or an error raised.
190
+ # Resolves the url by handling any redirects. The response object will be
191
+ # returned or an error raised.
186
192
  #
187
- # @param url [Wgit::Url] The URL to fetch the HTML from.
193
+ # @param url [Wgit::Url] The URL to resolve.
188
194
  # @param follow_external_redirects [Boolean] Whether or not to follow
189
195
  # an external redirect. If false, you must also provide a `host:`
190
196
  # parameter.
@@ -193,37 +199,66 @@ module Wgit
193
199
  # absolute and contain a protocol prefix. For example, a `host:` of
194
200
  # 'http://www.example.com' will only allow redirects for Urls with a
195
201
  # `to_host` value of 'www.example.com'.
196
- # @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
197
- # allowed.
198
- # @return [Net::HTTPResponse] The HTTP response of the GET request.
202
+ # @raise [StandardError] If a redirect isn't allowed, has no Location header or exceeds the redirect limit, or if no response is obtained.
203
+ # @return [Typhoeus::Response] The HTTP response of the GET request.
199
204
  def resolve(url, follow_external_redirects: true, host: nil)
200
- raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
201
-
205
+ response = nil
202
206
  redirect_count = 0
203
- response = nil
207
+ total_net_time = 0.0
204
208
 
205
209
  loop do
206
- response = Net::HTTP.get_response(url.to_uri)
207
- break unless response.is_a?(Net::HTTPRedirection)
210
+ response = get_response(url)
211
+ total_net_time += response.total_time if response.total_time
208
212
 
209
- location = Wgit::Url.new(response.fetch('location', ''))
213
+ # Break unless it's a redirect.
214
+ break unless (response.code >= 300) && (response.code < 400)
215
+
216
+ # Handle response 'Location' header.
217
+ location = Wgit::Utils.fetch(response.headers, :location, '')
218
+ location = Wgit::Url.new(location)
210
219
  raise 'Encountered redirect without Location header' if location.empty?
211
220
 
212
221
  yield(url, response, location) if block_given?
213
222
 
223
+ # Handle redirect logic.
214
224
  raise "External redirect not allowed - Redirected to: \
215
225
  '#{location}', which is outside of host: '#{host}'" \
216
- if !follow_external_redirects && !location.is_relative?(host: host)
226
+ if !follow_external_redirects && !location.relative?(host: host)
217
227
 
218
- raise "Too many redirects: #{redirect_count}" \
228
+ raise "Too many redirects, exceeded: #{redirect_count}" \
219
229
  if redirect_count >= @redirect_limit
220
230
 
221
231
  redirect_count += 1
222
232
 
223
- location = url.to_base.concat(location) if location.is_relative?
233
+ # Process the location to be crawled next.
234
+ location = url.to_base.concat(location) if location.relative?
224
235
  url.replace(location) # Update the url on redirect.
225
236
  end
226
237
 
238
+ response.options[:redirect_count] = redirect_count
239
+ response.options[:total_time] = total_net_time
240
+
241
+ response
242
+ end
243
+
244
+ # Performs a HTTP GET request and returns the response.
245
+ #
246
+ # @param url [String] The url to GET. Will call url#normalize if possible.
247
+ # @raise [StandardError] If a response can't be obtained.
248
+ # @return [Typhoeus::Response] The HTTP response of the GET request.
249
+ def get_response(url)
250
+ url = url.normalize if url.respond_to?(:normalize)
251
+
252
+ opts = {
253
+ followlocation: false, timeout: @time_out, accept_encoding: 'gzip'
254
+ }
255
+
256
+ response = Typhoeus.get(url, opts)
257
+
258
+ # Handle response status code.
259
+ raise "No response (within timeout: #{@time_out} second(s))" \
260
+ if response.code == 0
261
+
227
262
  response
228
263
  end
229
264
 
@@ -23,6 +23,8 @@ module Wgit
23
23
  # Initializes a connected database client using the provided
24
24
  # connection_string or ENV['WGIT_CONNECTION_STRING'].
25
25
  #
26
+ # @param connection_string [String] The connection string needed to connect
27
+ # to the database.
26
28
  # @raise [StandardError] If a connection string isn't provided, either as a
27
29
  # parameter or via the environment.
28
30
  def initialize(connection_string = nil)
@@ -36,6 +38,10 @@ module Wgit
36
38
 
37
39
  # A class alias for Database.new.
38
40
  #
41
+ # @param connection_string [String] The connection string needed to connect
42
+ # to the database.
43
+ # @raise [StandardError] If a connection string isn't provided, either as a
44
+ # parameter or via the environment.
39
45
  # @return [Wgit::Database] The connected database client.
40
46
  def self.connect(connection_string = nil)
41
47
  new(connection_string)
@@ -43,6 +49,8 @@ module Wgit
43
49
 
44
50
  # Initializes a connected database client using the connection string.
45
51
  #
52
+ # @param connection_string [String] The connection string needed to connect
53
+ # to the database.
46
54
  # @raise [StandardError] If a connection cannot be established.
47
55
  # @return [Mongo::Client] The connected MongoDB client.
48
56
  def self.establish_connection(connection_string)
@@ -224,11 +224,11 @@ module Wgit
224
224
  # @return [Wgit::Url] The base URL of this Document e.g.
225
225
  # 'http://example.com/public'.
226
226
  def base_url(link: nil)
227
- get_base = -> { @base.is_relative? ? @url.to_base.concat(@base) : @base }
227
+ get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
228
228
 
229
229
  if link
230
230
  link = Wgit::Url.new(link)
231
- raise "link must be relative: #{link}" unless link.is_relative?
231
+ raise "link must be relative: #{link}" unless link.relative?
232
232
 
233
233
  if link.is_anchor? || link.is_query?
234
234
  base_url = @base ? get_base.call : @url
@@ -339,7 +339,7 @@ module Wgit
339
339
  return [] if @links.empty?
340
340
 
341
341
  links = @links
342
- .select { |link| link.is_relative?(host: @url.to_base) }
342
+ .select { |link| link.relative?(host: @url.to_base) }
343
343
  .map(&:without_base)
344
344
  .map do |link| # Map @url.to_host into / as it's a duplicate.
345
345
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
@@ -365,7 +365,7 @@ module Wgit
365
365
  return [] if @links.empty?
366
366
 
367
367
  links = @links
368
- .reject { |link| link.is_relative?(host: @url.to_base) }
368
+ .reject { |link| link.relative?(host: @url.to_base) }
369
369
  .map(&:without_trailing_slash)
370
370
 
371
371
  Wgit::Utils.process_arr(links)
@@ -597,6 +597,7 @@ module Wgit
597
597
  end
598
598
  end
599
599
 
600
+ alias statistics stats
600
601
  alias internal_urls internal_links
601
602
  alias internal_absolute_urls internal_absolute_links
602
603
  alias external_urls external_links
@@ -189,7 +189,7 @@ protocol: #{url}" unless url.to_base
189
189
  # @return [Wgit::Url] self + separator + path, separator depends on path.
190
190
  def concat(path)
191
191
  path = Wgit::Url.new(path)
192
- raise 'path must be relative' unless path.is_relative?
192
+ raise 'path must be relative' unless path.relative?
193
193
 
194
194
  path = path.without_leading_slash
195
195
  separator = path.start_with?('#') || path.start_with?('?') ? '' : '/'
@@ -47,6 +47,32 @@ module Wgit
47
47
  obj_or_objs
48
48
  end
49
49
 
50
+ # An improved Hash :fetch method which checks for multiple formats of the
51
+ # given key and returns the value, or the default value (nil unless
52
+ # provided).
53
+ #
54
+ # For example, if key == :foo, hash is searched for:
55
+ # :foo, 'foo', 'Foo', 'FOO' in that order. The first truthy value found is
56
+ # returned. If no value is found, the default value is returned.
57
+ #
58
+ # @param hash [Hash] The Hash to search within.
59
+ # @param key [Symbol, String] The key with which to search hash.
60
+ # @param default [Object] The default value to be returned if hash[key]
61
+ # doesn't exist.
62
+ # @return [Object] The value found at hash[key] or the default value.
63
+ def self.fetch(hash, key, default = nil)
64
+ key = key.to_s.downcase
65
+
66
+ # Try (in order): :foo, 'foo', 'Foo', 'FOO'.
67
+ [key.to_sym, key, key.capitalize, key.upcase].each do |k|
68
+ value = hash[k]
69
+
70
+ return value if value
71
+ end
72
+
73
+ default
74
+ end
75
+
50
76
  # Formats the sentence (modifies the receiver) and returns its value.
51
77
  # The formatting is essentially to shorten the sentence and ensure that
52
78
  # the index is present somewhere in the sentence. Used for search query
@@ -5,5 +5,5 @@
5
5
  # @author Michael Telford
6
6
  module Wgit
7
7
  # The current gem version of Wgit.
8
- VERSION = '0.3.0'
8
+ VERSION = '0.4.0'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-08 00:00:00.000000000 Z
11
+ date: 2019-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -53,61 +53,47 @@ dependencies:
53
53
  - !ruby/object:Gem::Version
54
54
  version: 1.10.3
55
55
  - !ruby/object:Gem::Dependency
56
- name: byebug
56
+ name: typhoeus
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '10.0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '10.0'
69
- - !ruby/object:Gem::Dependency
70
- name: dotenv
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - "~>"
74
- - !ruby/object:Gem::Version
75
- version: '2.5'
76
- type: :development
61
+ version: 1.3.1
62
+ type: :runtime
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
66
  - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '2.5'
68
+ version: 1.3.1
83
69
  - !ruby/object:Gem::Dependency
84
- name: httplog
70
+ name: byebug
85
71
  requirement: !ruby/object:Gem::Requirement
86
72
  requirements:
87
73
  - - "~>"
88
74
  - !ruby/object:Gem::Version
89
- version: '1.3'
75
+ version: '10.0'
90
76
  type: :development
91
77
  prerelease: false
92
78
  version_requirements: !ruby/object:Gem::Requirement
93
79
  requirements:
94
80
  - - "~>"
95
81
  - !ruby/object:Gem::Version
96
- version: '1.3'
82
+ version: '10.0'
97
83
  - !ruby/object:Gem::Dependency
98
- name: inch
84
+ name: dotenv
99
85
  requirement: !ruby/object:Gem::Requirement
100
86
  requirements:
101
87
  - - "~>"
102
88
  - !ruby/object:Gem::Version
103
- version: '0.8'
89
+ version: '2.5'
104
90
  type: :development
105
91
  prerelease: false
106
92
  version_requirements: !ruby/object:Gem::Requirement
107
93
  requirements:
108
94
  - - "~>"
109
95
  - !ruby/object:Gem::Version
110
- version: '0.8'
96
+ version: '2.5'
111
97
  - !ruby/object:Gem::Dependency
112
98
  name: maxitest
113
99
  requirement: !ruby/object:Gem::Requirement