wgit 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +69 -34
- data/lib/wgit/database/database.rb +8 -0
- data/lib/wgit/document.rb +5 -4
- data/lib/wgit/url.rb +1 -1
- data/lib/wgit/utils.rb +26 -0
- data/lib/wgit/version.rb +1 -1
- metadata +12 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0db518346d1a939c13e689856a4ae9946ead17f188185529bce7fc1f97b84fba
|
4
|
+
data.tar.gz: aee59fe627767c45736e5ae518fe46c46ca9de13e1cd6bed23c0dc5b313aa1e6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5c29a7dc084742132241ec17118eb8dccf39ae57c57aa3364a01c948c4ed1a3304dd1510324b2f3b5d7c9d06ea04802524f6cd9eae00e2f68127e456b8cd9d17
|
7
|
+
data.tar.gz: 80cae11e47d806bcb14cc77e2628d533e9b466d7fc43d7e6e4fd79b3a65c51df5cbcd6886b62eab88ceb6176d8fdaaf91e45bddd5de3dabae00b245d2cb53968
|
data/lib/wgit/crawler.rb
CHANGED
@@ -4,8 +4,7 @@ require_relative 'url'
|
|
4
4
|
require_relative 'document'
|
5
5
|
require_relative 'utils'
|
6
6
|
require_relative 'assertable'
|
7
|
-
require '
|
8
|
-
require 'benchmark'
|
7
|
+
require 'typhoeus'
|
9
8
|
|
10
9
|
module Wgit
|
11
10
|
# The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
|
@@ -17,15 +16,24 @@ module Wgit
|
|
17
16
|
# disable redirects completely.
|
18
17
|
attr_accessor :redirect_limit
|
19
18
|
|
20
|
-
# The
|
19
|
+
# The maximum amount of time (in seconds) a crawl request has to complete
|
20
|
+
# before raising an error. Set to 0 to disable time outs completely.
|
21
|
+
attr_accessor :time_out
|
22
|
+
|
23
|
+
# The Typhoeus::Response of the most recently crawled URL or nil.
|
24
|
+
# See https://rubydoc.info/gems/typhoeus/Typhoeus/Response for more info.
|
21
25
|
attr_reader :last_response
|
22
26
|
|
23
27
|
# Initializes and returns a Wgit::Crawler instance.
|
24
28
|
#
|
25
29
|
# @param redirect_limit [Integer] The amount of allowed redirects before
|
26
30
|
# raising an error. Set to 0 to disable redirects completely.
|
27
|
-
|
31
|
+
# @param time_out [Integer, Float] The maximum amount of time (in seconds)
|
32
|
+
# a crawl request has to complete before raising an error. Set to 0 to
|
33
|
+
# disable time outs completely.
|
34
|
+
def initialize(redirect_limit: 5, time_out: 5)
|
28
35
|
@redirect_limit = redirect_limit
|
36
|
+
@time_out = time_out
|
29
37
|
end
|
30
38
|
|
31
39
|
# Crawls an entire website's HTML pages by recursively going through
|
@@ -103,7 +111,7 @@ module Wgit
|
|
103
111
|
doc
|
104
112
|
end
|
105
113
|
|
106
|
-
# Crawl the url returning the response Wgit::Document or nil if an error
|
114
|
+
# Crawl the url returning the response Wgit::Document or nil, if an error
|
107
115
|
# occurs.
|
108
116
|
#
|
109
117
|
# @param url [Wgit::Url] The Url to crawl; which will likely be modified.
|
@@ -141,13 +149,12 @@ module Wgit
|
|
141
149
|
|
142
150
|
protected
|
143
151
|
|
144
|
-
#
|
145
|
-
#
|
146
|
-
#
|
147
|
-
# will be returned; otherwise, the HTML String is returned.
|
152
|
+
# Fetches the url HTML String or nil. Handles any errors that arise
|
153
|
+
# and sets the @last_response. Errors or any HTTP response that doesn't
|
154
|
+
# return a HTML body will be ignored, returning nil.
|
148
155
|
#
|
149
|
-
# @param url [Wgit::Url] The URL to fetch
|
150
|
-
#
|
156
|
+
# @param url [Wgit::Url] The URL to fetch. This Url object is passed by
|
157
|
+
# reference and gets modified as a result of the fetch/crawl.
|
151
158
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
152
159
|
# an external redirect. False will return nil for such a crawl. If false,
|
153
160
|
# you must also provide a `host:` parameter.
|
@@ -159,16 +166,15 @@ module Wgit
|
|
159
166
|
# @return [String, nil] The crawled HTML or nil if the crawl was
|
160
167
|
# unsuccessful.
|
161
168
|
def fetch(url, follow_external_redirects: true, host: nil)
|
162
|
-
crawl_duration = nil
|
163
169
|
response = nil
|
170
|
+
crawl_duration = nil
|
164
171
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
end.real
|
172
|
+
response = resolve(
|
173
|
+
url,
|
174
|
+
follow_external_redirects: follow_external_redirects,
|
175
|
+
host: host
|
176
|
+
)
|
177
|
+
crawl_duration = response.total_time
|
172
178
|
|
173
179
|
response.body.empty? ? nil : response.body
|
174
180
|
rescue StandardError => e
|
@@ -181,10 +187,10 @@ module Wgit
|
|
181
187
|
@last_response = response
|
182
188
|
end
|
183
189
|
|
184
|
-
#
|
185
|
-
#
|
190
|
+
# Resolves the url by handling any redirects. The response object will be
|
191
|
+
# returned or an error raised.
|
186
192
|
#
|
187
|
-
# @param url [Wgit::Url] The URL to
|
193
|
+
# @param url [Wgit::Url] The URL to resolve.
|
188
194
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
189
195
|
# an external redirect. If false, you must also provide a `host:`
|
190
196
|
# parameter.
|
@@ -193,37 +199,66 @@ module Wgit
|
|
193
199
|
# absolute and contain a protocol prefix. For example, a `host:` of
|
194
200
|
# 'http://www.example.com' will only allow redirects for Urls with a
|
195
201
|
# `to_host` value of 'www.example.com'.
|
196
|
-
# @raise [StandardError] If
|
197
|
-
#
|
198
|
-
# @return [Net::HTTPResponse] The HTTP response of the GET request.
|
202
|
+
# @raise [StandardError] If a redirect isn't allowed etc.
|
203
|
+
# @return [Typhoeus::Response] The HTTP response of the GET request.
|
199
204
|
def resolve(url, follow_external_redirects: true, host: nil)
|
200
|
-
|
201
|
-
|
205
|
+
response = nil
|
202
206
|
redirect_count = 0
|
203
|
-
|
207
|
+
total_net_time = 0.0
|
204
208
|
|
205
209
|
loop do
|
206
|
-
response =
|
207
|
-
|
210
|
+
response = get_response(url)
|
211
|
+
total_net_time += response.total_time if response.total_time
|
208
212
|
|
209
|
-
|
213
|
+
# Break unless it's a redirect.
|
214
|
+
break unless (response.code >= 300) && (response.code < 400)
|
215
|
+
|
216
|
+
# Handle response 'Location' header.
|
217
|
+
location = Wgit::Utils.fetch(response.headers, :location, '')
|
218
|
+
location = Wgit::Url.new(location)
|
210
219
|
raise 'Encountered redirect without Location header' if location.empty?
|
211
220
|
|
212
221
|
yield(url, response, location) if block_given?
|
213
222
|
|
223
|
+
# Handle redirect logic.
|
214
224
|
raise "External redirect not allowed - Redirected to: \
|
215
225
|
'#{location}', which is outside of host: '#{host}'" \
|
216
|
-
if !follow_external_redirects && !location.
|
226
|
+
if !follow_external_redirects && !location.relative?(host: host)
|
217
227
|
|
218
|
-
raise "Too many redirects: #{redirect_count}" \
|
228
|
+
raise "Too many redirects, exceeded: #{redirect_count}" \
|
219
229
|
if redirect_count >= @redirect_limit
|
220
230
|
|
221
231
|
redirect_count += 1
|
222
232
|
|
223
|
-
|
233
|
+
# Process the location to be crawled next.
|
234
|
+
location = url.to_base.concat(location) if location.relative?
|
224
235
|
url.replace(location) # Update the url on redirect.
|
225
236
|
end
|
226
237
|
|
238
|
+
response.options[:redirect_count] = redirect_count
|
239
|
+
response.options[:total_time] = total_net_time
|
240
|
+
|
241
|
+
response
|
242
|
+
end
|
243
|
+
|
244
|
+
# Performs a HTTP GET request and returns the response.
|
245
|
+
#
|
246
|
+
# @param url [String] The url to GET. Will call url#normalize if possible.
|
247
|
+
# @raise [StandardError] If a response can't be obtained.
|
248
|
+
# @return [Typhoeus::Response] The HTTP response of the GET request.
|
249
|
+
def get_response(url)
|
250
|
+
url = url.normalize if url.respond_to?(:normalize)
|
251
|
+
|
252
|
+
opts = {
|
253
|
+
followlocation: false, timeout: @time_out, accept_encoding: 'gzip'
|
254
|
+
}
|
255
|
+
|
256
|
+
response = Typhoeus.get(url, opts)
|
257
|
+
|
258
|
+
# Handle response status code.
|
259
|
+
raise "No response (within timeout: #{@time_out} second(s))" \
|
260
|
+
if response.code == 0
|
261
|
+
|
227
262
|
response
|
228
263
|
end
|
229
264
|
|
@@ -23,6 +23,8 @@ module Wgit
|
|
23
23
|
# Initializes a connected database client using the provided
|
24
24
|
# connection_string or ENV['WGIT_CONNECTION_STRING'].
|
25
25
|
#
|
26
|
+
# @param connection_string [String] The connection string needed to connect
|
27
|
+
# to the database.
|
26
28
|
# @raise [StandardError] If a connection string isn't provided, either as a
|
27
29
|
# parameter or via the environment.
|
28
30
|
def initialize(connection_string = nil)
|
@@ -36,6 +38,10 @@ module Wgit
|
|
36
38
|
|
37
39
|
# A class alias for Database.new.
|
38
40
|
#
|
41
|
+
# @param connection_string [String] The connection string needed to connect
|
42
|
+
# to the database.
|
43
|
+
# @raise [StandardError] If a connection string isn't provided, either as a
|
44
|
+
# parameter or via the environment.
|
39
45
|
# @return [Wgit::Database] The connected database client.
|
40
46
|
def self.connect(connection_string = nil)
|
41
47
|
new(connection_string)
|
@@ -43,6 +49,8 @@ module Wgit
|
|
43
49
|
|
44
50
|
# Initializes a connected database client using the connection string.
|
45
51
|
#
|
52
|
+
# @param connection_string [String] The connection string needed to connect
|
53
|
+
# to the database.
|
46
54
|
# @raise [StandardError] If a connection cannot be established.
|
47
55
|
# @return [Mongo::Client] The connected MongoDB client.
|
48
56
|
def self.establish_connection(connection_string)
|
data/lib/wgit/document.rb
CHANGED
@@ -224,11 +224,11 @@ module Wgit
|
|
224
224
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
225
225
|
# 'http://example.com/public'.
|
226
226
|
def base_url(link: nil)
|
227
|
-
get_base = -> { @base.
|
227
|
+
get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
|
228
228
|
|
229
229
|
if link
|
230
230
|
link = Wgit::Url.new(link)
|
231
|
-
raise "link must be relative: #{link}" unless link.
|
231
|
+
raise "link must be relative: #{link}" unless link.relative?
|
232
232
|
|
233
233
|
if link.is_anchor? || link.is_query?
|
234
234
|
base_url = @base ? get_base.call : @url
|
@@ -339,7 +339,7 @@ module Wgit
|
|
339
339
|
return [] if @links.empty?
|
340
340
|
|
341
341
|
links = @links
|
342
|
-
.select { |link| link.
|
342
|
+
.select { |link| link.relative?(host: @url.to_base) }
|
343
343
|
.map(&:without_base)
|
344
344
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
345
345
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
@@ -365,7 +365,7 @@ module Wgit
|
|
365
365
|
return [] if @links.empty?
|
366
366
|
|
367
367
|
links = @links
|
368
|
-
.reject { |link| link.
|
368
|
+
.reject { |link| link.relative?(host: @url.to_base) }
|
369
369
|
.map(&:without_trailing_slash)
|
370
370
|
|
371
371
|
Wgit::Utils.process_arr(links)
|
@@ -597,6 +597,7 @@ module Wgit
|
|
597
597
|
end
|
598
598
|
end
|
599
599
|
|
600
|
+
alias statistics stats
|
600
601
|
alias internal_urls internal_links
|
601
602
|
alias internal_absolute_urls internal_absolute_links
|
602
603
|
alias external_urls external_links
|
data/lib/wgit/url.rb
CHANGED
@@ -189,7 +189,7 @@ protocol: #{url}" unless url.to_base
|
|
189
189
|
# @return [Wgit::Url] self + separator + path, separator depends on path.
|
190
190
|
def concat(path)
|
191
191
|
path = Wgit::Url.new(path)
|
192
|
-
raise 'path must be relative' unless path.
|
192
|
+
raise 'path must be relative' unless path.relative?
|
193
193
|
|
194
194
|
path = path.without_leading_slash
|
195
195
|
separator = path.start_with?('#') || path.start_with?('?') ? '' : '/'
|
data/lib/wgit/utils.rb
CHANGED
@@ -47,6 +47,32 @@ module Wgit
|
|
47
47
|
obj_or_objs
|
48
48
|
end
|
49
49
|
|
50
|
+
# An improved Hash :fetch method which checks for multiple formats of the
|
51
|
+
# given key and returns the value, or the default value (nil unless
|
52
|
+
# provided).
|
53
|
+
#
|
54
|
+
# For example, if key == :foo, hash is searched for:
|
55
|
+
# :foo, 'foo', 'Foo', 'FOO' in that order. The first value found is
|
56
|
+
# returned. If no value is found, the default value is returned.
|
57
|
+
#
|
58
|
+
# @param hash [Hash] The Hash to search within.
|
59
|
+
# @param key [Symbol, String] The key with which to search hash.
|
60
|
+
# @param default [Object] The default value to be returned if hash[key]
|
61
|
+
# doesn't exist.
|
62
|
+
# @return [Object] The value found at hash[key] or the default value.
|
63
|
+
def self.fetch(hash, key, default = nil)
|
64
|
+
key = key.to_s.downcase
|
65
|
+
|
66
|
+
# Try (in order): :foo, 'foo', 'Foo', 'FOO'.
|
67
|
+
[key.to_sym, key, key.capitalize, key.upcase].each do |k|
|
68
|
+
value = hash[k]
|
69
|
+
|
70
|
+
return value if value
|
71
|
+
end
|
72
|
+
|
73
|
+
default
|
74
|
+
end
|
75
|
+
|
50
76
|
# Formats the sentence (modifies the receiver) and returns its value.
|
51
77
|
# The formatting is essentially to shorten the sentence and ensure that
|
52
78
|
# the index is present somewhere in the sentence. Used for search query
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -53,61 +53,47 @@ dependencies:
|
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 1.10.3
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: typhoeus
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
62
|
-
type: :
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '10.0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: dotenv
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - "~>"
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '2.5'
|
76
|
-
type: :development
|
61
|
+
version: 1.3.1
|
62
|
+
type: :runtime
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
66
|
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
68
|
+
version: 1.3.1
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
70
|
+
name: byebug
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
86
72
|
requirements:
|
87
73
|
- - "~>"
|
88
74
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
75
|
+
version: '10.0'
|
90
76
|
type: :development
|
91
77
|
prerelease: false
|
92
78
|
version_requirements: !ruby/object:Gem::Requirement
|
93
79
|
requirements:
|
94
80
|
- - "~>"
|
95
81
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
82
|
+
version: '10.0'
|
97
83
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
84
|
+
name: dotenv
|
99
85
|
requirement: !ruby/object:Gem::Requirement
|
100
86
|
requirements:
|
101
87
|
- - "~>"
|
102
88
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
89
|
+
version: '2.5'
|
104
90
|
type: :development
|
105
91
|
prerelease: false
|
106
92
|
version_requirements: !ruby/object:Gem::Requirement
|
107
93
|
requirements:
|
108
94
|
- - "~>"
|
109
95
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
96
|
+
version: '2.5'
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: maxitest
|
113
99
|
requirement: !ruby/object:Gem::Requirement
|