wgit 0.7.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/bin/wgit
CHANGED
@@ -2,18 +2,22 @@
 
 require 'wgit'
 
-# Eval .wgit.rb file (if it exists).
-def eval_wgit
-  puts 'Searching for .wgit.rb in local and home directories...'
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'
 
-  [
+  [filepath, Dir.pwd, Dir.home].each do |dir|
    path = "#{dir}/.wgit.rb"
    next unless File.exist?(path)
 
-    puts "Eval'ing #{path}
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
    eval(File.read(path))
+
    break
  end
+
+  nil
 end
 
 eval_wgit
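For context, `.wgit.rb` is eval'd into the interactive `wgit` console on start-up (and can now also be loaded from an explicit filepath). A minimal sketch of such a file; the `quick_crawl` helper below is purely illustrative and not part of the gem:

```ruby
# ~/.wgit.rb -- eval'd by the `wgit` executable on start-up and re-eval'd
# via `eval_wgit`. The helper method is a hypothetical example.
def quick_crawl(url)
  Wgit::Crawler.new.crawl_url(Wgit::Url.new(url)) do |doc|
    puts doc.empty? ? "Failed to crawl #{url}" : doc.title
  end
end
```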
data/lib/wgit/assertable.rb
CHANGED
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
-      raise
+      raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
 
       arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
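A quick sketch of the new guard in action (an assumed IRB session; the values are illustrative):

```ruby
require 'wgit'

include Wgit::Assertable

assert_arr_types([1, 2, 3], Integer) # => [1, 2, 3]
assert_arr_types(123, Integer)
# => raises "Expected an Enumerable responding to #each, not: Integer"
```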
@@ -56,7 +56,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
-      methods =
+      methods = *methods
 
       if obj_or_objs.respond_to?(:each)
         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
data/lib/wgit/base.rb
ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
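A hedged sketch of how the new `Wgit::Base` might be subclassed (the `start` call assumes the DSL method of that name from the added `dsl.rb`; the class name and URL are illustrative):

```ruby
require 'wgit'

class ExampleCrawler < Wgit::Base
  mode  :crawl_site            # Must match a Wgit::Crawler/Indexer method.
  start 'https://example.com'  # Assumes the DSL's `start` method.

  # Called by Base.run once per crawled Wgit::Document.
  def parse(doc, &_block)
    puts doc.title unless doc.empty?
  end
end

ExampleCrawler.run
```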
data/lib/wgit/core_ext.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -6,23 +6,33 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url
-  # serialising their HTML into Wgit::Document instances. This is the
-  # class
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
-    #
-
-    # doesn't keep the crawl of the site going. All URL's without a file
-    # extension will be crawled, because they're assumed to be HTML.
-    SUPPORTED_FILE_EXTENSIONS = Set.new(
+    # Set of supported file extensions for Wgit::Crawler#crawl_site.
+    @supported_file_extensions = Set.new(
       %w[asp aspx cfm cgi htm html htmlx jsp php]
     )
 
+    class << self
+      # The URL file extensions (from `<a>` hrefs) which will be crawled by
+      # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+      # doesn't keep the crawl of the site going. All URL's without a file
+      # extension will be crawled, because they're assumed to be HTML.
+      # The `#crawl` method will crawl anything since it's given the URL(s).
+      # You can add your own site's URL file extension e.g.
+      # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+      attr_reader :supported_file_extensions
+    end
+
     # The amount of allowed redirects before raising an error. Set to 0 to
     # disable redirects completely; or you can pass `follow_redirects: false`
     # to any Wgit::Crawler.crawl_* method.
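Per the comment above, extending the crawlable extensions at runtime might look like this (the `'html5'` value is the diff's own example):

```ruby
require 'wgit'

# Allow #crawl_site to follow links ending in a custom file extension.
Wgit::Crawler.supported_file_extensions << 'html5'
```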
@@ -30,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -43,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     #   raising an error. Set to 0 to disable redirects completely.
-    # @param
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     #   once crawled. Set to false if crawling more than just HTML e.g. images.
-
-
-
-
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    #   of the crawled document. Parsing requires Chrome/Chromium to be
+    #   installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links
-    #
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
@@ -74,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
-    def crawl_site(
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
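A hedged usage sketch of `#crawl_site` with the new `follow:` xpath and path filters (URL, xpath and glob are illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url = Wgit::Url.new('https://example.com')

# Follow only nav links, and of those only paths matching 'blog/*'.
externals = crawler.crawl_site(
  url, follow: '//nav//a/@href', allow_paths: 'blog/*'
) do |doc|
  puts doc.url unless doc.empty?
end

puts "#{externals&.size} external links collected"
```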
@@ -108,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
 
-          internals +=
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
@@ -123,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    #   way to interact with them.
+    #   way to interact with them. Use `doc.empty?` to determine if the page
+    #   is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -135,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -143,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    #   event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
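For instance, limiting redirects to the original host (a sketch; the URL is illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url = Wgit::Url.new('https://example.com')

# Redirects outside the original host aren't followed; the yielded doc will
# be empty in that case (check doc.empty?).
crawler.crawl_url(url, follow_redirects: :host) do |doc|
  puts doc.url # Updated in place if a redirect was followed.
end
```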
@@ -167,16 +206,19 @@ module Wgit
 
     protected
 
-    # Returns the
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
@@ -185,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -206,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-
+      origin = url.to_url.to_origin # Recorded before any redirects.
       follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -225,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within =>
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
 
@@ -233,7 +277,7 @@ module Wgit
           if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -246,7 +290,7 @@ module Wgit
     #   reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -263,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-
+      log_net(:http, response, http_response.total_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@
+      raise "No response (within timeout: #{@timeout} second(s))" \
+        if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    #   possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
+
+      # Handle a failed response.
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
         if response.failure?
     end
 
@@ -277,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -286,35 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site.
-    #
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    #
-    #
-    #
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     #   internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    #   links pointing to the doc.url domain are allowed. The :default is any
+    #   <a> href returning HTML. The allow/disallow paths will be applied to
+    #   the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def
-
-
-
-
-
-
-
-    end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
@@ -323,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns
-    #
-
-
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-
-      raise
-      #{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-
+      links
     end
 
-    #
-
-
-
-
-
-
-
-
-
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -365,14 +473,17 @@ module Wgit
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths =
+      paths = *paths
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -380,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
@@ -392,6 +503,35 @@ module Wgit
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url