wgit 0.7.0 → 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +74 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -290
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +219 -79
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +226 -143
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +21 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +11 -8
- data/lib/wgit/url.rb +192 -61
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/bin/wgit
CHANGED
@@ -2,18 +2,22 @@
|
|
2
2
|
|
3
3
|
require 'wgit'
|
4
4
|
|
5
|
-
# Eval .wgit.rb file (if it exists).
|
6
|
-
def eval_wgit
|
7
|
-
puts 'Searching for .wgit.rb in local and home directories...'
|
5
|
+
# Eval .wgit.rb file (if it exists somewhere).
|
6
|
+
def eval_wgit(filepath = nil)
|
7
|
+
puts 'Searching for .wgit.rb file in local and home directories...'
|
8
8
|
|
9
|
-
[
|
9
|
+
[filepath, Dir.pwd, Dir.home].each do |dir|
|
10
10
|
path = "#{dir}/.wgit.rb"
|
11
11
|
next unless File.exist?(path)
|
12
12
|
|
13
|
-
puts "Eval'ing #{path}
|
13
|
+
puts "Eval'ing #{path}"
|
14
|
+
puts 'Call `eval_wgit` after changes to re-eval the file'
|
14
15
|
eval(File.read(path))
|
16
|
+
|
15
17
|
break
|
16
18
|
end
|
19
|
+
|
20
|
+
nil
|
17
21
|
end
|
18
22
|
|
19
23
|
eval_wgit
|
data/lib/wgit/assertable.rb
CHANGED
@@ -6,7 +6,7 @@ module Wgit
|
|
6
6
|
# Default type fail message.
|
7
7
|
DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
|
8
8
|
# Wrong method message.
|
9
|
-
|
9
|
+
NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
|
10
10
|
# Default duck fail message.
|
11
11
|
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
|
12
12
|
# Default required keys message.
|
@@ -42,7 +42,7 @@ present: %s"
|
|
42
42
|
# @raise [StandardError] If the assertion fails.
|
43
43
|
# @return [Object] The given arr on successful assertion.
|
44
44
|
def assert_arr_types(arr, type_or_types, msg = nil)
|
45
|
-
raise
|
45
|
+
raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
|
46
46
|
|
47
47
|
arr.each { |obj| assert_types(obj, type_or_types, msg) }
|
48
48
|
end
|
@@ -56,7 +56,7 @@ present: %s"
|
|
56
56
|
# @raise [StandardError] If the assertion fails.
|
57
57
|
# @return [Object] The given obj_or_objs on successful assertion.
|
58
58
|
def assert_respond_to(obj_or_objs, methods, msg = nil)
|
59
|
-
methods =
|
59
|
+
methods = *methods
|
60
60
|
|
61
61
|
if obj_or_objs.respond_to?(:each)
|
62
62
|
obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
|
data/lib/wgit/base.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Wgit
|
2
|
+
# Class to inherit from, as an alternative form of using the `Wgit::DSL`.
|
3
|
+
# All subclasses must define a `#parse(doc, &block)` method.
|
4
|
+
class Base
|
5
|
+
extend Wgit::DSL
|
6
|
+
|
7
|
+
# Runs the crawl/index passing each crawled `Wgit::Document` and the given
|
8
|
+
# block to the subclass's `#parse` method.
|
9
|
+
def self.run(&block)
|
10
|
+
obj = new
|
11
|
+
unless obj.respond_to?(:parse)
|
12
|
+
raise "#{obj.class} must respond_to? #parse(doc, &block)"
|
13
|
+
end
|
14
|
+
|
15
|
+
crawl_method = @method || :crawl
|
16
|
+
send(crawl_method) { |doc| obj.parse(doc, &block) }
|
17
|
+
|
18
|
+
obj
|
19
|
+
end
|
20
|
+
|
21
|
+
# Sets the crawl/index method to call when `Base.run` is called.
|
22
|
+
# The mode method must match one defined in the `Wgit::Crawler` or
|
23
|
+
# `Wgit::Indexer` class.
|
24
|
+
#
|
25
|
+
# @param method [Symbol] The crawl/index method to call.
|
26
|
+
def self.mode(method)
|
27
|
+
@method = method
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/wgit/core_ext.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -6,23 +6,33 @@ require_relative 'utils'
|
|
6
6
|
require_relative 'assertable'
|
7
7
|
require_relative 'response'
|
8
8
|
require 'set'
|
9
|
+
require 'benchmark'
|
9
10
|
require 'typhoeus'
|
11
|
+
require 'ferrum'
|
10
12
|
|
11
13
|
module Wgit
|
12
|
-
# The Crawler class provides a means of crawling web based HTTP Wgit::Url
|
13
|
-
# serialising their HTML into Wgit::Document instances. This is the
|
14
|
-
# class
|
14
|
+
# The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
|
15
|
+
# and serialising their HTML into `Wgit::Document` instances. This is the
|
16
|
+
# only Wgit class containing network logic (HTTP request/response handling).
|
15
17
|
class Crawler
|
16
18
|
include Assertable
|
17
19
|
|
18
|
-
#
|
19
|
-
|
20
|
-
# doesn't keep the crawl of the site going. All URLs without a file
|
21
|
-
# extension will be crawled, because they're assumed to be HTML.
|
22
|
-
SUPPORTED_FILE_EXTENSIONS = Set.new(
|
20
|
+
# Set of supported file extensions for Wgit::Crawler#crawl_site.
|
21
|
+
@supported_file_extensions = Set.new(
|
23
22
|
%w[asp aspx cfm cgi htm html htmlx jsp php]
|
24
23
|
)
|
25
24
|
|
25
|
+
class << self
|
26
|
+
# The URL file extensions (from `<a>` hrefs) which will be crawled by
|
27
|
+
# `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
|
28
|
+
# doesn't keep the crawl of the site going. All URLs without a file
|
29
|
+
# extension will be crawled, because they're assumed to be HTML.
|
30
|
+
# The `#crawl` method will crawl anything since it's given the URL(s).
|
31
|
+
# You can add your own site's URL file extension e.g.
|
32
|
+
# `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
|
33
|
+
attr_reader :supported_file_extensions
|
34
|
+
end
|
35
|
+
|
26
36
|
# The amount of allowed redirects before raising an error. Set to 0 to
|
27
37
|
# disable redirects completely; or you can pass `follow_redirects: false`
|
28
38
|
# to any Wgit::Crawler.crawl_* method.
|
@@ -30,12 +40,21 @@ module Wgit
|
|
30
40
|
|
31
41
|
# The maximum amount of time (in seconds) a crawl request has to complete
|
32
42
|
# before raising an error. Set to 0 to disable time outs completely.
|
33
|
-
attr_accessor :
|
43
|
+
attr_accessor :timeout
|
34
44
|
|
35
45
|
# Whether or not to UTF-8 encode the response body once crawled. Set to
|
36
46
|
# false if crawling more than just HTML e.g. images.
|
37
47
|
attr_accessor :encode
|
38
48
|
|
49
|
+
# Whether or not to parse the Javascript of the crawled document.
|
50
|
+
# Parsing requires Chrome/Chromium to be installed and in $PATH.
|
51
|
+
attr_accessor :parse_javascript
|
52
|
+
|
53
|
+
# The delay between checks in a page's HTML size. When the page has stopped
|
54
|
+
# "growing", the Javascript has finished dynamically updating the DOM.
|
55
|
+
# The value should balance between a good UX and enough JS parse time.
|
56
|
+
attr_accessor :parse_javascript_delay
|
57
|
+
|
39
58
|
# The Wgit::Response of the most recently crawled URL.
|
40
59
|
attr_reader :last_response
|
41
60
|
|
@@ -43,20 +62,27 @@ module Wgit
|
|
43
62
|
#
|
44
63
|
# @param redirect_limit [Integer] The amount of allowed redirects before
|
45
64
|
# raising an error. Set to 0 to disable redirects completely.
|
46
|
-
# @param
|
65
|
+
# @param timeout [Integer, Float] The maximum amount of time (in seconds)
|
47
66
|
# a crawl request has to complete before raising an error. Set to 0 to
|
48
67
|
# disable time outs completely.
|
49
68
|
# @param encode [Boolean] Whether or not to UTF-8 encode the response body
|
50
69
|
# once crawled. Set to false if crawling more than just HTML e.g. images.
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
70
|
+
# @param parse_javascript [Boolean] Whether or not to parse the Javascript
|
71
|
+
# of the crawled document. Parsing requires Chrome/Chromium to be
|
72
|
+
# installed and in $PATH.
|
73
|
+
def initialize(redirect_limit: 5, timeout: 5, encode: true,
|
74
|
+
parse_javascript: false, parse_javascript_delay: 1)
|
75
|
+
@redirect_limit = redirect_limit
|
76
|
+
@timeout = timeout
|
77
|
+
@encode = encode
|
78
|
+
@parse_javascript = parse_javascript
|
79
|
+
@parse_javascript_delay = parse_javascript_delay
|
55
80
|
end
|
56
81
|
|
57
82
|
# Crawls an entire website's HTML pages by recursively going through
|
58
|
-
# its internal `<a>` links
|
59
|
-
#
|
83
|
+
# its internal `<a>` links; this can be overridden with `follow: xpath`.
|
84
|
+
# Each crawled Document is yielded to a block. Use `doc.empty?` to
|
85
|
+
# determine if the crawled link was successful / is valid.
|
60
86
|
#
|
61
87
|
# Use the allow and disallow paths params to partially and selectively
|
62
88
|
# crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
|
@@ -74,26 +100,36 @@ module Wgit
|
|
74
100
|
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
75
101
|
# It is recommended that this URL be the index page of the site to give a
|
76
102
|
# greater chance of finding all pages within that site/host.
|
77
|
-
# @param
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
103
|
+
# @param follow [String] The xpath extracting links to be followed during
|
104
|
+
# the crawl. This changes how a site is crawled. Only links pointing to
|
105
|
+
# the site domain are allowed. The `:default` is any `<a>` href returning
|
106
|
+
# HTML.
|
107
|
+
# @param allow_paths [String, Array<String>] Filters the `follow:` links by
|
108
|
+
# selecting them if their path `File.fnmatch?` one of allow_paths.
|
109
|
+
# @param disallow_paths [String, Array<String>] Filters the `follow` links
|
110
|
+
# by rejecting them if their path `File.fnmatch?` one of disallow_paths.
|
81
111
|
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
82
112
|
# A block is the only way to interact with each crawled Document.
|
83
113
|
# Use `doc.empty?` to determine if the page is valid.
|
84
114
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
85
115
|
# from all of the site's pages or nil if the given url could not be
|
86
116
|
# crawled successfully.
|
87
|
-
def crawl_site(
|
117
|
+
def crawl_site(
|
118
|
+
url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
|
119
|
+
)
|
88
120
|
doc = crawl_url(url, &block)
|
89
121
|
return nil if doc.nil?
|
90
122
|
|
91
|
-
|
123
|
+
link_opts = {
|
124
|
+
xpath: follow,
|
125
|
+
allow_paths: allow_paths,
|
126
|
+
disallow_paths: disallow_paths
|
127
|
+
}
|
92
128
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
93
129
|
|
94
130
|
crawled = Set.new([url, alt_url])
|
95
131
|
externals = Set.new(doc.external_links)
|
96
|
-
internals = Set.new(
|
132
|
+
internals = Set.new(next_internal_links(doc, **link_opts))
|
97
133
|
|
98
134
|
return externals.to_a if internals.empty?
|
99
135
|
|
@@ -108,7 +144,7 @@ module Wgit
|
|
108
144
|
crawled += [orig_link, link] # Push both links in case of redirects.
|
109
145
|
next if doc.nil?
|
110
146
|
|
111
|
-
internals +=
|
147
|
+
internals += next_internal_links(doc, **link_opts)
|
112
148
|
externals += doc.external_links
|
113
149
|
end
|
114
150
|
end
|
@@ -123,10 +159,11 @@ module Wgit
|
|
123
159
|
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
124
160
|
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
125
161
|
# e.g. :host only allows redirects within the same host. Choose from
|
126
|
-
# :
|
162
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
127
163
|
# This value will be used for all urls crawled.
|
128
164
|
# @yield [doc] Given each crawled page (Wgit::Document); this is the only
|
129
|
-
# way to interact with them.
|
165
|
+
# way to interact with them. Use `doc.empty?` to determine if the page
|
166
|
+
# is valid.
|
130
167
|
# @raise [StandardError] If no urls are provided.
|
131
168
|
# @return [Wgit::Document] The last Document crawled.
|
132
169
|
def crawl_urls(*urls, follow_redirects: true, &block)
|
@@ -135,7 +172,7 @@ module Wgit
|
|
135
172
|
opts = { follow_redirects: follow_redirects }
|
136
173
|
doc = nil
|
137
174
|
|
138
|
-
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
|
175
|
+
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
|
139
176
|
|
140
177
|
doc
|
141
178
|
end
|
@@ -143,13 +180,15 @@ module Wgit
|
|
143
180
|
# Crawl the url returning the response Wgit::Document or nil, if an error
|
144
181
|
# occurs.
|
145
182
|
#
|
146
|
-
# @param url [Wgit::Url] The Url to crawl; which will
|
183
|
+
# @param url [Wgit::Url] The Url to crawl, which will be modified in the
|
184
|
+
# event of a redirect.
|
147
185
|
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
148
186
|
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
149
187
|
# e.g. :host only allows redirects within the same host. Choose from
|
150
|
-
# :
|
188
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
151
189
|
# @yield [doc] The crawled HTML page (Wgit::Document) regardless of whether the
|
152
190
|
# crawl was successful or not. Therefore, Document#url etc. can be used.
|
191
|
+
# Use `doc.empty?` to determine if the page is valid.
|
153
192
|
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
154
193
|
# crawl was unsuccessful.
|
155
194
|
def crawl_url(url, follow_redirects: true)
|
@@ -167,16 +206,19 @@ module Wgit
|
|
167
206
|
|
168
207
|
protected
|
169
208
|
|
170
|
-
# Returns the
|
209
|
+
# Returns the URL's HTML String or nil. Handles any errors that arise
|
171
210
|
# and sets the @last_response. Errors or any HTTP response that doesn't
|
172
211
|
# return a HTML body will be ignored, returning nil.
|
173
212
|
#
|
213
|
+
# If @parse_javascript is true, then the final resolved URL will be browsed
|
214
|
+
# to and Javascript parsed allowing for dynamic HTML generation.
|
215
|
+
#
|
174
216
|
# @param url [Wgit::Url] The URL to fetch. This Url object is passed by
|
175
217
|
# reference and gets modified as a result of the fetch/crawl.
|
176
218
|
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
177
219
|
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
178
220
|
# e.g. :host only allows redirects within the same host. Choose from
|
179
|
-
# :
|
221
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
180
222
|
# @raise [StandardError] If url isn't valid and absolute.
|
181
223
|
# @return [String, nil] The crawled HTML or nil if the crawl was
|
182
224
|
# unsuccessful.
|
@@ -185,6 +227,8 @@ module Wgit
|
|
185
227
|
raise "Invalid url: #{url}" if url.invalid?
|
186
228
|
|
187
229
|
resolve(url, response, follow_redirects: follow_redirects)
|
230
|
+
get_browser_response(url, response) if @parse_javascript
|
231
|
+
|
188
232
|
response.body_or_nil
|
189
233
|
rescue StandardError => e
|
190
234
|
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
|
@@ -206,14 +250,14 @@ module Wgit
|
|
206
250
|
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
207
251
|
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
208
252
|
# e.g. :host only allows redirects within the same host. Choose from
|
209
|
-
# :
|
253
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
210
254
|
# @raise [StandardError] If a redirect isn't allowed etc.
|
211
255
|
def resolve(url, response, follow_redirects: true)
|
212
|
-
|
256
|
+
origin = url.to_url.to_origin # Recorded before any redirects.
|
213
257
|
follow_redirects, within = redirect?(follow_redirects)
|
214
258
|
|
215
259
|
loop do
|
216
|
-
|
260
|
+
get_http_response(url, response)
|
217
261
|
break unless response.redirect?
|
218
262
|
|
219
263
|
# Handle response 'Location' header.
|
@@ -225,7 +269,7 @@ module Wgit
|
|
225
269
|
# Validate if the redirect is allowed.
|
226
270
|
raise "Redirect not allowed: #{location}" unless follow_redirects
|
227
271
|
|
228
|
-
if within && !location.relative?(within =>
|
272
|
+
if within && !location.relative?(within => origin)
|
229
273
|
raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
|
230
274
|
end
|
231
275
|
|
@@ -233,7 +277,7 @@ module Wgit
|
|
233
277
|
if response.redirect_count >= @redirect_limit
|
234
278
|
|
235
279
|
# Process the location to be crawled next.
|
236
|
-
location = url.
|
280
|
+
location = url.to_origin.concat(location) if location.relative?
|
237
281
|
response.redirections[url.to_s] = location.to_s
|
238
282
|
url.replace(location) # Update the url on redirect.
|
239
283
|
end
|
@@ -246,7 +290,7 @@ module Wgit
|
|
246
290
|
# reference.
|
247
291
|
# @raise [StandardError] If a response can't be obtained.
|
248
292
|
# @return [Wgit::Response] The enriched HTTP Wgit::Response object.
|
249
|
-
def
|
293
|
+
def get_http_response(url, response)
|
250
294
|
# Perform a HTTP GET request.
|
251
295
|
orig_url = url.to_s
|
252
296
|
url = url.normalize if url.respond_to?(:normalize)
|
@@ -263,10 +307,40 @@ module Wgit
|
|
263
307
|
response.add_total_time(http_response.total_time)
|
264
308
|
|
265
309
|
# Log the request/response details.
|
266
|
-
|
310
|
+
log_net(:http, response, http_response.total_time)
|
267
311
|
|
268
312
|
# Handle a failed response.
|
269
|
-
raise "No response (within timeout: #{@
|
313
|
+
raise "No response (within timeout: #{@timeout} second(s))" \
|
314
|
+
if response.failure?
|
315
|
+
end
|
316
|
+
|
317
|
+
# Makes a browser request and enriches the given Wgit::Response from it.
|
318
|
+
#
|
319
|
+
# @param url [String] The url to browse to. Will call url#normalize if
|
320
|
+
# possible.
|
321
|
+
# @param response [Wgit::Response] The response to enrich. Modifies by
|
322
|
+
# reference.
|
323
|
+
# @raise [StandardError] If a response can't be obtained.
|
324
|
+
# @return [Wgit::Response] The enriched HTTP Wgit::Response object.
|
325
|
+
def get_browser_response(url, response)
|
326
|
+
url = url.normalize if url.respond_to?(:normalize)
|
327
|
+
browser = nil
|
328
|
+
|
329
|
+
crawl_time = Benchmark.measure { browser = browser_get(url) }.real
|
330
|
+
yield browser if block_given?
|
331
|
+
|
332
|
+
# Enrich the given Wgit::Response object (on top of Typhoeus response).
|
333
|
+
response.adapter_response = browser.network.response
|
334
|
+
response.status = browser.network.response.status
|
335
|
+
response.headers = browser.network.response.headers
|
336
|
+
response.body = browser.body
|
337
|
+
response.add_total_time(crawl_time)
|
338
|
+
|
339
|
+
# Log the request/response details.
|
340
|
+
log_net(:browser, response, crawl_time)
|
341
|
+
|
342
|
+
# Handle a failed response.
|
343
|
+
raise "No browser response (within timeout: #{@timeout} second(s))" \
|
270
344
|
if response.failure?
|
271
345
|
end
|
272
346
|
|
@@ -277,7 +351,7 @@ module Wgit
|
|
277
351
|
def http_get(url)
|
278
352
|
opts = {
|
279
353
|
followlocation: false,
|
280
|
-
timeout: @
|
354
|
+
timeout: @timeout,
|
281
355
|
accept_encoding: 'gzip',
|
282
356
|
headers: {
|
283
357
|
'User-Agent' => "wgit/#{Wgit::VERSION}",
|
@@ -286,35 +360,58 @@ module Wgit
|
|
286
360
|
}
|
287
361
|
|
288
362
|
# See https://rubydoc.info/gems/typhoeus for more info.
|
289
|
-
Typhoeus.get(url, opts)
|
363
|
+
Typhoeus.get(url, **opts)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Performs a HTTP GET request in a web browser and parses the response JS
|
367
|
+
# before returning the HTML body of the fully rendered webpage. This allows
|
368
|
+
# Javascript (SPA apps etc.) to generate HTML dynamically.
|
369
|
+
#
|
370
|
+
# @param url [String] The url to browse to.
|
371
|
+
# @return [Ferrum::Browser] The browser response object.
|
372
|
+
def browser_get(url)
|
373
|
+
@browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
|
374
|
+
@browser.goto(url)
|
375
|
+
|
376
|
+
# Wait for the page's JS to finish dynamically manipulating the DOM.
|
377
|
+
html = @browser.body
|
378
|
+
loop do
|
379
|
+
sleep @parse_javascript_delay
|
380
|
+
break if html.size == @browser.body.size
|
381
|
+
|
382
|
+
html = @browser.body
|
383
|
+
end
|
384
|
+
|
385
|
+
@browser
|
290
386
|
end
|
291
387
|
|
292
388
|
# Returns a doc's internal HTML page links in absolute form; used when
|
293
|
-
# crawling a site.
|
294
|
-
#
|
295
|
-
# `'wiki/\*'` etc. Note that each path should NOT start with a slash.
|
389
|
+
# crawling a site. By default, any `<a>` href returning HTML is returned;
|
390
|
+
# override this with `xpath:` if desired.
|
296
391
|
#
|
297
|
-
#
|
298
|
-
#
|
299
|
-
#
|
300
|
-
# files containing `<a>` links keep the crawl going beyond the base URL.
|
392
|
+
# Use the allow and disallow paths params to partially and selectively
|
393
|
+
# crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
|
394
|
+
# that each path should NOT start with a slash.
|
301
395
|
#
|
302
396
|
# @param doc [Wgit::Document] The document from which to extract its
|
303
397
|
# internal (absolute) page links.
|
398
|
+
# @param xpath [String] The xpath selecting links to be returned. Only
|
399
|
+
# links pointing to the doc.url domain are allowed. The :default is any
|
400
|
+
# <a> href returning HTML. The allow/disallow paths will be applied to
|
401
|
+
# the returned value.
|
304
402
|
# @param allow_paths [String, Array<String>] Filters links by selecting
|
305
403
|
# them if their path `File.fnmatch?` one of allow_paths.
|
306
404
|
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
307
405
|
# them if their path `File.fnmatch?` one of disallow_paths.
|
308
406
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
309
|
-
def
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
end
|
407
|
+
def next_internal_links(
|
408
|
+
doc, xpath: :default, allow_paths: nil, disallow_paths: nil
|
409
|
+
)
|
410
|
+
links = if xpath && xpath != :default
|
411
|
+
follow_xpath(doc, xpath)
|
412
|
+
else
|
413
|
+
follow_default(doc)
|
414
|
+
end
|
318
415
|
|
319
416
|
return links if allow_paths.nil? && disallow_paths.nil?
|
320
417
|
|
@@ -323,29 +420,40 @@ module Wgit
|
|
323
420
|
|
324
421
|
private
|
325
422
|
|
326
|
-
# Returns
|
327
|
-
#
|
328
|
-
|
329
|
-
|
423
|
+
# Returns the next links used to continue crawling a site. The xpath value
|
424
|
+
# is used to obtain the links. Any valid URL Strings will be converted into
|
425
|
+
# absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
|
426
|
+
# pointing to the site domain will raise an error.
|
427
|
+
def follow_xpath(doc, xpath)
|
428
|
+
links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
|
429
|
+
urls
|
430
|
+
.map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
|
431
|
+
.compact
|
432
|
+
end
|
330
433
|
|
331
|
-
|
332
|
-
raise
|
333
|
-
#{follow_redirects}"
|
434
|
+
if links.any? { |link| link.to_domain != doc.url.to_domain }
|
435
|
+
raise 'The links to follow must be within the site domain'
|
334
436
|
end
|
335
437
|
|
336
|
-
|
438
|
+
links
|
337
439
|
end
|
338
440
|
|
339
|
-
#
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
441
|
+
# Returns the default set of links used to continue crawling a site.
|
442
|
+
# By default, any <a> href returning HTML and pointing to the same domain
|
443
|
+
# will get returned.
|
444
|
+
def follow_default(doc)
|
445
|
+
doc
|
446
|
+
.internal_absolute_links
|
447
|
+
.map(&:omit_fragment) # Because fragments don't alter content.
|
448
|
+
.uniq
|
449
|
+
.select do |link| # Whitelist only HTML content.
|
450
|
+
ext = link.to_extension
|
451
|
+
if ext
|
452
|
+
Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
|
453
|
+
else
|
454
|
+
true # URLs without an extension are assumed HTML.
|
455
|
+
end
|
456
|
+
end
|
349
457
|
end
|
350
458
|
|
351
459
|
# Validate and filter by the given URL paths.
|
@@ -365,14 +473,17 @@ module Wgit
|
|
365
473
|
|
366
474
|
# Validate the paths are suitable for filtering.
|
367
475
|
def validate_paths(paths)
|
368
|
-
paths =
|
476
|
+
paths = *paths
|
369
477
|
raise 'The provided paths must all be Strings' \
|
370
478
|
unless paths.all? { |path| path.is_a?(String) }
|
371
479
|
|
372
|
-
Wgit::Utils.
|
480
|
+
Wgit::Utils.sanitize(paths, encode: false)
|
373
481
|
raise 'The provided paths cannot be empty' if paths.empty?
|
374
482
|
|
375
|
-
paths
|
483
|
+
paths.map do |path|
|
484
|
+
path = Wgit::Url.parse(path)
|
485
|
+
path.index? ? path : path.omit_slashes
|
486
|
+
end
|
376
487
|
end
|
377
488
|
|
378
489
|
# Filters links by selecting/rejecting them based on their path.
|
@@ -380,7 +491,7 @@ module Wgit
|
|
380
491
|
def filter_links(links, filter_method, paths)
|
381
492
|
links.send(filter_method) do |link|
|
382
493
|
# Turn http://example.com into / meaning index.
|
383
|
-
link = link.to_endpoint
|
494
|
+
link = link.to_endpoint.index? ? '/' : link.omit_base
|
384
495
|
|
385
496
|
match = false
|
386
497
|
paths.each do |pattern|
|
@@ -392,6 +503,35 @@ module Wgit
|
|
392
503
|
end
|
393
504
|
end
|
394
505
|
|
506
|
+
# Returns whether or not to follow redirects, and within what context e.g.
|
507
|
+
# :host, :domain etc.
|
508
|
+
def redirect?(follow_redirects)
|
509
|
+
return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
|
510
|
+
|
511
|
+
unless [true, false].include?(follow_redirects)
|
512
|
+
raise "follow_redirects: must be a Boolean or Symbol, not: \
|
513
|
+
#{follow_redirects}"
|
514
|
+
end
|
515
|
+
|
516
|
+
[follow_redirects, nil]
|
517
|
+
end
|
518
|
+
|
519
|
+
# Log (at debug level) the network request/response details.
|
520
|
+
def log_net(client, response, duration)
|
521
|
+
resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
|
522
|
+
log_status = (response.status || 0)
|
523
|
+
log_total_time = (duration || 0.0).truncate(3)
|
524
|
+
|
525
|
+
# The browser's request URL is the same, so ignore it.
|
526
|
+
if client.to_sym == :http
|
527
|
+
Wgit.logger.debug("[#{client}] Request: #{response.url}")
|
528
|
+
end
|
529
|
+
|
530
|
+
Wgit.logger.debug(
|
531
|
+
format(resp_template, log_status, response.size, log_total_time)
|
532
|
+
)
|
533
|
+
end
|
534
|
+
|
395
535
|
alias crawl crawl_urls
|
396
536
|
alias crawl_pages crawl_urls
|
397
537
|
alias crawl_page crawl_url
|