wgit 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +39 -0
- data/LICENSE.txt +1 -1
- data/README.md +118 -323
- data/bin/wgit +9 -5
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +138 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +133 -31
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- metadata +26 -14
data/bin/wgit
CHANGED
@@ -2,18 +2,22 @@
 
 require 'wgit'
 
-# Eval .wgit.rb file (if it exists).
-def eval_wgit
-  puts 'Searching for .wgit.rb in local and home directories...'
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'
 
-  [
+  [filepath, Dir.pwd, Dir.home].each do |dir|
     path = "#{dir}/.wgit.rb"
     next unless File.exist?(path)
 
-    puts "Eval'ing #{path}
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
     eval(File.read(path))
+
     break
   end
+
+  nil
 end
 
 eval_wgit
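For context, the executable now accepts an explicit directory and re-evals on demand. A minimal sketch of a `.wgit.rb` file it could pick up (the file name and search locations come from the script above; the helper method itself is hypothetical):

  # ~/.wgit.rb -- eval'd by the wgit executable on startup (hypothetical contents).
  def fetch(url)
    crawler = Wgit::Crawler.new(timeout: 10)
    crawler.crawl_url(Wgit::Url.new(url))
  end

Inside the console session, `fetch('https://example.com')` would return a crawled `Wgit::Document` (or nil), and calling `eval_wgit` again (optionally with a directory, e.g. `eval_wgit('/some/dir')`) reloads the file after edits, as the new puts message suggests.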
data/lib/wgit.rb
CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
 require_relative 'wgit/utils'
 require_relative 'wgit/url'
 require_relative 'wgit/document'
-require_relative 'wgit/document_extensions'
+require_relative 'wgit/document_extractors'
 require_relative 'wgit/crawler'
 require_relative 'wgit/database/model'
 require_relative 'wgit/database/database'
 require_relative 'wgit/indexer'
+require_relative 'wgit/dsl'
+require_relative 'wgit/base'
 # require_relative 'wgit/core_ext' - Must be explicitly required.
data/lib/wgit/assertable.rb
CHANGED
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
-      raise
+      raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
 
       arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
@@ -56,7 +56,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
    def assert_respond_to(obj_or_objs, methods, msg = nil)
-      methods =
+      methods = *methods
 
       if obj_or_objs.respond_to?(:each)
         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
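A short, hypothetical usage sketch of the tightened assertions (mirroring how Wgit's own classes include Assertable; the class and variable names below are illustrative, not from the gem):

  class UrlBatch
    include Wgit::Assertable

    def initialize(urls)
      # Raises (NON_ENUMERABLE_MSG) unless urls responds to #each, then checks
      # every element is a String; returns urls on success.
      @urls = assert_arr_types(urls, String)

      # `methods = *methods` means a single Symbol works as well as an Array.
      assert_respond_to(urls, :to_s)
    end
  end

  UrlBatch.new(%w[https://example.com https://example.org]) # passes
  UrlBatch.new('not an enumerable')                         # raises StandardError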
data/lib/wgit/base.rb
ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
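In practice a subclass picks a crawl mode and implements `#parse`; whatever it yields is passed through to the block given to `run`. A minimal sketch (the `start` call comes from the newly added `Wgit::DSL`, which isn't shown in this section, so treat its exact signature as an assumption):

  require 'wgit'

  class TitleCrawler < Wgit::Base
    start 'https://example.com' # Wgit::DSL method (assumed) setting the crawl URL.
    mode  :crawl_site           # Call the crawl_site method when .run is invoked.

    # Called once per crawled Wgit::Document.
    def parse(doc, &block)
      yield doc.title unless doc.empty?
    end
  end

  TitleCrawler.run { |title| puts title }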
data/lib/wgit/crawler.rb
CHANGED
@@ -6,12 +6,14 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url
-  # serialising their HTML into Wgit::Document instances. This is the
-  # class
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
@@ -38,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -51,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     #   raising an error. Set to 0 to disable redirects completely.
-    # @param
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     #   a crawl request has to complete before raising an error. Set to 0 to
     #   disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     #   once crawled. Set to false if crawling more than just HTML e.g. images.
-
-
-
-
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    #   of the crawled document. Parsing requires Chrome/Chromium to be
+    #   installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links
-    #
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
@@ -82,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     #   It is recommended that this URL be the index page of the site to give a
     #   greater chance of finding all pages within that site/host.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    #   the crawl. This changes how a site is crawled. Only links pointing to
+    #   the site domain are allowed. The `:default` is any `<a>` href returning
+    #   HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    #   selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     #   A block is the only way to interact with each crawled Document.
     #   Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     #   from all of the site's pages or nil if the given url could not be
     #   crawled successfully.
-    def crawl_site(
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
@@ -116,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
 
-          internals +=
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
@@ -131,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     #   This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    #   way to interact with them.
+    #   way to interact with them. Use `doc.empty?` to determine if the page
+    #   is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -143,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -151,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    #   event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
+    #   Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     #   crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
@@ -175,16 +206,19 @@ module Wgit
 
     protected
 
-    # Returns the
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     #   reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     #   unsuccessful.
@@ -193,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -214,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     #   redirects. Pass a Symbol to limit where the redirect is allowed to go
     #   e.g. :host only allows redirects within the same host. Choose from
-    #   :
+    #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-
+      origin = url.to_url.to_origin # Recorded before any redirects.
      follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -233,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within =>
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
 
@@ -241,7 +277,7 @@ module Wgit
           if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -254,7 +290,7 @@ module Wgit
     #   reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -271,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-
+      log_net(:http, response, http_response.total_time)
+
+      # Handle a failed response.
+      raise "No response (within timeout: #{@timeout} second(s))" \
+        if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    #   possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    #   reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
         if response.failure?
     end
 
@@ -285,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -294,37 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site.
-    #
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    #
-    #
-    #
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     #   internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    #   links pointing to the doc.url domain are allowed. The :default is any
+    #   <a> href returning HTML. The allow/disallow paths will be applied to
+    #   the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     #   them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     #   them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def
-
-
-
-
-
-
-
-      Wgit::Crawler.supported_file_extensions.include?(ext.downcase) :
-      true # URLs without an extension are assumed HTML.
-    end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
@@ -333,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns
-    #
-
-
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-
-      raise
-#{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-
+      links
     end
 
-    #
-
-
-
-
-
-
-
-
-
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -375,14 +473,17 @@ module Wgit
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths =
+      paths = *paths
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -390,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
@@ -402,6 +503,35 @@ module Wgit
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url
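Pulling the crawler changes together, a usage sketch based on the parameters documented above (the site URL, xpath and paths are illustrative; `parse_javascript` needs Chrome/Chromium on $PATH as noted in the diff):

  require 'wgit'

  crawler = Wgit::Crawler.new(
    timeout: 10,               # seconds per request, 0 disables the timeout
    parse_javascript: true,    # render JS via a headless browser (Ferrum)
    parse_javascript_delay: 2  # seconds between HTML size checks while JS settles
  )

  externals = crawler.crawl_site(
    Wgit::Url.new('https://example.com'),
    follow: "//a[@class='next']/@href", # xpath of links to follow (same domain only)
    allow_paths: 'articles/*'           # glob filter, note: no leading slash
  ) do |doc|
    puts doc.url unless doc.empty?
  end

  puts "External links found: #{externals&.size}"
  puts "Last response status: #{crawler.last_response.status}"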