wgit 0.8.0 → 0.10.2
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/crawler.rb
CHANGED
@@ -6,12 +6,14 @@ require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
 require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'
 
 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url
-  # serialising their HTML into Wgit::Document instances. This is the
-  # class
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable
 
@@ -38,12 +40,21 @@ module Wgit
 
     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :
+    attr_accessor :timeout
 
     # Whether or not to UTF-8 encode the response body once crawled. Set to
     # false if crawling more than just HTML e.g. images.
     attr_accessor :encode
 
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -51,20 +62,27 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     # raising an error. Set to 0 to disable redirects completely.
-    # @param
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     # a crawl request has to complete before raising an error. Set to 0 to
     # disable time outs completely.
     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
     # once crawled. Set to false if crawling more than just HTML e.g. images.
-
-
-
-
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    # of the crawled document. Parsing requires Chrome/Chromium to be
+    # installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end
 
     # Crawls an entire website's HTML pages by recursively going through
-    # its internal `<a>` links
-    #
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
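For reference, a minimal usage sketch of the constructor options shown above; the values and URL are illustrative only, and JS parsing assumes Chrome/Chromium is installed and on $PATH:

require 'wgit'

# parse_javascript and parse_javascript_delay come from the initialize
# signature above; 2 seconds is an arbitrary example delay.
crawler = Wgit::Crawler.new(parse_javascript: true, parse_javascript_delay: 2)
crawler.timeout = 10 # Seconds, via the attr_accessor added in this release.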
@@ -82,26 +100,36 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     # It is recommended that this URL be the index page of the site to give a
     # greater chance of finding all pages within that site/host.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    # the crawl. This changes how a site is crawled. Only links pointing to
+    # the site domain are allowed. The `:default` is any `<a>` href returning
+    # HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    # selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     # A block is the only way to interact with each crawled Document.
     # Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
     # from all of the site's pages or nil if the given url could not be
     # crawled successfully.
-    def crawl_site(
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?
 
-
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
 
       crawled = Set.new([url, alt_url])
       externals = Set.new(doc.external_links)
-      internals = Set.new(
+      internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
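A hedged sketch of the reworked crawl_site call documented above; the site URL and glob are hypothetical:

require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://en.wikipedia.org/wiki/Ruby')

# Only crawl pages whose path matches the 'wiki/*' glob; each page is
# yielded as a Wgit::Document and the external links are returned.
externals = crawler.crawl_site(url, allow_paths: 'wiki/*') do |doc|
  puts doc.url unless doc.empty?
end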
@@ -116,7 +144,7 @@ module Wgit
           crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?
 
-          internals +=
+          internals += next_internal_links(doc, **link_opts)
           externals += doc.external_links
         end
       end
@@ -131,10 +159,11 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    # way to interact with them.
+    # way to interact with them. Use `doc.empty?` to determine if the page
+    # is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
     def crawl_urls(*urls, follow_redirects: true, &block)
@@ -143,7 +172,7 @@ module Wgit
       opts = { follow_redirects: follow_redirects }
       doc = nil
 
-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
 
       doc
     end
@@ -151,13 +180,15 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    # event of a redirect.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     # crawl was successful or not. Therefore, Document#url etc. can be used.
+    # Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     # crawl was unsuccessful.
     def crawl_url(url, follow_redirects: true)
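A small sketch of the follow_redirects: Symbol form described above (the URL is made up); a redirect that leaves the host raises internally and the crawl is reported as unsuccessful:

crawler = Wgit::Crawler.new

# Only follow redirects that stay within the same host.
doc = crawler.crawl_url(Wgit::Url.new('http://example.com'), follow_redirects: :host)
puts doc.title unless doc.nil? || doc.empty?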
@@ -175,16 +206,19 @@ module Wgit
 
     protected
 
-    # Returns the
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     # reference and gets modified as a result of the fetch/crawl.
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     # unsuccessful.
@@ -193,6 +227,8 @@ module Wgit
       raise "Invalid url: #{url}" if url.invalid?
 
       resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript
+
       response.body_or_nil
     rescue StandardError => e
       Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
@@ -214,14 +250,14 @@ module Wgit
     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
     # redirects. Pass a Symbol to limit where the redirect is allowed to go
     # e.g. :host only allows redirects within the same host. Choose from
-    # :
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-
+      origin = url.to_url.to_origin # Recorded before any redirects.
      follow_redirects, within = redirect?(follow_redirects)
 
       loop do
-
+        get_http_response(url, response)
         break unless response.redirect?
 
         # Handle response 'Location' header.
@@ -233,7 +269,7 @@ module Wgit
         # Validate if the redirect is allowed.
         raise "Redirect not allowed: #{location}" unless follow_redirects
 
-        if within && !location.relative?(within =>
+        if within && !location.relative?(within => origin)
           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end
 
@@ -241,7 +277,7 @@ module Wgit
         if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -254,7 +290,7 @@ module Wgit
     # reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -271,10 +307,40 @@ module Wgit
       response.add_total_time(http_response.total_time)
 
       # Log the request/response details.
-
+      log_net(:http, response, http_response.total_time)
+
+      # Handle a failed response.
+      raise "No response (within timeout: #{@timeout} second(s))" \
+      if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    # possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    # reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)
 
       # Handle a failed response.
-      raise "No response (within timeout: #{@
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
       if response.failure?
     end
 
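Whichever client (HTTP or browser) produced it, the enriched response is exposed via Crawler#last_response (the attr_reader shown earlier); a quick illustration with a made-up URL:

crawler = Wgit::Crawler.new
crawler.crawl_url(Wgit::Url.new('http://example.com'))

response = crawler.last_response
puts response.status # HTTP status code, set during the crawl above.
puts response.size   # Response body size in bytes, as logged by log_net.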
@@ -285,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -294,37 +360,58 @@ module Wgit
       }
 
       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end
 
     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site.
-    #
-    # `'wiki/\*'` etc. Note that each path should NOT start with a slash.
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    #
-    #
-    #
-    # files containing `<a>` links keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
     # internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    # links pointing to the doc.url domain are allowed. The :default is any
+    # <a> href returning HTML. The allow/disallow paths will be applied to
+    # the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
     # them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
     # them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def
-
-
-
-
-
-
-      Wgit::Crawler.supported_file_extensions.include?(ext.downcase) :
-      true # URLs without an extension are assumed HTML.
-    end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end
 
       return links if allow_paths.nil? && disallow_paths.nil?
 
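To steer a site crawl with the xpath behaviour above, the follow: param on crawl_site is the public entry point; a hedged sketch with a hypothetical selector (links outside the site's domain raise an error, as noted below):

crawler = Wgit::Crawler.new

# Follow only the <a> links inside a (hypothetical) navigation element.
crawler.crawl_site(Wgit::Url.new('http://example.com'), follow: "//nav//a/@href") do |doc|
  puts doc.url unless doc.empty?
end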
@@ -333,29 +420,40 @@ module Wgit
 
     private
 
-    # Returns
-    #
-
-
+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
 
-
-      raise
-      #{follow_redirects}"
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
       end
 
-
+      links
     end
 
-    #
-
-
-
-
-
-
-
-
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
     end
 
     # Validate and filter by the given URL paths.
@@ -375,14 +473,17 @@ module Wgit
 
     # Validate the paths are suitable for filtering.
     def validate_paths(paths)
-      paths =
+      paths = *paths
       raise 'The provided paths must all be Strings' \
       unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
-      paths
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end
 
     # Filters links by selecting/rejecting them based on their path.
@@ -390,7 +491,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint
+        link = link.to_endpoint.index? ? '/' : link.omit_base
 
         match = false
         paths.each do |pattern|
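The allow/disallow filtering above delegates to Ruby's File.fnmatch? glob matching against each link's path; a few illustrative checks and a hypothetical disallow example:

File.fnmatch?('wiki/*', 'wiki/Ruby') # => true
File.fnmatch?('wiki/*', 'about')     # => false

# Reject any login/* paths while crawling (site and paths are made up).
Wgit::Crawler.new.crawl_site(Wgit::Url.new('http://example.com'), disallow_paths: 'login/*') do |doc|
  # ...
end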
@@ -402,6 +503,35 @@ module Wgit
       end
     end
 
+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url
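Finally, the redirect? helper above simply maps the public follow_redirects value onto a [follow, within] pair; shown via send purely for illustration, since it is a private method:

crawler = Wgit::Crawler.new
crawler.send(:redirect?, :host) # => [true, :host]  follow, but only within the host
crawler.send(:redirect?, true)  # => [true, nil]    follow anywhere
crawler.send(:redirect?, false) # => [false, nil]   don't follow at all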