wgit 0.5.1 → 0.10.0

data/bin/wgit ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/env ruby
+
+ require 'wgit'
+
+ # Eval .wgit.rb file (if it exists somewhere).
+ def eval_wgit(filepath = nil)
+   puts 'Searching for .wgit.rb file in local and home directories...'
+
+   [filepath, Dir.pwd, Dir.home].each do |dir|
+     path = "#{dir}/.wgit.rb"
+     next unless File.exist?(path)
+
+     puts "Eval'ing #{path}"
+     puts 'Call `eval_wgit` after changes to re-eval the file'
+     eval(File.read(path))
+
+     break
+   end
+
+   nil
+ end
+
+ eval_wgit
+ puts "\n#{Wgit.version_str}\n\n"
+
+ # Use Pry if installed or fall back to IRB.
+ begin
+   require 'pry'
+   klass = Pry
+ rescue LoadError
+   require 'irb'
+   klass = IRB
+
+   puts "Starting IRB because Pry isn't installed."
+ end
+
+ klass.start
+
+ puts 'Interactive session complete.'
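The executable above evals a `.wgit.rb` file found in the current or home directory before dropping into the REPL. The file's contents are entirely up to the user; a hypothetical `~/.wgit.rb` might look like this (the `crawler` helper is purely illustrative):

```ruby
# ~/.wgit.rb - eval'd by the wgit executable before the REPL starts, so
# anything defined here is available inside the interactive session.
require 'wgit/core_ext' # Optional: adds String#to_url etc.

# A pre-configured crawler to reuse from the REPL (illustrative only).
def crawler
  @crawler ||= Wgit::Crawler.new(timeout: 10)
end
```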
data/lib/wgit.rb CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
  require_relative 'wgit/utils'
  require_relative 'wgit/url'
  require_relative 'wgit/document'
- require_relative 'wgit/document_extensions'
+ require_relative 'wgit/document_extractors'
  require_relative 'wgit/crawler'
  require_relative 'wgit/database/model'
  require_relative 'wgit/database/database'
  require_relative 'wgit/indexer'
+ require_relative 'wgit/dsl'
+ require_relative 'wgit/base'
  # require_relative 'wgit/core_ext' - Must be explicitly required.
data/lib/wgit/assertable.rb CHANGED
@@ -6,7 +6,7 @@ module Wgit
    # Default type fail message.
    DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
    # Wrong method message.
-   WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
+   NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
    # Default duck fail message.
    DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
    # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
    # @raise [StandardError] If the assertion fails.
    # @return [Object] The given arr on successful assertion.
    def assert_arr_types(arr, type_or_types, msg = nil)
-     raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+     raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)

      arr.each { |obj| assert_types(obj, type_or_types, msg) }
    end
@@ -56,7 +56,7 @@ present: %s"
    # @raise [StandardError] If the assertion fails.
    # @return [Object] The given obj_or_objs on successful assertion.
    def assert_respond_to(obj_or_objs, methods, msg = nil)
-     methods = [methods] unless methods.respond_to?(:all?)
+     methods = *methods

      if obj_or_objs.respond_to?(:each)
        obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
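To illustrate the renamed constant above: a class mixing in `Wgit::Assertable` (as `Wgit::Crawler` does) now gets an error message naming the offending class. A small sketch (`send` is used in case the assertion helpers aren't public):

```ruby
require 'wgit'

class TypeChecker
  include Wgit::Assertable
end

checker = TypeChecker.new
checker.send(:assert_arr_types, [1, 2, 3], Integer) # => [1, 2, 3]
checker.send(:assert_arr_types, 42, Integer)
# => raises StandardError: 'Expected an Enumerable responding to #each, not: Integer'
```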
data/lib/wgit/base.rb ADDED
@@ -0,0 +1,30 @@
+ module Wgit
+   # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+   # All subclasses must define a `#parse(doc, &block)` method.
+   class Base
+     extend Wgit::DSL
+
+     # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+     # block to the subclass's `#parse` method.
+     def self.run(&block)
+       obj = new
+       unless obj.respond_to?(:parse)
+         raise "#{obj.class} must respond_to? #parse(doc, &block)"
+       end
+
+       crawl_method = @method || :crawl
+       send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+       obj
+     end
+
+     # Sets the crawl/index method to call when `Base.run` is called.
+     # The mode method must match one defined in the `Wgit::Crawler` or
+     # `Wgit::Indexer` class.
+     #
+     # @param method [Symbol] The crawl/index method to call.
+     def self.mode(method)
+       @method = method
+     end
+   end
+ end
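A rough sketch of how `Wgit::Base` might be subclassed. The `start` call is an assumption about `Wgit::DSL` (not shown in this diff); `mode`, `run` and `#parse` come from the class above, and the URL is illustrative:

```ruby
require 'wgit'

class ExampleCrawler < Wgit::Base
  mode  :crawl_site                    # Base.run will call the DSL's #crawl_site.
  start 'http://quotes.toscrape.com/'  # Assumed DSL method declaring the seed URL.

  # Called by Base.run for every crawled Wgit::Document.
  def parse(doc, &_block)
    puts doc.url unless doc.empty?
  end
end

ExampleCrawler.run # => returns the ExampleCrawler instance
```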
data/lib/wgit/core_ext.rb CHANGED
@@ -11,7 +11,7 @@ class String
    #
    # @return [Wgit::Url] The converted URL.
    def to_url
-     Wgit::Url.new(self)
+     Wgit::Url.parse(self)
    end
  end
 
data/lib/wgit/crawler.rb CHANGED
@@ -5,26 +5,55 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require_relative 'response'
+ require 'set'
+ require 'benchmark'
  require 'typhoeus'
+ require 'ferrum'

  module Wgit
-   # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-   # serialising their HTML into Wgit::Document instances. This is the only Wgit
-   # class which contains network logic e.g. request/response handling.
+   # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+   # and serialising their HTML into `Wgit::Document` instances. This is the
+   # only Wgit class containing network logic (HTTP request/response handling).
    class Crawler
      include Assertable

+     # Set of supported file extensions for Wgit::Crawler#crawl_site.
+     @supported_file_extensions = Set.new(
+       %w[asp aspx cfm cgi htm html htmlx jsp php]
+     )
+
+     class << self
+       # The URL file extensions (from `<a>` hrefs) which will be crawled by
+       # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+       # doesn't keep the crawl of the site going. All URL's without a file
+       # extension will be crawled, because they're assumed to be HTML.
+       # The `#crawl` method will crawl anything since it's given the URL(s).
+       # You can add your own site's URL file extension e.g.
+       # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+       attr_reader :supported_file_extensions
+     end
+
      # The amount of allowed redirects before raising an error. Set to 0 to
-     # disable redirects completely.
+     # disable redirects completely; or you can pass `follow_redirects: false`
+     # to any Wgit::Crawler.crawl_* method.
      attr_accessor :redirect_limit

      # The maximum amount of time (in seconds) a crawl request has to complete
      # before raising an error. Set to 0 to disable time outs completely.
-     attr_accessor :time_out
+     attr_accessor :timeout
+
+     # Whether or not to UTF-8 encode the response body once crawled. Set to
+     # false if crawling more than just HTML e.g. images.
+     attr_accessor :encode

-     # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
-     # crawling more than just HTML e.g. images etc.
-     attr_accessor :encode_html
+     # Whether or not to parse the Javascript of the crawled document.
+     # Parsing requires Chrome/Chromium to be installed and in $PATH.
+     attr_accessor :parse_javascript
+
+     # The delay between checks in a page's HTML size. When the page has stopped
+     # "growing", the Javascript has finished dynamically updating the DOM.
+     # The value should balance between a good UX and enough JS parse time.
+     attr_accessor :parse_javascript_delay

      # The Wgit::Response of the most recently crawled URL.
      attr_reader :last_response
@@ -33,21 +62,32 @@ module Wgit
    #
    # @param redirect_limit [Integer] The amount of allowed redirects before
    #   raising an error. Set to 0 to disable redirects completely.
-   # @param time_out [Integer, Float] The maximum amount of time (in seconds)
+   # @param timeout [Integer, Float] The maximum amount of time (in seconds)
    #   a crawl request has to complete before raising an error. Set to 0 to
    #   disable time outs completely.
-   # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
-   #   crawled. Set to false if crawling more than just HTML e.g. images etc.
-   def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
-     @redirect_limit = redirect_limit
-     @time_out = time_out
-     @encode_html = encode_html
+   # @param encode [Boolean] Whether or not to UTF-8 encode the response body
+   #   once crawled. Set to false if crawling more than just HTML e.g. images.
+   # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+   #   of the crawled document. Parsing requires Chrome/Chromium to be
+   #   installed and in $PATH.
+   def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                  parse_javascript: false, parse_javascript_delay: 1)
+     @redirect_limit = redirect_limit
+     @timeout = timeout
+     @encode = encode
+     @parse_javascript = parse_javascript
+     @parse_javascript_delay = parse_javascript_delay
    end

    # Crawls an entire website's HTML pages by recursively going through
-   # its internal <a> links. Each crawled Document is yielded to a block. Use
-   # the allow and disallow paths params to partially and selectively crawl a
-   # site.
+   # its internal `<a>` links; this can be overridden with `follow: xpath`.
+   # Each crawled Document is yielded to a block. Use `doc.empty?` to
+   # determine if the crawled link was successful / is valid.
+   #
+   # Use the allow and disallow paths params to partially and selectively
+   # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
+   # Note that each path must NOT start with a slash; the only exception being
+   # a `/` on its own with no other characters, referring to the index page.
    #
    # Only redirects to the same host are followed. For example, the Url
    # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
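The renamed and newly added keyword arguments can be seen together below; a usage sketch based solely on the signature shown above:

```ruby
require 'wgit'

# time_out:/encode_html: from 0.5.1 are now timeout:/encode:.
crawler = Wgit::Crawler.new(
  redirect_limit: 3,        # Raise after 3 redirects.
  timeout: 10,              # Seconds before a crawl request errors.
  encode: true,             # UTF-8 encode the response body.
  parse_javascript: false,  # Needs Chrome/Chromium in $PATH when true.
  parse_javascript_delay: 1 # Seconds between DOM-size checks.
)
crawler.timeout = 20 # Each option is also exposed as an accessor.
```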
@@ -60,69 +100,79 @@ module Wgit
    # @param url [Wgit::Url] The base URL of the website to be crawled.
    #   It is recommended that this URL be the index page of the site to give a
    #   greater chance of finding all pages within that site/host.
-   # @param allow_paths [String, Array<String>] Filters links by selecting
-   #   them only if their path includes one of allow_paths.
-   # @param disallow_paths [String, Array<String>] Filters links by rejecting
-   #   them if their path includes one of disallow_paths.
+   # @param follow [String] The xpath extracting links to be followed during
+   #   the crawl. This changes how a site is crawled. Only links pointing to
+   #   the site domain are allowed. The `:default` is any `<a>` href returning
+   #   HTML.
+   # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+   #   selecting them if their path `File.fnmatch?` one of allow_paths.
+   # @param disallow_paths [String, Array<String>] Filters the `follow` links
+   #   by rejecting them if their path `File.fnmatch?` one of disallow_paths.
    # @yield [doc] Given each crawled page (Wgit::Document) of the site.
    #   A block is the only way to interact with each crawled Document.
+   #   Use `doc.empty?` to determine if the page is valid.
    # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-   #   from all of the site's pages or nil if the url could not be
+   #   from all of the site's pages or nil if the given url could not be
    #   crawled successfully.
-   def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
+   def crawl_site(
+     url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+   )
      doc = crawl_url(url, &block)
      return nil if doc.nil?

-     crawl_opts = { follow_external_redirects: false, host: url.to_base }
-     link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
-
+     link_opts = {
+       xpath: follow,
+       allow_paths: allow_paths,
+       disallow_paths: disallow_paths
+     }
      alt_url = url.end_with?('/') ? url.chop : url + '/'
-     crawled = [url, alt_url]
-     externals = doc.external_links
-     internals = get_internal_links(doc, link_opts)

-     return doc.external_links.uniq if internals.empty?
+     crawled = Set.new([url, alt_url])
+     externals = Set.new(doc.external_links)
+     internals = Set.new(next_internal_links(doc, **link_opts))

-     loop do
-       crawled.uniq!
-       internals.uniq!
+     return externals.to_a if internals.empty?

+     loop do
        links = internals - crawled
        break if links.empty?

        links.each do |link|
          orig_link = link.dup
-         doc = crawl_url(link, crawl_opts, &block)
+         doc = crawl_url(link, follow_redirects: :host, &block)

-         crawled.push(orig_link, link) # Push both in case of redirects.
+         crawled += [orig_link, link] # Push both links in case of redirects.
          next if doc.nil?

-         internals.concat(get_internal_links(doc, link_opts))
-         externals.concat(doc.external_links)
+         internals += next_internal_links(doc, **link_opts)
+         externals += doc.external_links
        end
      end

-     externals.uniq
+     externals.to_a
    end

    # Crawls one or more individual urls using Wgit::Crawler#crawl_url
    # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
    #
    # @param urls [*Wgit::Url] The Url's to crawl.
+   # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+   #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+   #   e.g. :host only allows redirects within the same host. Choose from
+   #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+   #   This value will be used for all urls crawled.
    # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-   #   way to interact with them.
+   #   way to interact with them. Use `doc.empty?` to determine if the page
+   #   is valid.
    # @raise [StandardError] If no urls are provided.
    # @return [Wgit::Document] The last Document crawled.
-   def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
+   def crawl_urls(*urls, follow_redirects: true, &block)
      raise 'You must provide at least one Url' if urls.empty?

-     opts = {
-       follow_external_redirects: follow_external_redirects,
-       host: host
-     }
+     opts = { follow_redirects: follow_redirects }
      doc = nil

-     Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+     Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }

      doc
    end
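For example, crawling a site while following a custom xpath and an `allow_paths:` glob might look like this (the URL, xpath and glob are all illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://example.com/')

# Follow only links matched by the xpath and only crawl paths matching the
# 'articles/*' glob (note: no leading slash).
externals = crawler.crawl_site(
  url,
  follow: "//a[@class='next']/@href",
  allow_paths: 'articles/*'
) do |doc|
  puts doc.url unless doc.empty?
end

puts "Collected #{externals&.size || 0} external links."
```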
@@ -130,34 +180,25 @@ module Wgit
    # Crawl the url returning the response Wgit::Document or nil, if an error
    # occurs.
    #
-   # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
-   # @param follow_external_redirects [Boolean] Whether or not to follow
-   #   an external redirect. External meaning to a different host. False will
-   #   return nil for such a crawl. If false, you must also provide a `host:`
-   #   parameter.
-   # @param host [Wgit::Url, String] Specify the host by which
-   #   an absolute redirect is determined to be internal or not. Must be
-   #   absolute and contain a protocol prefix. For example, a `host:` of
-   #   'http://www.example.com' will only allow redirects for Url's with a
-   #   `to_host` value of 'www.example.com'.
+   # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+   #   event of a redirect.
+   # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+   #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+   #   e.g. :host only allows redirects within the same host. Choose from
+   #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
    # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
    #   crawl was successful or not. Therefore, Document#url etc. can be used.
+   #   Use `doc.empty?` to determine if the page is valid.
    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
    #   crawl was unsuccessful.
-   def crawl_url(url, follow_external_redirects: true, host: nil)
+   def crawl_url(url, follow_redirects: true)
      # A String url isn't allowed because it's passed by value not reference,
      # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
      assert_type(url, Wgit::Url)
-     raise 'host cannot be nil if follow_external_redirects is false' \
-       if !follow_external_redirects && host.nil?

-     html = fetch(
-       url,
-       follow_external_redirects: follow_external_redirects,
-       host: host
-     )
+     html = fetch(url, follow_redirects: follow_redirects)
+     doc = Wgit::Document.new(url, html, encode: @encode)

-     doc = Wgit::Document.new(url, html, encode_html: @encode_html)
      yield(doc) if block_given?

      doc.empty? ? nil : doc
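A sketch of the reworked redirect handling (the URL is illustrative):

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('http://example.com')

# Only follow redirects that stay within the same host; :origin, :domain and
# :brand are the other Symbol options, or pass true/false.
doc = crawler.crawl_url(url, follow_redirects: :host) do |d|
  puts "Crawled #{d.url} - #{d.empty? ? 'empty/invalid' : 'valid'} page"
end

puts crawler.last_response.status
```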
@@ -165,31 +206,28 @@ module Wgit
 
    protected

-   # Returns the url HTML String or nil. Handles any errors that arise
+   # Returns the URL's HTML String or nil. Handles any errors that arise
    # and sets the @last_response. Errors or any HTTP response that doesn't
    # return a HTML body will be ignored, returning nil.
    #
+   # If @parse_javascript is true, then the final resolved URL will be browsed
+   # to and Javascript parsed allowing for dynamic HTML generation.
+   #
    # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
    #   reference and gets modified as a result of the fetch/crawl.
-   # @param follow_external_redirects [Boolean] Whether or not to follow
-   #   an external redirect. False will return nil for such a crawl. If false,
-   #   you must also provide a `host:` parameter.
-   # @param host [Wgit::Url, String] Specify the host by which
-   #   an absolute redirect is determined to be internal or not. Must be
-   #   absolute and contain a protocol prefix. For example, a `host:` of
-   #   'http://www.example.com' will only allow redirects for Urls with a
-   #   `to_host` value of 'www.example.com'.
+   # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+   #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+   #   e.g. :host only allows redirects within the same host. Choose from
+   #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+   # @raise [StandardError] If url isn't valid and absolute.
    # @return [String, nil] The crawled HTML or nil if the crawl was
    #   unsuccessful.
-   def fetch(url, follow_external_redirects: true, host: nil)
+   def fetch(url, follow_redirects: true)
      response = Wgit::Response.new
+     raise "Invalid url: #{url}" if url.invalid?

-     resolve(
-       url,
-       response,
-       follow_external_redirects: follow_external_redirects,
-       host: host
-     )
+     resolve(url, response, follow_redirects: follow_redirects)
+     get_browser_response(url, response) if @parse_javascript

      response.body_or_nil
    rescue StandardError => e
@@ -209,18 +247,17 @@ module Wgit
    # @param url [Wgit::Url] The URL to GET and resolve.
    # @param response [Wgit::Response] The response to enrich. Modifies by
    #   reference.
-   # @param follow_external_redirects [Boolean] Whether or not to follow
-   #   an external redirect. If false, you must also provide a `host:`
-   #   parameter.
-   # @param host [Wgit::Url, String] Specify the host by which
-   #   an absolute redirect is determined to be internal or not. Must be
-   #   absolute and contain a protocol prefix. For example, a `host:` of
-   #   'http://www.example.com' will only allow redirects for Urls with a
-   #   `to_host` value of 'www.example.com'.
+   # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+   #   redirects. Pass a Symbol to limit where the redirect is allowed to go
+   #   e.g. :host only allows redirects within the same host. Choose from
+   #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
    # @raise [StandardError] If a redirect isn't allowed etc.
-   def resolve(url, response, follow_external_redirects: true, host: nil)
+   def resolve(url, response, follow_redirects: true)
+     origin = url.to_url.to_origin # Recorded before any redirects.
+     follow_redirects, within = redirect?(follow_redirects)
+
      loop do
-       get_response(url, response)
+       get_http_response(url, response)
        break unless response.redirect?

        # Handle response 'Location' header.
@@ -229,17 +266,18 @@ module Wgit
 
 
        yield(url, response, location) if block_given?

-       # Validate redirect.
-       if !follow_external_redirects && !location.relative?(host: host)
-         raise "External redirect not allowed - Redirected to: \
-         '#{location}', which is outside of host: '#{host}'"
+       # Validate if the redirect is allowed.
+       raise "Redirect not allowed: #{location}" unless follow_redirects
+
+       if within && !location.relative?(within => origin)
+         raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
        end

        raise "Too many redirects, exceeded: #{@redirect_limit}" \
          if response.redirect_count >= @redirect_limit

        # Process the location to be crawled next.
-       location = url.to_base.concat(location) if location.relative?
+       location = url.to_origin.concat(location) if location.relative?
        response.redirections[url.to_s] = location.to_s
        url.replace(location) # Update the url on redirect.
      end
@@ -252,7 +290,7 @@ module Wgit
    #   reference.
    # @raise [StandardError] If a response can't be obtained.
    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-   def get_response(url, response)
+   def get_http_response(url, response)
      # Perform a HTTP GET request.
      orig_url = url.to_s
      url = url.normalize if url.respond_to?(:normalize)
@@ -268,18 +306,41 @@ module Wgit
      response.ip_address = http_response.primary_ip
      response.add_total_time(http_response.total_time)

-     # Log (debug) the request/response details.
-     resp_template = '[http] Response: %s (%s bytes in %s seconds)'
-     log_status = (response.status || 0)
-     log_total_time = response.total_time.truncate(3)
+     # Log the request/response details.
+     log_net(:http, response, http_response.total_time)

-     Wgit.logger.debug("[http] Request: #{response.url}")
-     Wgit.logger.debug(
-       format(resp_template, log_status, response.size, log_total_time)
-     )
+     # Handle a failed response.
+     raise "No response (within timeout: #{@timeout} second(s))" \
+       if response.failure?
+   end
+
+   # Makes a browser request and enriches the given Wgit::Response from it.
+   #
+   # @param url [String] The url to browse to. Will call url#normalize if
+   #   possible.
+   # @param response [Wgit::Response] The response to enrich. Modifies by
+   #   reference.
+   # @raise [StandardError] If a response can't be obtained.
+   # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+   def get_browser_response(url, response)
+     url = url.normalize if url.respond_to?(:normalize)
+     browser = nil
+
+     crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+     yield browser if block_given?
+
+     # Enrich the given Wgit::Response object (on top of Typhoeus response).
+     response.adapter_response = browser.network.response
+     response.status = browser.network.response.status
+     response.headers = browser.network.response.headers
+     response.body = browser.body
+     response.add_total_time(crawl_time)
+
+     # Log the request/response details.
+     log_net(:browser, response, crawl_time)

      # Handle a failed response.
-     raise "No response (within timeout: #{@time_out} second(s))" \
+     raise "No browser response (within timeout: #{@timeout} second(s))" \
        if response.failure?
    end
 
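The browser-based crawl above is switched on via the `parse_javascript` accessors; enabling it might look like the sketch below (assumes Chrome/Chromium is installed and in $PATH, per the comments earlier in the diff; the URL is illustrative):

```ruby
require 'wgit'

# Render Javascript (e.g. SPA pages) via Ferrum/Chrome before serialising.
crawler = Wgit::Crawler.new(parse_javascript: true, parse_javascript_delay: 2)

crawler.crawl_url(Wgit::Url.new('https://example.com')) do |doc|
  puts "Javascript parsed page is #{doc.empty? ? 'empty' : 'valid'}"
end
```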
@@ -290,7 +351,7 @@ module Wgit
    def http_get(url)
      opts = {
        followlocation: false,
-       timeout: @time_out,
+       timeout: @timeout,
        accept_encoding: 'gzip',
        headers: {
          'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -299,34 +360,58 @@ module Wgit
      }

      # See https://rubydoc.info/gems/typhoeus for more info.
-     Typhoeus.get(url, opts)
+     Typhoeus.get(url, **opts)
+   end
+
+   # Performs a HTTP GET request in a web browser and parses the response JS
+   # before returning the HTML body of the fully rendered webpage. This allows
+   # Javascript (SPA apps etc.) to generate HTML dynamically.
+   #
+   # @param url [String] The url to browse to.
+   # @return [Ferrum::Browser] The browser response object.
+   def browser_get(url)
+     @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+     @browser.goto(url)
+
+     # Wait for the page's JS to finish dynamically manipulating the DOM.
+     html = @browser.body
+     loop do
+       sleep @parse_javascript_delay
+       break if html.size == @browser.body.size
+
+       html = @browser.body
+     end
+
+     @browser
    end

    # Returns a doc's internal HTML page links in absolute form; used when
-   # crawling a site. Use the allow and disallow paths params to partially
-   # and selectively crawl a site.
+   # crawling a site. By default, any `<a>` href returning HTML is returned;
+   # override this with `xpath:` if desired.
    #
-   # Override this method in a subclass to change how a site
-   # is crawled; not what is extracted from each page (Document extensions
-   # should be used for this purpose instead). Just remember that only HTML
-   # files containing <a> links can keep the crawl going beyond the base URL.
+   # Use the allow and disallow paths params to partially and selectively
+   # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+   # that each path should NOT start with a slash.
    #
    # @param doc [Wgit::Document] The document from which to extract its
-   #   internal page links.
+   #   internal (absolute) page links.
+   # @param xpath [String] The xpath selecting links to be returned. Only
+   #   links pointing to the doc.url domain are allowed. The :default is any
+   #   <a> href returning HTML. The allow/disallow paths will be applied to
+   #   the returned value.
    # @param allow_paths [String, Array<String>] Filters links by selecting
-   #   them only if their path includes one of allow_paths.
+   #   them if their path `File.fnmatch?` one of allow_paths.
    # @param disallow_paths [String, Array<String>] Filters links by rejecting
-   #   them if their path includes one of disallow_paths.
+   #   them if their path `File.fnmatch?` one of disallow_paths.
    # @return [Array<Wgit::Url>] The internal page links from doc.
-   def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
-     links = doc
-             .internal_absolute_links
-             .map(&:omit_fragment) # Because fragments don't alter content.
-             .uniq
-             .reject do |link|
-               ext = link.to_extension
-               ext ? !%w[htm html].include?(ext.downcase) : false
-             end
+   def next_internal_links(
+     doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+   )
+     links = if xpath && xpath != :default
+               follow_xpath(doc, xpath)
+             else
+               follow_default(doc)
+             end

      return links if allow_paths.nil? && disallow_paths.nil?
 
@@ -335,40 +420,82 @@ module Wgit
 
    private

+   # Returns the next links used to continue crawling a site. The xpath value
+   # is used to obtain the links. Any valid URL Strings will be converted into
+   # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+   # pointing to the site domain will raise an error.
+   def follow_xpath(doc, xpath)
+     links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+       urls
+         .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+         .compact
+     end
+
+     if links.any? { |link| link.to_domain != doc.url.to_domain }
+       raise 'The links to follow must be within the site domain'
+     end
+
+     links
+   end
+
+   # Returns the default set of links used to continue crawling a site.
+   # By default, any <a> href returning HTML and pointing to the same domain
+   # will get returned.
+   def follow_default(doc)
+     doc
+       .internal_absolute_links
+       .map(&:omit_fragment) # Because fragments don't alter content.
+       .uniq
+       .select do |link| # Whitelist only HTML content.
+         ext = link.to_extension
+         if ext
+           Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+         else
+           true # URLs without an extension are assumed HTML.
+         end
+       end
+   end
+
    # Validate and filter by the given URL paths.
    def process_paths(links, allow_paths, disallow_paths)
-     raise "You can't provide both allow_paths: and disallow_paths: params" \
-       if allow_paths && disallow_paths
-
-     if allow_paths # White list.
-       filter_method = :select
-       paths = allow_paths
-     else # Black list.
-       filter_method = :reject
-       paths = disallow_paths
+     if allow_paths
+       paths = validate_paths(allow_paths)
+       filter_links(links, :select!, paths)
      end

-     paths = [paths] unless paths.is_a?(Array)
-     paths = paths
-             .compact
-             .reject(&:empty?)
-             .uniq
-             .map { |path| Wgit::Url.new(path).to_path }
+     if disallow_paths
+       paths = validate_paths(disallow_paths)
+       filter_links(links, :reject!, paths)
+     end

+     links
+   end
+
+   # Validate the paths are suitable for filtering.
+   def validate_paths(paths)
+     paths = *paths
+     raise 'The provided paths must all be Strings' \
+       unless paths.all? { |path| path.is_a?(String) }
+
+     Wgit::Utils.sanitize(paths, encode: false)
      raise 'The provided paths cannot be empty' if paths.empty?

-     filter_links_by_path(links, filter_method, paths)
+     paths.map do |path|
+       path = Wgit::Url.parse(path)
+       path.index? ? path : path.omit_slashes
+     end
    end

-   # Filters links by selecting or rejecting them based on their path.
-   def filter_links_by_path(links, filter_method, paths)
+   # Filters links by selecting/rejecting them based on their path.
+   # Uses File.fnmatch? so that globbing is supported.
+   def filter_links(links, filter_method, paths)
      links.send(filter_method) do |link|
-       link_path = link.to_path
-       next(false) unless link_path
+       # Turn http://example.com into / meaning index.
+       link = link.to_endpoint.index? ? '/' : link.omit_base

        match = false
-       paths.each do |path|
-         match = link_path.start_with?(path)
+       paths.each do |pattern|
+         match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
          break if match
        end
 
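Because the filtering now uses `File.fnmatch?`, the allow/disallow paths behave as globs; a few plain-Ruby examples of how patterns match (values illustrative):

```ruby
# How File.fnmatch? treats some illustrative patterns and link paths.
File.fnmatch?('wiki/*', 'wiki/Ruby', File::FNM_EXTGLOB)           # => true
File.fnmatch?('wiki/*', 'about', File::FNM_EXTGLOB)               # => false
File.fnmatch?('/', '/', File::FNM_EXTGLOB)                        # => true (the index page)
File.fnmatch?('{posts,articles}/*', 'posts/1', File::FNM_EXTGLOB) # => true (brace glob)
```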
@@ -376,6 +503,35 @@ module Wgit
      end
    end

+   # Returns whether or not to follow redirects, and within what context e.g.
+   # :host, :domain etc.
+   def redirect?(follow_redirects)
+     return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+     unless [true, false].include?(follow_redirects)
+       raise "follow_redirects: must be a Boolean or Symbol, not: \
+       #{follow_redirects}"
+     end
+
+     [follow_redirects, nil]
+   end
+
+   # Log (at debug level) the network request/response details.
+   def log_net(client, response, duration)
+     resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+     log_status = (response.status || 0)
+     log_total_time = (duration || 0.0).truncate(3)
+
+     # The browser's request URL is the same so ignore it.
+     if client.to_sym == :http
+       Wgit.logger.debug("[#{client}] Request: #{response.url}")
+     end
+
+     Wgit.logger.debug(
+       format(resp_template, log_status, response.size, log_total_time)
+     )
+   end
+
    alias crawl crawl_urls
    alias crawl_pages crawl_urls
    alias crawl_page crawl_url