wgit 0.5.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
+ #!/usr/bin/env ruby
+
+ require 'wgit'
+
+ # Eval .wgit.rb file (if it exists somewhere).
+ def eval_wgit(filepath = nil)
+   puts 'Searching for .wgit.rb file in local and home directories...'
+
+   [filepath, Dir.pwd, Dir.home].each do |dir|
+     path = "#{dir}/.wgit.rb"
+     next unless File.exist?(path)
+
+     puts "Eval'ing #{path}"
+     puts 'Call `eval_wgit` after changes to re-eval the file'
+     eval(File.read(path))
+
+     break
+   end
+
+   nil
+ end
+
+ eval_wgit
+ puts "\n#{Wgit.version_str}\n\n"
+
+ # Use Pry if installed or fall back to IRB.
+ begin
+   require 'pry'
+   klass = Pry
+ rescue LoadError
+   require 'irb'
+   klass = IRB
+
+   puts "Starting IRB because Pry isn't installed."
+ end
+
+ klass.start
+
+ puts 'Interactive session complete.'
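
The new executable above drops into an interactive console (Pry if available, IRB otherwise), eval'ing a .wgit.rb file found at the given path, the working directory, or the home directory. A minimal sketch of such a file might look like the following; the URL and helper name are purely illustrative:

    # ~/.wgit.rb - any valid Ruby, eval'd by the console script before the session starts.
    require 'wgit/core_ext' # Optional: adds String#to_url etc.

    # A hypothetical helper that becomes available inside the console session.
    def quick_crawl(url = 'https://example.com')
      Wgit::Crawler.new.crawl(url.to_url) { |doc| puts doc.title }
    end
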
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
  require_relative 'wgit/utils'
  require_relative 'wgit/url'
  require_relative 'wgit/document'
- require_relative 'wgit/document_extensions'
+ require_relative 'wgit/document_extractors'
  require_relative 'wgit/crawler'
  require_relative 'wgit/database/model'
  require_relative 'wgit/database/database'
  require_relative 'wgit/indexer'
+ require_relative 'wgit/dsl'
+ require_relative 'wgit/base'
  # require_relative 'wgit/core_ext' - Must be explicitly required.
@@ -6,7 +6,7 @@ module Wgit
      # Default type fail message.
      DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
      # Wrong method message.
-     WRONG_METHOD_MSG = 'arr must be Enumerable, use a different method'
+     NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
      # Default duck fail message.
      DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
      # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
      # @raise [StandardError] If the assertion fails.
      # @return [Object] The given arr on successful assertion.
      def assert_arr_types(arr, type_or_types, msg = nil)
-       raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
+       raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)

        arr.each { |obj| assert_types(obj, type_or_types, msg) }
      end
@@ -56,7 +56,7 @@ present: %s"
      # @raise [StandardError] If the assertion fails.
      # @return [Object] The given obj_or_objs on successful assertion.
      def assert_respond_to(obj_or_objs, methods, msg = nil)
-       methods = [methods] unless methods.respond_to?(:all?)
+       methods = *methods

        if obj_or_objs.respond_to?(:each)
          obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
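
For context, the new `methods = *methods` splat simply normalises a lone Symbol into an Array (an Array passes through unchanged), replacing the old respond_to?(:all?) check. For example:

    methods = *:upcase            # => [:upcase]
    methods = *[:upcase, :strip]  # => [:upcase, :strip]
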
@@ -0,0 +1,30 @@
+ module Wgit
+   # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+   # All subclasses must define a `#parse(doc, &block)` method.
+   class Base
+     extend Wgit::DSL
+
+     # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+     # block to the subclass's `#parse` method.
+     def self.run(&block)
+       obj = new
+       unless obj.respond_to?(:parse)
+         raise "#{obj.class} must respond_to? #parse(doc, &block)"
+       end
+
+       crawl_method = @method || :crawl
+       send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+       obj
+     end
+
+     # Sets the crawl/index method to call when `Base.run` is called.
+     # The mode method must match one defined in the `Wgit::Crawler` or
+     # `Wgit::Indexer` class.
+     #
+     # @param method [Symbol] The crawl/index method to call.
+     def self.mode(method)
+       @method = method
+     end
+   end
+ end
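
For context, the new Wgit::Base class above is meant to be subclassed: run instantiates the subclass, requires it to implement #parse(doc, &block), and then sends the configured crawl/index method (defaulting to :crawl). A rough usage sketch follows; the start call is an assumption based on Wgit::DSL (it isn't shown in this diff) and the URL is illustrative:

    require 'wgit'

    class QuoteCrawler < Wgit::Base
      start 'http://quotes.toscrape.com' # Assumed Wgit::DSL method; not part of this diff.
      mode  :crawl_site                  # Must name a Wgit::Crawler/Wgit::Indexer method.

      # Called by Base.run with each crawled Wgit::Document.
      def parse(doc, &_block)
        puts doc.url unless doc.empty?
      end
    end

    QuoteCrawler.run
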
@@ -11,7 +11,7 @@ class String
    #
    # @return [Wgit::Url] The converted URL.
    def to_url
-     Wgit::Url.new(self)
+     Wgit::Url.parse(self)
    end
  end

@@ -5,26 +5,55 @@ require_relative 'document'
  require_relative 'utils'
  require_relative 'assertable'
  require_relative 'response'
+ require 'set'
+ require 'benchmark'
  require 'typhoeus'
+ require 'ferrum'

  module Wgit
-   # The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
-   # serialising their HTML into Wgit::Document instances. This is the only Wgit
-   # class which contains network logic e.g. request/response handling.
+   # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+   # and serialising their HTML into `Wgit::Document` instances. This is the
+   # only Wgit class containing network logic (HTTP request/response handling).
    class Crawler
      include Assertable

+     # Set of supported file extensions for Wgit::Crawler#crawl_site.
+     @supported_file_extensions = Set.new(
+       %w[asp aspx cfm cgi htm html htmlx jsp php]
+     )
+
+     class << self
+       # The URL file extensions (from `<a>` hrefs) which will be crawled by
+       # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+       # doesn't keep the crawl of the site going. All URL's without a file
+       # extension will be crawled, because they're assumed to be HTML.
+       # The `#crawl` method will crawl anything since it's given the URL(s).
+       # You can add your own site's URL file extension e.g.
+       # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+       attr_reader :supported_file_extensions
+     end
+
      # The amount of allowed redirects before raising an error. Set to 0 to
-     # disable redirects completely.
+     # disable redirects completely; or you can pass `follow_redirects: false`
+     # to any Wgit::Crawler.crawl_* method.
      attr_accessor :redirect_limit

      # The maximum amount of time (in seconds) a crawl request has to complete
      # before raising an error. Set to 0 to disable time outs completely.
-     attr_accessor :time_out
+     attr_accessor :timeout
+
+     # Whether or not to UTF-8 encode the response body once crawled. Set to
+     # false if crawling more than just HTML e.g. images.
+     attr_accessor :encode

-     # Whether or not to UTF-8 encode the HTML once crawled. Set to false if
-     # crawling more than just HTML e.g. images etc.
-     attr_accessor :encode_html
+     # Whether or not to parse the Javascript of the crawled document.
+     # Parsing requires Chrome/Chromium to be installed and in $PATH.
+     attr_accessor :parse_javascript
+
+     # The delay between checks in a page's HTML size. When the page has stopped
+     # "growing", the Javascript has finished dynamically updating the DOM.
+     # The value should balance between a good UX and enough JS parse time.
+     attr_accessor :parse_javascript_delay

      # The Wgit::Response of the most recently crawled URL.
      attr_reader :last_response
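
Since the supported extensions are now exposed as a class-level Set, a site using a non-standard page extension can opt in before crawling, exactly as the doc comment above suggests:

    Wgit::Crawler.supported_file_extensions << 'html5'
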
@@ -33,21 +62,32 @@ module Wgit
      #
      # @param redirect_limit [Integer] The amount of allowed redirects before
      # raising an error. Set to 0 to disable redirects completely.
-     # @param time_out [Integer, Float] The maximum amount of time (in seconds)
+     # @param timeout [Integer, Float] The maximum amount of time (in seconds)
      # a crawl request has to complete before raising an error. Set to 0 to
      # disable time outs completely.
-     # @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
-     # crawled. Set to false if crawling more than just HTML e.g. images etc.
-     def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
-       @redirect_limit = redirect_limit
-       @time_out = time_out
-       @encode_html = encode_html
+     # @param encode [Boolean] Whether or not to UTF-8 encode the response body
+     # once crawled. Set to false if crawling more than just HTML e.g. images.
+     # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+     # of the crawled document. Parsing requires Chrome/Chromium to be
+     # installed and in $PATH.
+     def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                    parse_javascript: false, parse_javascript_delay: 1)
+       @redirect_limit = redirect_limit
+       @timeout = timeout
+       @encode = encode
+       @parse_javascript = parse_javascript
+       @parse_javascript_delay = parse_javascript_delay
      end

      # Crawls an entire website's HTML pages by recursively going through
-     # its internal <a> links. Each crawled Document is yielded to a block. Use
-     # the allow and disallow paths params to partially and selectively crawl a
-     # site.
+     # its internal `<a>` links; this can be overridden with `follow: xpath`.
+     # Each crawled Document is yielded to a block. Use `doc.empty?` to
+     # determine if the crawled link was successful / is valid.
+     #
+     # Use the allow and disallow paths params to partially and selectively
+     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
+     # Note that each path must NOT start with a slash; the only exception being
+     # a `/` on its own with no other characters, referring to the index page.
      #
      # Only redirects to the same host are followed. For example, the Url
      # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -60,69 +100,79 @@ module Wgit
      # @param url [Wgit::Url] The base URL of the website to be crawled.
      # It is recommended that this URL be the index page of the site to give a
      # greater chance of finding all pages within that site/host.
-     # @param allow_paths [String, Array<String>] Filters links by selecting
-     # them only if their path includes one of allow_paths.
-     # @param disallow_paths [String, Array<String>] Filters links by rejecting
-     # them if their path includes one of disallow_paths.
+     # @param follow [String] The xpath extracting links to be followed during
+     # the crawl. This changes how a site is crawled. Only links pointing to
+     # the site domain are allowed. The `:default` is any `<a>` href returning
+     # HTML.
+     # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+     # selecting them if their path `File.fnmatch?` one of allow_paths.
+     # @param disallow_paths [String, Array<String>] Filters the `follow` links
+     # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
      # @yield [doc] Given each crawled page (Wgit::Document) of the site.
      # A block is the only way to interact with each crawled Document.
+     # Use `doc.empty?` to determine if the page is valid.
      # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-     # from all of the site's pages or nil if the url could not be
+     # from all of the site's pages or nil if the given url could not be
      # crawled successfully.
-     def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
+     def crawl_site(
+       url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+     )
        doc = crawl_url(url, &block)
        return nil if doc.nil?

-       crawl_opts = { follow_external_redirects: false, host: url.to_base }
-       link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
-
+       link_opts = {
+         xpath: follow,
+         allow_paths: allow_paths,
+         disallow_paths: disallow_paths
+       }
        alt_url = url.end_with?('/') ? url.chop : url + '/'
-       crawled = [url, alt_url]
-       externals = doc.external_links
-       internals = get_internal_links(doc, link_opts)

-       return doc.external_links.uniq if internals.empty?
+       crawled = Set.new([url, alt_url])
+       externals = Set.new(doc.external_links)
+       internals = Set.new(next_internal_links(doc, **link_opts))

-       loop do
-         crawled.uniq!
-         internals.uniq!
+       return externals.to_a if internals.empty?

+       loop do
          links = internals - crawled
          break if links.empty?

          links.each do |link|
            orig_link = link.dup
-           doc = crawl_url(link, crawl_opts, &block)
+           doc = crawl_url(link, follow_redirects: :host, &block)

-           crawled.push(orig_link, link) # Push both in case of redirects.
+           crawled += [orig_link, link] # Push both links in case of redirects.
            next if doc.nil?

-           internals.concat(get_internal_links(doc, link_opts))
-           externals.concat(doc.external_links)
+           internals += next_internal_links(doc, **link_opts)
+           externals += doc.external_links
          end
        end

-       externals.uniq
+       externals.to_a
      end

      # Crawls one or more individual urls using Wgit::Crawler#crawl_url
      # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
      #
      # @param urls [*Wgit::Url] The Url's to crawl.
+     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+     # redirects. Pass a Symbol to limit where the redirect is allowed to go
+     # e.g. :host only allows redirects within the same host. Choose from
+     # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+     # This value will be used for all urls crawled.
      # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-     # way to interact with them.
+     # way to interact with them. Use `doc.empty?` to determine if the page
+     # is valid.
      # @raise [StandardError] If no urls are provided.
      # @return [Wgit::Document] The last Document crawled.
-     def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
+     def crawl_urls(*urls, follow_redirects: true, &block)
        raise 'You must provide at least one Url' if urls.empty?

-       opts = {
-         follow_external_redirects: follow_external_redirects,
-         host: host
-       }
+       opts = { follow_redirects: follow_redirects }
        doc = nil

-       Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+       Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }

        doc
      end
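
Putting the renamed options together, a crawl under the new API might look like this; the site URL and glob paths are illustrative only:

    require 'wgit'

    crawler = Wgit::Crawler.new(redirect_limit: 3, timeout: 10, encode: true)

    # Crawl a whole site, keeping only wiki pages (glob paths, no leading slash).
    external_urls = crawler.crawl_site(
      Wgit::Url.new('https://example.com/'),
      allow_paths: 'wiki/*',
      disallow_paths: 'wiki/Special:*'
    ) do |doc|
      puts doc.url unless doc.empty?
    end

    # Crawl individual pages, only following redirects within the same host.
    crawler.crawl(Wgit::Url.new('https://example.com'), follow_redirects: :host) do |doc|
      puts doc.title
    end

Because the path filters go through File.fnmatch? with File::FNM_EXTGLOB (see the filtering code later in this diff), the allow/disallow values behave like shell globs matched against each link's path:

    File.fnmatch?('wiki/*', 'wiki/Ruby', File::FNM_EXTGLOB)       # => true
    File.fnmatch?('wiki/*', 'blog/Ruby', File::FNM_EXTGLOB)       # => false
    File.fnmatch?('{about,contact}', 'about', File::FNM_EXTGLOB)  # => true (brace patterns need FNM_EXTGLOB)
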
@@ -130,34 +180,25 @@ module Wgit
      # Crawl the url returning the response Wgit::Document or nil, if an error
      # occurs.
      #
-     # @param url [Wgit::Url] The Url to crawl; which will likely be modified.
-     # @param follow_external_redirects [Boolean] Whether or not to follow
-     # an external redirect. External meaning to a different host. False will
-     # return nil for such a crawl. If false, you must also provide a `host:`
-     # parameter.
-     # @param host [Wgit::Url, String] Specify the host by which
-     # an absolute redirect is determined to be internal or not. Must be
-     # absolute and contain a protocol prefix. For example, a `host:` of
-     # 'http://www.example.com' will only allow redirects for Url's with a
-     # `to_host` value of 'www.example.com'.
+     # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+     # event of a redirect.
+     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+     # redirects. Pass a Symbol to limit where the redirect is allowed to go
+     # e.g. :host only allows redirects within the same host. Choose from
+     # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
      # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
      # crawl was successful or not. Therefore, Document#url etc. can be used.
+     # Use `doc.empty?` to determine if the page is valid.
      # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
      # crawl was unsuccessful.
-     def crawl_url(url, follow_external_redirects: true, host: nil)
+     def crawl_url(url, follow_redirects: true)
        # A String url isn't allowed because it's passed by value not reference,
        # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
        assert_type(url, Wgit::Url)
-       raise 'host cannot be nil if follow_external_redirects is false' \
-         if !follow_external_redirects && host.nil?

-       html = fetch(
-         url,
-         follow_external_redirects: follow_external_redirects,
-         host: host
-       )
+       html = fetch(url, follow_redirects: follow_redirects)
+       doc = Wgit::Document.new(url, html, encode: @encode)

-       doc = Wgit::Document.new(url, html, encode_html: @encode_html)
        yield(doc) if block_given?

        doc.empty? ? nil : doc
@@ -165,31 +206,28 @@ module Wgit

      protected

-     # Returns the url HTML String or nil. Handles any errors that arise
+     # Returns the URL's HTML String or nil. Handles any errors that arise
      # and sets the @last_response. Errors or any HTTP response that doesn't
      # return a HTML body will be ignored, returning nil.
      #
+     # If @parse_javascript is true, then the final resolved URL will be browsed
+     # to and Javascript parsed allowing for dynamic HTML generation.
+     #
      # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
      # reference and gets modified as a result of the fetch/crawl.
-     # @param follow_external_redirects [Boolean] Whether or not to follow
-     # an external redirect. False will return nil for such a crawl. If false,
-     # you must also provide a `host:` parameter.
-     # @param host [Wgit::Url, String] Specify the host by which
-     # an absolute redirect is determined to be internal or not. Must be
-     # absolute and contain a protocol prefix. For example, a `host:` of
-     # 'http://www.example.com' will only allow redirects for Urls with a
-     # `to_host` value of 'www.example.com'.
+     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+     # redirects. Pass a Symbol to limit where the redirect is allowed to go
+     # e.g. :host only allows redirects within the same host. Choose from
+     # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+     # @raise [StandardError] If url isn't valid and absolute.
      # @return [String, nil] The crawled HTML or nil if the crawl was
      # unsuccessful.
-     def fetch(url, follow_external_redirects: true, host: nil)
+     def fetch(url, follow_redirects: true)
        response = Wgit::Response.new
+       raise "Invalid url: #{url}" if url.invalid?

-       resolve(
-         url,
-         response,
-         follow_external_redirects: follow_external_redirects,
-         host: host
-       )
+       resolve(url, response, follow_redirects: follow_redirects)
+       get_browser_response(url, response) if @parse_javascript

        response.body_or_nil
      rescue StandardError => e
@@ -209,18 +247,17 @@ module Wgit
      # @param url [Wgit::Url] The URL to GET and resolve.
      # @param response [Wgit::Response] The response to enrich. Modifies by
      # reference.
-     # @param follow_external_redirects [Boolean] Whether or not to follow
-     # an external redirect. If false, you must also provide a `host:`
-     # parameter.
-     # @param host [Wgit::Url, String] Specify the host by which
-     # an absolute redirect is determined to be internal or not. Must be
-     # absolute and contain a protocol prefix. For example, a `host:` of
-     # 'http://www.example.com' will only allow redirects for Urls with a
-     # `to_host` value of 'www.example.com'.
+     # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+     # redirects. Pass a Symbol to limit where the redirect is allowed to go
+     # e.g. :host only allows redirects within the same host. Choose from
+     # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
      # @raise [StandardError] If a redirect isn't allowed etc.
-     def resolve(url, response, follow_external_redirects: true, host: nil)
+     def resolve(url, response, follow_redirects: true)
+       origin = url.to_url.to_origin # Recorded before any redirects.
+       follow_redirects, within = redirect?(follow_redirects)
+
        loop do
-         get_response(url, response)
+         get_http_response(url, response)
          break unless response.redirect?

          # Handle response 'Location' header.
@@ -229,17 +266,18 @@ module Wgit

          yield(url, response, location) if block_given?

-         # Validate redirect.
-         if !follow_external_redirects && !location.relative?(host: host)
-           raise "External redirect not allowed - Redirected to: \
- '#{location}', which is outside of host: '#{host}'"
+         # Validate if the redirect is allowed.
+         raise "Redirect not allowed: #{location}" unless follow_redirects
+
+         if within && !location.relative?(within => origin)
+           raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
          end

          raise "Too many redirects, exceeded: #{@redirect_limit}" \
            if response.redirect_count >= @redirect_limit

          # Process the location to be crawled next.
-         location = url.to_base.concat(location) if location.relative?
+         location = url.to_origin.concat(location) if location.relative?
          response.redirections[url.to_s] = location.to_s
          url.replace(location) # Update the url on redirect.
        end
@@ -252,7 +290,7 @@ module Wgit
      # reference.
      # @raise [StandardError] If a response can't be obtained.
      # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-     def get_response(url, response)
+     def get_http_response(url, response)
        # Perform a HTTP GET request.
        orig_url = url.to_s
        url = url.normalize if url.respond_to?(:normalize)
@@ -268,18 +306,41 @@ module Wgit
        response.ip_address = http_response.primary_ip
        response.add_total_time(http_response.total_time)

-       # Log (debug) the request/response details.
-       resp_template = '[http] Response: %s (%s bytes in %s seconds)'
-       log_status = (response.status || 0)
-       log_total_time = response.total_time.truncate(3)
+       # Log the request/response details.
+       log_net(:http, response, http_response.total_time)

-       Wgit.logger.debug("[http] Request: #{response.url}")
-       Wgit.logger.debug(
-         format(resp_template, log_status, response.size, log_total_time)
-       )
+       # Handle a failed response.
+       raise "No response (within timeout: #{@timeout} second(s))" \
+         if response.failure?
+     end
+
+     # Makes a browser request and enriches the given Wgit::Response from it.
+     #
+     # @param url [String] The url to browse to. Will call url#normalize if
+     # possible.
+     # @param response [Wgit::Response] The response to enrich. Modifies by
+     # reference.
+     # @raise [StandardError] If a response can't be obtained.
+     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+     def get_browser_response(url, response)
+       url = url.normalize if url.respond_to?(:normalize)
+       browser = nil
+
+       crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+       yield browser if block_given?
+
+       # Enrich the given Wgit::Response object (on top of Typhoeus response).
+       response.adapter_response = browser.network.response
+       response.status = browser.network.response.status
+       response.headers = browser.network.response.headers
+       response.body = browser.body
+       response.add_total_time(crawl_time)
+
+       # Log the request/response details.
+       log_net(:browser, response, crawl_time)

        # Handle a failed response.
-       raise "No response (within timeout: #{@time_out} second(s))" \
+       raise "No browser response (within timeout: #{@timeout} second(s))" \
          if response.failure?
      end

@@ -290,7 +351,7 @@ module Wgit
      def http_get(url)
        opts = {
          followlocation: false,
-         timeout: @time_out,
+         timeout: @timeout,
          accept_encoding: 'gzip',
          headers: {
            'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -299,34 +360,58 @@ module Wgit
        }

        # See https://rubydoc.info/gems/typhoeus for more info.
-       Typhoeus.get(url, opts)
+       Typhoeus.get(url, **opts)
+     end
+
+     # Performs a HTTP GET request in a web browser and parses the response JS
+     # before returning the HTML body of the fully rendered webpage. This allows
+     # Javascript (SPA apps etc.) to generate HTML dynamically.
+     #
+     # @param url [String] The url to browse to.
+     # @return [Ferrum::Browser] The browser response object.
+     def browser_get(url)
+       @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+       @browser.goto(url)
+
+       # Wait for the page's JS to finish dynamically manipulating the DOM.
+       html = @browser.body
+       loop do
+         sleep @parse_javascript_delay
+         break if html.size == @browser.body.size
+
+         html = @browser.body
+       end
+
+       @browser
      end

      # Returns a doc's internal HTML page links in absolute form; used when
-     # crawling a site. Use the allow and disallow paths params to partially
-     # and selectively crawl a site.
+     # crawling a site. By default, any `<a>` href returning HTML is returned;
+     # override this with `xpath:` if desired.
      #
-     # Override this method in a subclass to change how a site
-     # is crawled; not what is extracted from each page (Document extensions
-     # should be used for this purpose instead). Just remember that only HTML
-     # files containing <a> links can keep the crawl going beyond the base URL.
+     # Use the allow and disallow paths params to partially and selectively
+     # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+     # that each path should NOT start with a slash.
      #
      # @param doc [Wgit::Document] The document from which to extract it's
-     # internal page links.
+     # internal (absolute) page links.
+     # @param xpath [String] The xpath selecting links to be returned. Only
+     # links pointing to the doc.url domain are allowed. The :default is any
+     # <a> href returning HTML. The allow/disallow paths will be applied to
+     # the returned value.
      # @param allow_paths [String, Array<String>] Filters links by selecting
-     # them only if their path includes one of allow_paths.
+     # them if their path `File.fnmatch?` one of allow_paths.
      # @param disallow_paths [String, Array<String>] Filters links by rejecting
-     # them if their path includes one of disallow_paths.
+     # them if their path `File.fnmatch?` one of disallow_paths.
      # @return [Array<Wgit::Url>] The internal page links from doc.
-     def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
-       links = doc
-         .internal_absolute_links
-         .map(&:omit_fragment) # Because fragments don't alter content.
-         .uniq
-         .reject do |link|
-           ext = link.to_extension
-           ext ? !%w[htm html].include?(ext.downcase) : false
-         end
+     def next_internal_links(
+       doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+     )
+       links = if xpath && xpath != :default
+                 follow_xpath(doc, xpath)
+               else
+                 follow_default(doc)
+               end

        return links if allow_paths.nil? && disallow_paths.nil?

@@ -335,40 +420,82 @@ module Wgit
      private


+     # Returns the next links used to continue crawling a site. The xpath value
+     # is used to obtain the links. Any valid URL Strings will be converted into
+     # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+     # pointing to the site domain will raise an error.
+     def follow_xpath(doc, xpath)
+       links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+         urls
+           .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+           .compact
+       end
+
+       if links.any? { |link| link.to_domain != doc.url.to_domain }
+         raise 'The links to follow must be within the site domain'
+       end
+
+       links
+     end
+
+     # Returns the default set of links used to continue crawling a site.
+     # By default, any <a> href returning HTML and pointing to the same domain
+     # will get returned.
+     def follow_default(doc)
+       doc
+         .internal_absolute_links
+         .map(&:omit_fragment) # Because fragments don't alter content.
+         .uniq
+         .select do |link| # Whitelist only HTML content.
+           ext = link.to_extension
+           if ext
+             Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+           else
+             true # URLs without an extension are assumed HTML.
+           end
+         end
+     end
+
      # Validate and filter by the given URL paths.
      def process_paths(links, allow_paths, disallow_paths)
-       raise "You can't provide both allow_paths: and disallow_paths: params" \
-         if allow_paths && disallow_paths
-
-       if allow_paths # White list.
-         filter_method = :select
-         paths = allow_paths
-       else # Black list.
-         filter_method = :reject
-         paths = disallow_paths
+       if allow_paths
+         paths = validate_paths(allow_paths)
+         filter_links(links, :select!, paths)
        end

-       paths = [paths] unless paths.is_a?(Array)
-       paths = paths
-         .compact
-         .reject(&:empty?)
-         .uniq
-         .map { |path| Wgit::Url.new(path).to_path }
+       if disallow_paths
+         paths = validate_paths(disallow_paths)
+         filter_links(links, :reject!, paths)
+       end

+       links
+     end
+
+     # Validate the paths are suitable for filtering.
+     def validate_paths(paths)
+       paths = *paths
+       raise 'The provided paths must all be Strings' \
+         unless paths.all? { |path| path.is_a?(String) }
+
+       Wgit::Utils.sanitize(paths, encode: false)
        raise 'The provided paths cannot be empty' if paths.empty?

-       filter_links_by_path(links, filter_method, paths)
+       paths.map do |path|
+         path = Wgit::Url.parse(path)
+         path.index? ? path : path.omit_slashes
+       end
      end

-     # Filters links by selecting or rejecting them based on their path.
-     def filter_links_by_path(links, filter_method, paths)
+     # Filters links by selecting/rejecting them based on their path.
+     # Uses File.fnmatch? so that globbing is supported.
+     def filter_links(links, filter_method, paths)
        links.send(filter_method) do |link|
-         link_path = link.to_path
-         next(false) unless link_path
+         # Turn http://example.com into / meaning index.
+         link = link.to_endpoint.index? ? '/' : link.omit_base

          match = false
-         paths.each do |path|
-           match = link_path.start_with?(path)
+         paths.each do |pattern|
+           match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
            break if match
          end

@@ -376,6 +503,35 @@ module Wgit
        end
      end

+     # Returns whether or not to follow redirects, and within what context e.g.
+     # :host, :domain etc.
+     def redirect?(follow_redirects)
+       return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+       unless [true, false].include?(follow_redirects)
+         raise "follow_redirects: must be a Boolean or Symbol, not: \
+ #{follow_redirects}"
+       end
+
+       [follow_redirects, nil]
+     end
+
+     # Log (at debug level) the network request/response details.
+     def log_net(client, response, duration)
+       resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+       log_status = (response.status || 0)
+       log_total_time = (duration || 0.0).truncate(3)
+
+       # The browsers request URL is the same so ignore it.
+       if client.to_sym == :http
+         Wgit.logger.debug("[#{client}] Request: #{response.url}")
+       end
+
+       Wgit.logger.debug(
+         format(resp_template, log_status, response.size, log_total_time)
+       )
+     end
+
      alias crawl crawl_urls
      alias crawl_pages crawl_urls
      alias crawl_page crawl_url
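
Finally, the JavaScript parsing added in this release is opted into per Crawler instance and requires Chrome/Chromium on $PATH (driven by the ferrum gem). A minimal, illustrative sketch:

    require 'wgit'

    # parse_javascript needs Chrome/Chromium installed and in $PATH.
    crawler = Wgit::Crawler.new(parse_javascript: true, parse_javascript_delay: 2)

    crawler.crawl(Wgit::Url.new('https://example.com/spa')) do |doc|
      # doc.html is the DOM serialised after the page's JS has stopped mutating it.
      puts doc.html.size
      puts crawler.last_response.total_time
    end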