wgit 0.5.1 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +249 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +232 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +241 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -14
- data/lib/wgit/url.rb +213 -73
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +3 -2
- metadata +38 -19
data/bin/wgit
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'wgit'
|
4
|
+
|
5
|
+
# Evals the first `.wgit.rb` file found (if one exists somewhere).
#
# Directories are checked in this order: the given directory (when provided),
# the current working directory, then the user's home directory. Only the
# first match is eval'd.
#
# NOTE(review): the file's contents are executed as arbitrary Ruby via `eval`;
# this looks deliberate (a user-owned config script) but it should only ever
# be pointed at trusted directories.
#
# @param filepath [String, nil] Optional directory to search first. Despite
#   the name, it's treated as a directory containing `.wgit.rb`.
# @return [nil] Always nil.
def eval_wgit(filepath = nil)
  puts 'Searching for .wgit.rb file in local and home directories...'

  # compact drops a nil filepath, which would otherwise interpolate to an
  # empty String and probe '/.wgit.rb' at the filesystem root.
  [filepath, Dir.pwd, Dir.home].compact.each do |dir|
    path = "#{dir}/.wgit.rb"
    next unless File.exist?(path)

    puts "Eval'ing #{path}"
    puts 'Call `eval_wgit` after changes to re-eval the file'
    # Passing the path means errors raised by the file report its location
    # instead of an anonymous "(eval)" frame.
    eval(File.read(path), binding, path)

    break
  end

  nil
end
|
22
|
+
|
23
|
+
eval_wgit
|
24
|
+
puts "\n#{Wgit.version_str}\n\n"
|
25
|
+
|
26
|
+
# Pick the REPL: Pry when the gem can be loaded, otherwise IRB.
klass =
  begin
    require 'pry'
    Pry
  rescue LoadError
    # Pry isn't available, so load IRB and tell the user.
    require 'irb'
    puts "Starting IRB because Pry isn't installed."
    IRB
  end

# Hand control over to the chosen REPL.
klass.start
|
38
|
+
|
39
|
+
puts 'Interactive session complete.'
|
data/lib/wgit.rb
CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
|
|
6
6
|
require_relative 'wgit/utils'
|
7
7
|
require_relative 'wgit/url'
|
8
8
|
require_relative 'wgit/document'
|
9
|
-
require_relative 'wgit/
|
9
|
+
require_relative 'wgit/document_extractors'
|
10
10
|
require_relative 'wgit/crawler'
|
11
11
|
require_relative 'wgit/database/model'
|
12
12
|
require_relative 'wgit/database/database'
|
13
13
|
require_relative 'wgit/indexer'
|
14
|
+
require_relative 'wgit/dsl'
|
15
|
+
require_relative 'wgit/base'
|
14
16
|
# require_relative 'wgit/core_ext' - Must be explicitly required.
|
data/lib/wgit/assertable.rb
CHANGED
@@ -6,7 +6,7 @@ module Wgit
|
|
6
6
|
# Default type fail message.
|
7
7
|
DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
|
8
8
|
# Wrong method message.
|
9
|
-
|
9
|
+
NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
|
10
10
|
# Default duck fail message.
|
11
11
|
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
|
12
12
|
# Default required keys message.
|
@@ -42,7 +42,7 @@ present: %s"
|
|
42
42
|
# @raise [StandardError] If the assertion fails.
|
43
43
|
# @return [Object] The given arr on successful assertion.
|
44
44
|
def assert_arr_types(arr, type_or_types, msg = nil)
|
45
|
-
raise
|
45
|
+
raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)
|
46
46
|
|
47
47
|
arr.each { |obj| assert_types(obj, type_or_types, msg) }
|
48
48
|
end
|
@@ -56,7 +56,7 @@ present: %s"
|
|
56
56
|
# @raise [StandardError] If the assertion fails.
|
57
57
|
# @return [Object] The given obj_or_objs on successful assertion.
|
58
58
|
def assert_respond_to(obj_or_objs, methods, msg = nil)
|
59
|
-
methods =
|
59
|
+
methods = *methods
|
60
60
|
|
61
61
|
if obj_or_objs.respond_to?(:each)
|
62
62
|
obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
|
data/lib/wgit/base.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Wgit
  # Inherit from this class as an alternative to using the `Wgit::DSL`
  # directly. Every subclass must define a `#parse(doc, &block)` method.
  class Base
    extend Wgit::DSL

    class << self
      # Runs the crawl/index, handing each yielded `Wgit::Document` (plus the
      # given block) to a new instance's `#parse` method.
      #
      # The method invoked on the class is the one set via `.mode`, falling
      # back to `:crawl` when no mode has been set.
      #
      # @yield Forwarded as the block argument of `#parse`.
      # @raise [RuntimeError] If the new instance doesn't respond to `#parse`.
      # @return [Wgit::Base] The instance whose `#parse` was called.
      def run(&block)
        parser = new
        raise "#{parser.class} must respond_to? #parse(doc, &block)" \
          unless parser.respond_to?(:parse)

        send(@method || :crawl) { |doc| parser.parse(doc, &block) }

        parser
      end

      # Sets the crawl/index method to call when `Base.run` is called.
      # The mode method must match one defined in the `Wgit::Crawler` or
      # `Wgit::Indexer` class.
      #
      # @param method [Symbol] The crawl/index method to call.
      def mode(method)
        @method = method
      end
    end
  end
end
|
data/lib/wgit/core_ext.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -5,26 +5,55 @@ require_relative 'document'
|
|
5
5
|
require_relative 'utils'
|
6
6
|
require_relative 'assertable'
|
7
7
|
require_relative 'response'
|
8
|
+
require 'set'
|
9
|
+
require 'benchmark'
|
8
10
|
require 'typhoeus'
|
11
|
+
require 'ferrum'
|
9
12
|
|
10
13
|
module Wgit
|
11
|
-
# The Crawler class provides a means of crawling web based HTTP Wgit::Url
|
12
|
-
# serialising their HTML into Wgit::Document instances. This is the
|
13
|
-
# class
|
14
|
+
# The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
|
15
|
+
# and serialising their HTML into `Wgit::Document` instances. This is the
|
16
|
+
# only Wgit class containing network logic (HTTP request/response handling).
|
14
17
|
class Crawler
|
15
18
|
include Assertable
|
16
19
|
|
20
|
+
# Set of supported file extensions for Wgit::Crawler#crawl_site.
|
21
|
+
@supported_file_extensions = Set.new(
|
22
|
+
%w[asp aspx cfm cgi htm html htmlx jsp php]
|
23
|
+
)
|
24
|
+
|
25
|
+
class << self
|
26
|
+
# The URL file extensions (from `<a>` hrefs) which will be crawled by
|
27
|
+
# `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
|
28
|
+
# doesn't keep the crawl of the site going. All URLs without a file
|
29
|
+
# extension will be crawled, because they're assumed to be HTML.
|
30
|
+
# The `#crawl` method will crawl anything since it's given the URL(s).
|
31
|
+
# You can add your own site's URL file extension e.g.
|
32
|
+
# `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
|
33
|
+
attr_reader :supported_file_extensions
|
34
|
+
end
|
35
|
+
|
17
36
|
# The amount of allowed redirects before raising an error. Set to 0 to
|
18
|
-
# disable redirects completely
|
37
|
+
# disable redirects completely; or you can pass `follow_redirects: false`
|
38
|
+
# to any Wgit::Crawler.crawl_* method.
|
19
39
|
attr_accessor :redirect_limit
|
20
40
|
|
21
41
|
# The maximum amount of time (in seconds) a crawl request has to complete
|
22
42
|
# before raising an error. Set to 0 to disable time outs completely.
|
23
|
-
attr_accessor :
|
43
|
+
attr_accessor :timeout
|
44
|
+
|
45
|
+
# Whether or not to UTF-8 encode the response body once crawled. Set to
|
46
|
+
# false if crawling more than just HTML e.g. images.
|
47
|
+
attr_accessor :encode
|
24
48
|
|
25
|
-
# Whether or not to
|
26
|
-
#
|
27
|
-
attr_accessor :
|
49
|
+
# Whether or not to parse the Javascript of the crawled document.
|
50
|
+
# Parsing requires Chrome/Chromium to be installed and in $PATH.
|
51
|
+
attr_accessor :parse_javascript
|
52
|
+
|
53
|
+
# The delay between checks in a page's HTML size. When the page has stopped
|
54
|
+
# "growing", the Javascript has finished dynamically updating the DOM.
|
55
|
+
# The value should balance between a good UX and enough JS parse time.
|
56
|
+
attr_accessor :parse_javascript_delay
|
28
57
|
|
29
58
|
# The Wgit::Response of the most recently crawled URL.
|
30
59
|
attr_reader :last_response
|
@@ -33,21 +62,32 @@ module Wgit
|
|
33
62
|
#
|
34
63
|
# @param redirect_limit [Integer] The amount of allowed redirects before
|
35
64
|
# raising an error. Set to 0 to disable redirects completely.
|
36
|
-
# @param
|
65
|
+
# @param timeout [Integer, Float] The maximum amount of time (in seconds)
|
37
66
|
# a crawl request has to complete before raising an error. Set to 0 to
|
38
67
|
# disable time outs completely.
|
39
|
-
# @param
|
40
|
-
# crawled. Set to false if crawling more than just HTML e.g. images
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
68
|
+
# @param encode [Boolean] Whether or not to UTF-8 encode the response body
|
69
|
+
# once crawled. Set to false if crawling more than just HTML e.g. images.
|
70
|
+
# @param parse_javascript [Boolean] Whether or not to parse the Javascript
|
71
|
+
# of the crawled document. Parsing requires Chrome/Chromium to be
|
72
|
+
# installed and in $PATH.
|
73
|
+
def initialize(redirect_limit: 5, timeout: 5, encode: true,
|
74
|
+
parse_javascript: false, parse_javascript_delay: 1)
|
75
|
+
@redirect_limit = redirect_limit
|
76
|
+
@timeout = timeout
|
77
|
+
@encode = encode
|
78
|
+
@parse_javascript = parse_javascript
|
79
|
+
@parse_javascript_delay = parse_javascript_delay
|
45
80
|
end
|
46
81
|
|
47
82
|
# Crawls an entire website's HTML pages by recursively going through
|
48
|
-
# its internal
|
49
|
-
#
|
50
|
-
#
|
83
|
+
# its internal `<a>` links; this can be overridden with `follow: xpath`.
|
84
|
+
# Each crawled Document is yielded to a block. Use `doc.empty?` to
|
85
|
+
# determine if the crawled link was successful / is valid.
|
86
|
+
#
|
87
|
+
# Use the allow and disallow paths params to partially and selectively
|
88
|
+
# crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
|
89
|
+
# Note that each path must NOT start with a slash; the only exception being
|
90
|
+
# a `/` on its own with no other characters, referring to the index page.
|
51
91
|
#
|
52
92
|
# Only redirects to the same host are followed. For example, the Url
|
53
93
|
# 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
|
@@ -60,69 +100,79 @@ module Wgit
|
|
60
100
|
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
61
101
|
# It is recommended that this URL be the index page of the site to give a
|
62
102
|
# greater chance of finding all pages within that site/host.
|
63
|
-
# @param
|
64
|
-
#
|
65
|
-
#
|
66
|
-
#
|
103
|
+
# @param follow [String] The xpath extracting links to be followed during
|
104
|
+
# the crawl. This changes how a site is crawled. Only links pointing to
|
105
|
+
# the site domain are allowed. The `:default` is any `<a>` href returning
|
106
|
+
# HTML.
|
107
|
+
# @param allow_paths [String, Array<String>] Filters the `follow:` links by
|
108
|
+
# selecting them if their path `File.fnmatch?` one of allow_paths.
|
109
|
+
# @param disallow_paths [String, Array<String>] Filters the `follow` links
|
110
|
+
# by rejecting them if their path `File.fnmatch?` one of disallow_paths.
|
67
111
|
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
68
112
|
# A block is the only way to interact with each crawled Document.
|
113
|
+
# Use `doc.empty?` to determine if the page is valid.
|
69
114
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
70
|
-
# from all of the site's pages or nil if the url could not be
|
115
|
+
# from all of the site's pages or nil if the given url could not be
|
71
116
|
# crawled successfully.
|
72
|
-
def crawl_site(
|
117
|
+
def crawl_site(
|
118
|
+
url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
|
119
|
+
)
|
73
120
|
doc = crawl_url(url, &block)
|
74
121
|
return nil if doc.nil?
|
75
122
|
|
76
|
-
|
77
|
-
|
78
|
-
|
123
|
+
link_opts = {
|
124
|
+
xpath: follow,
|
125
|
+
allow_paths: allow_paths,
|
126
|
+
disallow_paths: disallow_paths
|
127
|
+
}
|
79
128
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
80
|
-
crawled = [url, alt_url]
|
81
|
-
externals = doc.external_links
|
82
|
-
internals = get_internal_links(doc, link_opts)
|
83
129
|
|
84
|
-
|
130
|
+
crawled = Set.new([url, alt_url])
|
131
|
+
externals = Set.new(doc.external_links)
|
132
|
+
internals = Set.new(next_internal_links(doc, **link_opts))
|
85
133
|
|
86
|
-
|
87
|
-
crawled.uniq!
|
88
|
-
internals.uniq!
|
134
|
+
return externals.to_a if internals.empty?
|
89
135
|
|
136
|
+
loop do
|
90
137
|
links = internals - crawled
|
91
138
|
break if links.empty?
|
92
139
|
|
93
140
|
links.each do |link|
|
94
141
|
orig_link = link.dup
|
95
|
-
doc = crawl_url(link,
|
142
|
+
doc = crawl_url(link, follow_redirects: :host, &block)
|
96
143
|
|
97
|
-
crawled
|
144
|
+
crawled += [orig_link, link] # Push both links in case of redirects.
|
98
145
|
next if doc.nil?
|
99
146
|
|
100
|
-
internals
|
101
|
-
externals
|
147
|
+
internals += next_internal_links(doc, **link_opts)
|
148
|
+
externals += doc.external_links
|
102
149
|
end
|
103
150
|
end
|
104
151
|
|
105
|
-
externals.
|
152
|
+
externals.to_a
|
106
153
|
end
|
107
154
|
|
108
155
|
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
109
156
|
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
|
110
157
|
#
|
111
158
|
# @param urls [*Wgit::Url] The Urls to crawl.
|
159
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
160
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
161
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
162
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
163
|
+
# This value will be used for all urls crawled.
|
112
164
|
# @yield [doc] Given each crawled page (Wgit::Document); this is the only
|
113
|
-
# way to interact with them.
|
165
|
+
# way to interact with them. Use `doc.empty?` to determine if the page
|
166
|
+
# is valid.
|
114
167
|
# @raise [StandardError] If no urls are provided.
|
115
168
|
# @return [Wgit::Document] The last Document crawled.
|
116
|
-
def crawl_urls(*urls,
|
169
|
+
def crawl_urls(*urls, follow_redirects: true, &block)
|
117
170
|
raise 'You must provide at least one Url' if urls.empty?
|
118
171
|
|
119
|
-
opts = {
|
120
|
-
follow_external_redirects: follow_external_redirects,
|
121
|
-
host: host
|
122
|
-
}
|
172
|
+
opts = { follow_redirects: follow_redirects }
|
123
173
|
doc = nil
|
124
174
|
|
125
|
-
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
|
175
|
+
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
|
126
176
|
|
127
177
|
doc
|
128
178
|
end
|
@@ -130,34 +180,25 @@ module Wgit
|
|
130
180
|
# Crawl the url returning the response Wgit::Document or nil, if an error
|
131
181
|
# occurs.
|
132
182
|
#
|
133
|
-
# @param url [Wgit::Url] The Url to crawl; which will
|
134
|
-
#
|
135
|
-
#
|
136
|
-
#
|
137
|
-
#
|
138
|
-
#
|
139
|
-
# an absolute redirect is determined to be internal or not. Must be
|
140
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
141
|
-
# 'http://www.example.com' will only allow redirects for Url's with a
|
142
|
-
# `to_host` value of 'www.example.com'.
|
183
|
+
# @param url [Wgit::Url] The Url to crawl; which will be modified in the
|
184
|
+
# event of a redirect.
|
185
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
186
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
187
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
188
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
143
189
|
# @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
|
144
190
|
# crawl was successful or not. Therefore, Document#url etc. can be used.
|
191
|
+
# Use `doc.empty?` to determine if the page is valid.
|
145
192
|
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
146
193
|
# crawl was unsuccessful.
|
147
|
-
def crawl_url(url,
|
194
|
+
def crawl_url(url, follow_redirects: true)
|
148
195
|
# A String url isn't allowed because it's passed by value not reference,
|
149
196
|
# meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
|
150
197
|
assert_type(url, Wgit::Url)
|
151
|
-
raise 'host cannot be nil if follow_external_redirects is false' \
|
152
|
-
if !follow_external_redirects && host.nil?
|
153
198
|
|
154
|
-
html = fetch(
|
155
|
-
|
156
|
-
follow_external_redirects: follow_external_redirects,
|
157
|
-
host: host
|
158
|
-
)
|
199
|
+
html = fetch(url, follow_redirects: follow_redirects)
|
200
|
+
doc = Wgit::Document.new(url, html, encode: @encode)
|
159
201
|
|
160
|
-
doc = Wgit::Document.new(url, html, encode_html: @encode_html)
|
161
202
|
yield(doc) if block_given?
|
162
203
|
|
163
204
|
doc.empty? ? nil : doc
|
@@ -165,31 +206,28 @@ module Wgit
|
|
165
206
|
|
166
207
|
protected
|
167
208
|
|
168
|
-
# Returns the
|
209
|
+
# Returns the URL's HTML String or nil. Handles any errors that arise
|
169
210
|
# and sets the @last_response. Errors or any HTTP response that doesn't
|
170
211
|
# return a HTML body will be ignored, returning nil.
|
171
212
|
#
|
213
|
+
# If @parse_javascript is true, then the final resolved URL will be browsed
|
214
|
+
# to and Javascript parsed allowing for dynamic HTML generation.
|
215
|
+
#
|
172
216
|
# @param url [Wgit::Url] The URL to fetch. This Url object is passed by
|
173
217
|
# reference and gets modified as a result of the fetch/crawl.
|
174
|
-
# @param
|
175
|
-
#
|
176
|
-
#
|
177
|
-
#
|
178
|
-
#
|
179
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
180
|
-
# 'http://www.example.com' will only allow redirects for Urls with a
|
181
|
-
# `to_host` value of 'www.example.com'.
|
218
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
219
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
220
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
221
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
222
|
+
# @raise [StandardError] If url isn't valid and absolute.
|
182
223
|
# @return [String, nil] The crawled HTML or nil if the crawl was
|
183
224
|
# unsuccessful.
|
184
|
-
def fetch(url,
|
225
|
+
def fetch(url, follow_redirects: true)
|
185
226
|
response = Wgit::Response.new
|
227
|
+
raise "Invalid url: #{url}" if url.invalid?
|
186
228
|
|
187
|
-
resolve(
|
188
|
-
|
189
|
-
response,
|
190
|
-
follow_external_redirects: follow_external_redirects,
|
191
|
-
host: host
|
192
|
-
)
|
229
|
+
resolve(url, response, follow_redirects: follow_redirects)
|
230
|
+
get_browser_response(url, response) if @parse_javascript
|
193
231
|
|
194
232
|
response.body_or_nil
|
195
233
|
rescue StandardError => e
|
@@ -209,18 +247,17 @@ module Wgit
|
|
209
247
|
# @param url [Wgit::Url] The URL to GET and resolve.
|
210
248
|
# @param response [Wgit::Response] The response to enrich. Modifies by
|
211
249
|
# reference.
|
212
|
-
# @param
|
213
|
-
#
|
214
|
-
#
|
215
|
-
#
|
216
|
-
# an absolute redirect is determined to be internal or not. Must be
|
217
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
218
|
-
# 'http://www.example.com' will only allow redirects for Urls with a
|
219
|
-
# `to_host` value of 'www.example.com'.
|
250
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
251
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
252
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
253
|
+
# :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
220
254
|
# @raise [StandardError] If a redirect isn't allowed etc.
|
221
|
-
def resolve(url, response,
|
255
|
+
def resolve(url, response, follow_redirects: true)
|
256
|
+
origin = url.to_url.to_origin # Recorded before any redirects.
|
257
|
+
follow_redirects, within = redirect?(follow_redirects)
|
258
|
+
|
222
259
|
loop do
|
223
|
-
|
260
|
+
get_http_response(url, response)
|
224
261
|
break unless response.redirect?
|
225
262
|
|
226
263
|
# Handle response 'Location' header.
|
@@ -229,17 +266,18 @@ module Wgit
|
|
229
266
|
|
230
267
|
yield(url, response, location) if block_given?
|
231
268
|
|
232
|
-
# Validate redirect.
|
233
|
-
|
234
|
-
|
235
|
-
|
269
|
+
# Validate if the redirect is allowed.
|
270
|
+
raise "Redirect not allowed: #{location}" unless follow_redirects
|
271
|
+
|
272
|
+
if within && !location.relative?(within => origin)
|
273
|
+
raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
|
236
274
|
end
|
237
275
|
|
238
276
|
raise "Too many redirects, exceeded: #{@redirect_limit}" \
|
239
277
|
if response.redirect_count >= @redirect_limit
|
240
278
|
|
241
279
|
# Process the location to be crawled next.
|
242
|
-
location = url.
|
280
|
+
location = url.to_origin.concat(location) if location.relative?
|
243
281
|
response.redirections[url.to_s] = location.to_s
|
244
282
|
url.replace(location) # Update the url on redirect.
|
245
283
|
end
|
@@ -252,7 +290,7 @@ module Wgit
|
|
252
290
|
# reference.
|
253
291
|
# @raise [StandardError] If a response can't be obtained.
|
254
292
|
# @return [Wgit::Response] The enriched HTTP Wgit::Response object.
|
255
|
-
def
|
293
|
+
def get_http_response(url, response)
|
256
294
|
# Perform a HTTP GET request.
|
257
295
|
orig_url = url.to_s
|
258
296
|
url = url.normalize if url.respond_to?(:normalize)
|
@@ -268,18 +306,41 @@ module Wgit
|
|
268
306
|
response.ip_address = http_response.primary_ip
|
269
307
|
response.add_total_time(http_response.total_time)
|
270
308
|
|
271
|
-
# Log
|
272
|
-
|
273
|
-
log_status = (response.status || 0)
|
274
|
-
log_total_time = response.total_time.truncate(3)
|
309
|
+
# Log the request/response details.
|
310
|
+
log_net(:http, response, http_response.total_time)
|
275
311
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
312
|
+
# Handle a failed response.
|
313
|
+
raise "No response (within timeout: #{@timeout} second(s))" \
|
314
|
+
if response.failure?
|
315
|
+
end
|
316
|
+
|
317
|
+
# Makes a browser request and enriches the given Wgit::Response from it.
|
318
|
+
#
|
319
|
+
# @param url [String] The url to browse to. Will call url#normalize if
|
320
|
+
# possible.
|
321
|
+
# @param response [Wgit::Response] The response to enrich. Modifies by
|
322
|
+
# reference.
|
323
|
+
# @raise [StandardError] If a response can't be obtained.
|
324
|
+
# @return [Wgit::Response] The enriched HTTP Wgit::Response object.
|
325
|
+
def get_browser_response(url, response)
|
326
|
+
url = url.normalize if url.respond_to?(:normalize)
|
327
|
+
browser = nil
|
328
|
+
|
329
|
+
crawl_time = Benchmark.measure { browser = browser_get(url) }.real
|
330
|
+
yield browser if block_given?
|
331
|
+
|
332
|
+
# Enrich the given Wgit::Response object (on top of Typhoeus response).
|
333
|
+
response.adapter_response = browser.network.response
|
334
|
+
response.status = browser.network.response.status
|
335
|
+
response.headers = browser.network.response.headers
|
336
|
+
response.body = browser.body
|
337
|
+
response.add_total_time(crawl_time)
|
338
|
+
|
339
|
+
# Log the request/response details.
|
340
|
+
log_net(:browser, response, crawl_time)
|
280
341
|
|
281
342
|
# Handle a failed response.
|
282
|
-
raise "No response (within timeout: #{@
|
343
|
+
raise "No browser response (within timeout: #{@timeout} second(s))" \
|
283
344
|
if response.failure?
|
284
345
|
end
|
285
346
|
|
@@ -290,7 +351,7 @@ module Wgit
|
|
290
351
|
def http_get(url)
|
291
352
|
opts = {
|
292
353
|
followlocation: false,
|
293
|
-
timeout: @
|
354
|
+
timeout: @timeout,
|
294
355
|
accept_encoding: 'gzip',
|
295
356
|
headers: {
|
296
357
|
'User-Agent' => "wgit/#{Wgit::VERSION}",
|
@@ -299,34 +360,58 @@ module Wgit
|
|
299
360
|
}
|
300
361
|
|
301
362
|
# See https://rubydoc.info/gems/typhoeus for more info.
|
302
|
-
Typhoeus.get(url, opts)
|
363
|
+
Typhoeus.get(url, **opts)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Performs a HTTP GET request in a web browser and waits for the page's JS
# to finish rendering before returning the browser. This allows Javascript
# (SPA apps etc.) to generate HTML dynamically; the rendered HTML is then
# available via the returned browser's `#body`.
#
# The browser is created on first use and memoised in @browser, so later
# calls reuse the same instance.
#
# NOTE(review): the wait loop has no upper bound; a page whose body size
# never settles will keep this method polling.
#
# @param url [String] The url to browse to.
# @return [Ferrum::Browser] The browser response object.
def browser_get(url)
  @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
  @browser.goto(url)

  # Wait for the page's JS to finish dynamically manipulating the DOM: poll
  # the body every @parse_javascript_delay seconds until its size is the same
  # for two consecutive snapshots.
  previous_size = @browser.body.size
  loop do
    sleep @parse_javascript_delay

    # Read the body once per poll so the size being compared and the size
    # carried forward come from the same snapshot.
    current_size = @browser.body.size
    break if current_size == previous_size

    previous_size = current_size
  end

  @browser
end
|
304
387
|
|
305
388
|
# Returns a doc's internal HTML page links in absolute form; used when
|
306
|
-
# crawling a site.
|
307
|
-
#
|
389
|
+
# crawling a site. By default, any `<a>` href returning HTML is returned;
|
390
|
+
# override this with `xpath:` if desired.
|
308
391
|
#
|
309
|
-
#
|
310
|
-
#
|
311
|
-
#
|
312
|
-
# files containing <a> links can keep the crawl going beyond the base URL.
|
392
|
+
# Use the allow and disallow paths params to partially and selectively
|
393
|
+
# crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
|
394
|
+
# that each path should NOT start with a slash.
|
313
395
|
#
|
314
396
|
# @param doc [Wgit::Document] The document from which to extract its
|
315
|
-
# internal page links.
|
397
|
+
# internal (absolute) page links.
|
398
|
+
# @param xpath [String] The xpath selecting links to be returned. Only
|
399
|
+
# links pointing to the doc.url domain are allowed. The :default is any
|
400
|
+
# <a> href returning HTML. The allow/disallow paths will be applied to
|
401
|
+
# the returned value.
|
316
402
|
# @param allow_paths [String, Array<String>] Filters links by selecting
|
317
|
-
# them
|
403
|
+
# them if their path `File.fnmatch?` one of allow_paths.
|
318
404
|
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
319
|
-
# them if their path
|
405
|
+
# them if their path `File.fnmatch?` one of disallow_paths.
|
320
406
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
321
|
-
def
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
end
|
407
|
+
def next_internal_links(
|
408
|
+
doc, xpath: :default, allow_paths: nil, disallow_paths: nil
|
409
|
+
)
|
410
|
+
links = if xpath && xpath != :default
|
411
|
+
follow_xpath(doc, xpath)
|
412
|
+
else
|
413
|
+
follow_default(doc)
|
414
|
+
end
|
330
415
|
|
331
416
|
return links if allow_paths.nil? && disallow_paths.nil?
|
332
417
|
|
@@ -335,40 +420,82 @@ module Wgit
|
|
335
420
|
|
336
421
|
private
|
337
422
|
|
423
|
+
# Returns the links (selected by the given xpath) used to continue crawling a
# site. Each extracted value is run through `Wgit::Url.parse?` and made
# absolute relative to doc; values for which `parse?` returns nil are dropped.
#
# @param doc [Wgit::Document] The document to extract the links from.
# @param xpath [String] The xpath selecting the links to follow.
# @raise [RuntimeError] If any link's domain differs from doc.url's domain.
# @return [Array<Wgit::Url>] The absolute links to follow.
def follow_xpath(doc, xpath)
  links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
    absolute = urls.map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
    absolute.compact
  end

  # Every link must stay within the site being crawled.
  unless links.all? { |link| link.to_domain == doc.url.to_domain }
    raise 'The links to follow must be within the site domain'
  end

  links
end
|
440
|
+
|
441
|
+
# Returns the default set of links used to continue crawling a site: the
# doc's internal absolute links, minus their fragments and de-duplicated,
# keeping only those that look like HTML pages.
#
# @param doc [Wgit::Document] The document to take the links from.
# @return [Array<Wgit::Url>] The links to follow.
def follow_default(doc)
  # Fragments don't alter content, so strip them before de-duplicating.
  candidates = doc.internal_absolute_links.map(&:omit_fragment).uniq

  # Whitelist only HTML content.
  candidates.select do |link|
    extension = link.to_extension

    # URLs without an extension are assumed HTML.
    !extension ||
      Wgit::Crawler.supported_file_extensions.include?(extension.downcase)
  end
end
|
458
|
+
|
338
459
|
# Validate and filter by the given URL paths.
|
339
460
|
def process_paths(links, allow_paths, disallow_paths)
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
if allow_paths # White list.
|
344
|
-
filter_method = :select
|
345
|
-
paths = allow_paths
|
346
|
-
else # Black list.
|
347
|
-
filter_method = :reject
|
348
|
-
paths = disallow_paths
|
461
|
+
if allow_paths
|
462
|
+
paths = validate_paths(allow_paths)
|
463
|
+
filter_links(links, :select!, paths)
|
349
464
|
end
|
350
465
|
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
.uniq
|
356
|
-
.map { |path| Wgit::Url.new(path).to_path }
|
466
|
+
if disallow_paths
|
467
|
+
paths = validate_paths(disallow_paths)
|
468
|
+
filter_links(links, :reject!, paths)
|
469
|
+
end
|
357
470
|
|
471
|
+
links
|
472
|
+
end
|
473
|
+
|
474
|
+
# Validate the paths are suitable for filtering.
|
475
|
+
def validate_paths(paths)
|
476
|
+
paths = *paths
|
477
|
+
raise 'The provided paths must all be Strings' \
|
478
|
+
unless paths.all? { |path| path.is_a?(String) }
|
479
|
+
|
480
|
+
Wgit::Utils.sanitize(paths, encode: false)
|
358
481
|
raise 'The provided paths cannot be empty' if paths.empty?
|
359
482
|
|
360
|
-
|
483
|
+
paths.map do |path|
|
484
|
+
path = Wgit::Url.parse(path)
|
485
|
+
path.index? ? path : path.omit_slashes
|
486
|
+
end
|
361
487
|
end
|
362
488
|
|
363
|
-
# Filters links by selecting
|
364
|
-
|
489
|
+
# Filters links by selecting/rejecting them based on their path.
|
490
|
+
# Uses File.fnmatch? so that globbing is supported.
|
491
|
+
def filter_links(links, filter_method, paths)
|
365
492
|
links.send(filter_method) do |link|
|
366
|
-
|
367
|
-
|
493
|
+
# Turn http://example.com into / meaning index.
|
494
|
+
link = link.to_endpoint.index? ? '/' : link.omit_base
|
368
495
|
|
369
496
|
match = false
|
370
|
-
paths.each do |
|
371
|
-
match =
|
497
|
+
paths.each do |pattern|
|
498
|
+
match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
|
372
499
|
break if match
|
373
500
|
end
|
374
501
|
|
@@ -376,6 +503,35 @@ module Wgit
|
|
376
503
|
end
|
377
504
|
end
|
378
505
|
|
506
|
+
# Works out whether redirects should be followed and, if so, within what
# context e.g. :host, :domain etc.
#
# @param follow_redirects [Boolean, Symbol] true/false, or a Symbol naming
#   the context redirects are limited to.
# @raise [RuntimeError] If follow_redirects is neither a Boolean nor a Symbol.
# @return [Array] A pair of [follow, within]: a Symbol gives [true, symbol];
#   a Boolean gives [boolean, nil].
def redirect?(follow_redirects)
  case follow_redirects
  when Symbol
    [true, follow_redirects]
  when true, false
    [follow_redirects, nil]
  else
    raise "follow_redirects: must be a Boolean or Symbol, not: #{follow_redirects}"
  end
end
|
518
|
+
|
519
|
+
# Logs (at debug level) the details of a network request/response.
#
# @param client [Symbol, String] Label for the client used, e.g. :http or
#   :browser; only :http logs the request URL.
# @param response [#status, #size, #url] The response to report on
#   (presumably a Wgit::Response). A nil status is logged as 0.
# @param duration [Numeric, nil] Time taken in seconds, truncated to 3
#   decimal places; nil is logged as 0.0.
def log_net(client, response, duration)
  status  = response.status || 0
  seconds = (duration || 0.0).truncate(3)

  # The browser's request URL is the same so ignore it.
  Wgit.logger.debug("[#{client}] Request: #{response.url}") if client.to_sym == :http

  Wgit.logger.debug(
    "[#{client}] Response: %s (%s bytes in %s seconds)" %
      [status, response.size, seconds]
  )
end
|
534
|
+
|
379
535
|
alias crawl crawl_urls
|
380
536
|
alias crawl_pages crawl_urls
|
381
537
|
alias crawl_page crawl_url
|