wgit 0.5.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +240 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +239 -0
- data/bin/wgit +39 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +304 -148
- data/lib/wgit/database/database.rb +310 -135
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +234 -169
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +20 -10
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +68 -156
- data/lib/wgit/response.rb +17 -17
- data/lib/wgit/url.rb +170 -42
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +8 -2
- metadata +54 -32
data/bin/wgit
ADDED
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+
+require 'wgit'
+
+# Eval .wgit.rb file (if it exists somewhere).
+def eval_wgit(filepath = nil)
+  puts 'Searching for .wgit.rb file in local and home directories...'
+
+  [filepath, Dir.pwd, Dir.home].each do |dir|
+    path = "#{dir}/.wgit.rb"
+    next unless File.exist?(path)
+
+    puts "Eval'ing #{path}"
+    puts 'Call `eval_wgit` after changes to re-eval the file'
+    eval(File.read(path))
+
+    break
+  end
+
+  nil
+end
+
+eval_wgit
+puts "\n#{Wgit.version_str}\n\n"
+
+# Use Pry if installed or fall back to IRB.
+begin
+  require 'pry'
+  klass = Pry
+rescue LoadError
+  require 'irb'
+  klass = IRB
+
+  puts "Starting IRB because Pry isn't installed."
+end
+
+klass.start
+
+puts 'Interactive session complete.'
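
The executable above simply eval's the first `.wgit.rb` file it finds (passed path, then the current directory, then the home directory) before dropping into Pry/IRB, so that file can hold any setup code. A minimal, hypothetical example of such a file follows; the helper name, global variable and options are illustrative, not part of the gem, and `wgit` itself is already required by the executable:

# .wgit.rb (illustrative)
require 'wgit/core_ext' # Optional; adds String#to_url etc.

# A crawler instance and a helper made available inside the interactive session.
$crawler = Wgit::Crawler.new(redirect_limit: 3)

def crawl_titles(*urls)
  urls = urls.map { |url| Wgit::Url.new(url) }
  $crawler.crawl(*urls) { |doc| puts doc.title unless doc.empty? }
end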
data/lib/wgit.rb
CHANGED
@@ -6,9 +6,11 @@ require_relative 'wgit/assertable'
 require_relative 'wgit/utils'
 require_relative 'wgit/url'
 require_relative 'wgit/document'
-require_relative 'wgit/document_extensions'
+require_relative 'wgit/document_extractors'
 require_relative 'wgit/crawler'
 require_relative 'wgit/database/model'
 require_relative 'wgit/database/database'
 require_relative 'wgit/indexer'
+require_relative 'wgit/dsl'
+require_relative 'wgit/base'
 # require_relative 'wgit/core_ext' - Must be explicitly required.
data/lib/wgit/assertable.rb
CHANGED
@@ -6,7 +6,7 @@ module Wgit
     # Default type fail message.
     DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
     # Wrong method message.
-
+    NON_ENUMERABLE_MSG = 'Expected an Enumerable responding to #each, not: %s'
     # Default duck fail message.
     DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
     # Default required keys message.
@@ -42,7 +42,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given arr on successful assertion.
     def assert_arr_types(arr, type_or_types, msg = nil)
-      raise
+      raise format(NON_ENUMERABLE_MSG, arr.class) unless arr.respond_to?(:each)

       arr.each { |obj| assert_types(obj, type_or_types, msg) }
     end
@@ -56,7 +56,7 @@ present: %s"
     # @raise [StandardError] If the assertion fails.
     # @return [Object] The given obj_or_objs on successful assertion.
     def assert_respond_to(obj_or_objs, methods, msg = nil)
-      methods =
+      methods = *methods

       if obj_or_objs.respond_to?(:each)
         obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
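
The tightened assertion above now rejects anything that doesn't respond to #each with the new NON_ENUMERABLE_MSG. A quick illustrative snippet of that behaviour (not part of the diff):

require 'wgit'
include Wgit::Assertable

assert_arr_types([1, 2, 3], Integer) # => [1, 2, 3]
assert_arr_types(123, Integer)       # raises: "Expected an Enumerable responding to #each, not: Integer"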
data/lib/wgit/base.rb
ADDED
@@ -0,0 +1,30 @@
+module Wgit
+  # Class to inherit from, as an alternative form of using the `Wgit::DSL`.
+  # All subclasses must define a `#parse(doc, &block)` method.
+  class Base
+    extend Wgit::DSL
+
+    # Runs the crawl/index passing each crawled `Wgit::Document` and the given
+    # block to the subclass's `#parse` method.
+    def self.run(&block)
+      obj = new
+      unless obj.respond_to?(:parse)
+        raise "#{obj.class} must respond_to? #parse(doc, &block)"
+      end
+
+      crawl_method = @method || :crawl
+      send(crawl_method) { |doc| obj.parse(doc, &block) }
+
+      obj
+    end
+
+    # Sets the crawl/index method to call when `Base.run` is called.
+    # The mode method must match one defined in the `Wgit::Crawler` or
+    # `Wgit::Indexer` class.
+    #
+    # @param method [Symbol] The crawl/index method to call.
+    def self.mode(method)
+      @method = method
+    end
+  end
+end
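
A hypothetical subclass showing how the new Wgit::Base is meant to be used. `mode` and `run` come straight from the diff above; `start` is assumed to be one of the Wgit::DSL class methods (dsl.rb, +324 lines, is not shown in this section), and the URL is illustrative:

require 'wgit'

class QuoteScraper < Wgit::Base
  mode  :crawl_site                    # Calls the matching Crawler/Indexer method.
  start 'http://quotes.toscrape.com/'  # Assumed DSL method; sets the URL(s) to crawl.

  # Called once per crawled Wgit::Document by Base.run.
  def parse(doc, &_block)
    puts doc.url unless doc.empty?
  end
end

QuoteScraper.run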
data/lib/wgit/core_ext.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -5,26 +5,55 @@ require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
+require 'set'
+require 'benchmark'
 require 'typhoeus'
+require 'ferrum'

 module Wgit
-  # The Crawler class provides a means of crawling web based HTTP Wgit::Url
-  # serialising their HTML into Wgit::Document instances. This is the
-  # class
+  # The Crawler class provides a means of crawling web based HTTP `Wgit::Url`s,
+  # and serialising their HTML into `Wgit::Document` instances. This is the
+  # only Wgit class containing network logic (HTTP request/response handling).
   class Crawler
     include Assertable

+    # Set of supported file extensions for Wgit::Crawler#crawl_site.
+    @supported_file_extensions = Set.new(
+      %w[asp aspx cfm cgi htm html htmlx jsp php]
+    )
+
+    class << self
+      # The URL file extensions (from `<a>` hrefs) which will be crawled by
+      # `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
+      # doesn't keep the crawl of the site going. All URL's without a file
+      # extension will be crawled, because they're assumed to be HTML.
+      # The `#crawl` method will crawl anything since it's given the URL(s).
+      # You can add your own site's URL file extension e.g.
+      # `Wgit::Crawler.supported_file_extensions << 'html5'` etc.
+      attr_reader :supported_file_extensions
+    end
+
     # The amount of allowed redirects before raising an error. Set to 0 to
-    # disable redirects completely
+    # disable redirects completely; or you can pass `follow_redirects: false`
+    # to any Wgit::Crawler.crawl_* method.
     attr_accessor :redirect_limit

     # The maximum amount of time (in seconds) a crawl request has to complete
     # before raising an error. Set to 0 to disable time outs completely.
-    attr_accessor :
+    attr_accessor :timeout
+
+    # Whether or not to UTF-8 encode the response body once crawled. Set to
+    # false if crawling more than just HTML e.g. images.
+    attr_accessor :encode

-    # Whether or not to
-    #
-    attr_accessor :
+    # Whether or not to parse the Javascript of the crawled document.
+    # Parsing requires Chrome/Chromium to be installed and in $PATH.
+    attr_accessor :parse_javascript
+
+    # The delay between checks in a page's HTML size. When the page has stopped
+    # "growing", the Javascript has finished dynamically updating the DOM.
+    # The value should balance between a good UX and enough JS parse time.
+    attr_accessor :parse_javascript_delay

     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
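
The new class-level reader above is documented as extensible. Its own example, shown here as a runnable snippet (the extra extension is illustrative):

require 'wgit'

# Allow #crawl_site to follow links ending in a custom extension.
Wgit::Crawler.supported_file_extensions << 'html5'
Wgit::Crawler.supported_file_extensions.include?('html5') # => true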
@@ -33,21 +62,32 @@ module Wgit
     #
     # @param redirect_limit [Integer] The amount of allowed redirects before
     # raising an error. Set to 0 to disable redirects completely.
-    # @param
+    # @param timeout [Integer, Float] The maximum amount of time (in seconds)
     # a crawl request has to complete before raising an error. Set to 0 to
     # disable time outs completely.
-    # @param
-    # crawled. Set to false if crawling more than just HTML e.g. images
-
-
-
-
+    # @param encode [Boolean] Whether or not to UTF-8 encode the response body
+    # once crawled. Set to false if crawling more than just HTML e.g. images.
+    # @param parse_javascript [Boolean] Whether or not to parse the Javascript
+    # of the crawled document. Parsing requires Chrome/Chromium to be
+    # installed and in $PATH.
+    def initialize(redirect_limit: 5, timeout: 5, encode: true,
+                   parse_javascript: false, parse_javascript_delay: 1)
+      @redirect_limit = redirect_limit
+      @timeout = timeout
+      @encode = encode
+      @parse_javascript = parse_javascript
+      @parse_javascript_delay = parse_javascript_delay
     end

     # Crawls an entire website's HTML pages by recursively going through
-    # its internal
-    #
-    #
+    # its internal `<a>` links; this can be overridden with `follow: xpath`.
+    # Each crawled Document is yielded to a block. Use `doc.empty?` to
+    # determine if the crawled link was successful / is valid.
+    #
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
+    # Note that each path must NOT start with a slash; the only exception being
+    # a `/` on its own with no other characters, referring to the index page.
     #
     # Only redirects to the same host are followed. For example, the Url
     # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
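
For reference, constructing a crawler with the new keyword arguments introduced above (the values mirror the defaults shown in the diff):

require 'wgit'

crawler = Wgit::Crawler.new(
  redirect_limit: 5,
  timeout: 5,
  encode: true,
  parse_javascript: false,
  parse_javascript_delay: 1
)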
@@ -60,69 +100,79 @@ module Wgit
     # @param url [Wgit::Url] The base URL of the website to be crawled.
     # It is recommended that this URL be the index page of the site to give a
     # greater chance of finding all pages within that site/host.
-    # @param
-    #
-    #
-    #
+    # @param follow [String] The xpath extracting links to be followed during
+    # the crawl. This changes how a site is crawled. Only links pointing to
+    # the site domain are allowed. The `:default` is any `<a>` href returning
+    # HTML.
+    # @param allow_paths [String, Array<String>] Filters the `follow:` links by
+    # selecting them if their path `File.fnmatch?` one of allow_paths.
+    # @param disallow_paths [String, Array<String>] Filters the `follow` links
+    # by rejecting them if their path `File.fnmatch?` one of disallow_paths.
     # @yield [doc] Given each crawled page (Wgit::Document) of the site.
     # A block is the only way to interact with each crawled Document.
+    # Use `doc.empty?` to determine if the page is valid.
     # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
-    # from all of the site's pages or nil if the url could not be
+    # from all of the site's pages or nil if the given url could not be
     # crawled successfully.
-    def crawl_site(
+    def crawl_site(
+      url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
+    )
       doc = crawl_url(url, &block)
       return nil if doc.nil?

-
-
-
+      link_opts = {
+        xpath: follow,
+        allow_paths: allow_paths,
+        disallow_paths: disallow_paths
+      }
       alt_url = url.end_with?('/') ? url.chop : url + '/'
-      crawled = [url, alt_url]
-      externals = doc.external_links
-      internals = get_internal_links(doc, link_opts)

-
+      crawled = Set.new([url, alt_url])
+      externals = Set.new(doc.external_links)
+      internals = Set.new(next_internal_links(doc, **link_opts))

-
-      crawled.uniq!
-      internals.uniq!
+      return externals.to_a if internals.empty?

+      loop do
         links = internals - crawled
         break if links.empty?

         links.each do |link|
           orig_link = link.dup
-          doc = crawl_url(link,
+          doc = crawl_url(link, follow_redirects: :host, &block)

-          crawled
+          crawled += [orig_link, link] # Push both links in case of redirects.
           next if doc.nil?

-          internals
-          externals
+          internals += next_internal_links(doc, **link_opts)
+          externals += doc.external_links
         end
       end

-      externals.
+      externals.to_a
     end

     # Crawls one or more individual urls using Wgit::Crawler#crawl_url
     # underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
     #
     # @param urls [*Wgit::Url] The Url's to crawl.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    # redirects. Pass a Symbol to limit where the redirect is allowed to go
+    # e.g. :host only allows redirects within the same host. Choose from
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    # This value will be used for all urls crawled.
     # @yield [doc] Given each crawled page (Wgit::Document); this is the only
-    # way to interact with them.
+    # way to interact with them. Use `doc.empty?` to determine if the page
+    # is valid.
     # @raise [StandardError] If no urls are provided.
     # @return [Wgit::Document] The last Document crawled.
-    def crawl_urls(*urls,
+    def crawl_urls(*urls, follow_redirects: true, &block)
       raise 'You must provide at least one Url' if urls.empty?

-      opts = {
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      }
+      opts = { follow_redirects: follow_redirects }
       doc = nil

-      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
+      Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }

       doc
     end
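
A short usage sketch of the reworked #crawl_site signature, based on the docs above (the site URL and glob are illustrative; note the path has no leading slash):

require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://wiki.example.org/') # Illustrative URL.

# Only crawl pages whose path matches the glob.
externals = crawler.crawl_site(url, allow_paths: 'wiki/*') do |doc|
  puts doc.url unless doc.empty?
end

puts "#{externals&.size} external links collected"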
@@ -130,34 +180,25 @@ module Wgit
     # Crawl the url returning the response Wgit::Document or nil, if an error
     # occurs.
     #
-    # @param url [Wgit::Url] The Url to crawl; which will
-    #
-    #
-    #
-    #
-    #
-    # an absolute redirect is determined to be internal or not. Must be
-    # absolute and contain a protocol prefix. For example, a `host:` of
-    # 'http://www.example.com' will only allow redirects for Url's with a
-    # `to_host` value of 'www.example.com'.
+    # @param url [Wgit::Url] The Url to crawl; which will be modified in the
+    # event of a redirect.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    # redirects. Pass a Symbol to limit where the redirect is allowed to go
+    # e.g. :host only allows redirects within the same host. Choose from
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     # crawl was successful or not. Therefore, Document#url etc. can be used.
+    # Use `doc.empty?` to determine if the page is valid.
     # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
     # crawl was unsuccessful.
-    def crawl_url(url,
+    def crawl_url(url, follow_redirects: true)
       # A String url isn't allowed because it's passed by value not reference,
       # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
       assert_type(url, Wgit::Url)
-      raise 'host cannot be nil if follow_external_redirects is false' \
-        if !follow_external_redirects && host.nil?

-      html = fetch(
-
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      )
+      html = fetch(url, follow_redirects: follow_redirects)
+      doc = Wgit::Document.new(url, html, encode: @encode)

-      doc = Wgit::Document.new(url, html, encode_html: @encode_html)
       yield(doc) if block_given?

       doc.empty? ? nil : doc
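
And a sketch of the new follow_redirects: behaviour on #crawl_url (the URL is illustrative). Because a disallowed redirect raises inside #fetch, which rescues and returns nil, the crawl simply yields an empty Document and returns nil rather than following the redirect:

require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('http://example.com/') # Illustrative URL.

doc = crawler.crawl_url(url, follow_redirects: :host)
puts doc.nil? ? 'Crawl failed or redirect left the host' : doc.url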
@@ -165,31 +206,28 @@ module Wgit

     protected

-    # Returns the
+    # Returns the URL's HTML String or nil. Handles any errors that arise
     # and sets the @last_response. Errors or any HTTP response that doesn't
     # return a HTML body will be ignored, returning nil.
     #
+    # If @parse_javascript is true, then the final resolved URL will be browsed
+    # to and Javascript parsed allowing for dynamic HTML generation.
+    #
     # @param url [Wgit::Url] The URL to fetch. This Url object is passed by
     # reference and gets modified as a result of the fetch/crawl.
-    # @param
-    #
-    #
-    #
-    #
-    # absolute and contain a protocol prefix. For example, a `host:` of
-    # 'http://www.example.com' will only allow redirects for Urls with a
-    # `to_host` value of 'www.example.com'.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    # redirects. Pass a Symbol to limit where the redirect is allowed to go
+    # e.g. :host only allows redirects within the same host. Choose from
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
+    # @raise [StandardError] If url isn't valid and absolute.
     # @return [String, nil] The crawled HTML or nil if the crawl was
     # unsuccessful.
-    def fetch(url,
+    def fetch(url, follow_redirects: true)
       response = Wgit::Response.new
+      raise "Invalid url: #{url}" if url.invalid?

-      resolve(
-
-        response,
-        follow_external_redirects: follow_external_redirects,
-        host: host
-      )
+      resolve(url, response, follow_redirects: follow_redirects)
+      get_browser_response(url, response) if @parse_javascript

       response.body_or_nil
     rescue StandardError => e
@@ -209,18 +247,17 @@ module Wgit
     # @param url [Wgit::Url] The URL to GET and resolve.
     # @param response [Wgit::Response] The response to enrich. Modifies by
     # reference.
-    # @param
-    #
-    #
-    #
-    # an absolute redirect is determined to be internal or not. Must be
-    # absolute and contain a protocol prefix. For example, a `host:` of
-    # 'http://www.example.com' will only allow redirects for Urls with a
-    # `to_host` value of 'www.example.com'.
+    # @param follow_redirects [Boolean, Symbol] Whether or not to follow
+    # redirects. Pass a Symbol to limit where the redirect is allowed to go
+    # e.g. :host only allows redirects within the same host. Choose from
+    # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
-    def resolve(url, response,
+    def resolve(url, response, follow_redirects: true)
+      origin = url.to_url.to_origin # Recorded before any redirects.
+      follow_redirects, within = redirect?(follow_redirects)
+
       loop do
-
+        get_http_response(url, response)
         break unless response.redirect?

         # Handle response 'Location' header.
@@ -229,17 +266,18 @@ module Wgit

         yield(url, response, location) if block_given?

-        # Validate redirect.
-
-
-
+        # Validate if the redirect is allowed.
+        raise "Redirect not allowed: #{location}" unless follow_redirects
+
+        if within && !location.relative?(within => origin)
+          raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
         end

         raise "Too many redirects, exceeded: #{@redirect_limit}" \
           if response.redirect_count >= @redirect_limit

         # Process the location to be crawled next.
-        location = url.
+        location = url.to_origin.concat(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -252,7 +290,7 @@ module Wgit
     # reference.
     # @raise [StandardError] If a response can't be obtained.
     # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
-    def
+    def get_http_response(url, response)
       # Perform a HTTP GET request.
       orig_url = url.to_s
       url = url.normalize if url.respond_to?(:normalize)
@@ -268,18 +306,41 @@ module Wgit
       response.ip_address = http_response.primary_ip
       response.add_total_time(http_response.total_time)

-      # Log
-
-      log_status = (response.status || 0)
-      log_total_time = response.total_time.truncate(3)
+      # Log the request/response details.
+      log_net(:http, response, http_response.total_time)

-
-
-
-
+      # Handle a failed response.
+      raise "No response (within timeout: #{@timeout} second(s))" \
+        if response.failure?
+    end
+
+    # Makes a browser request and enriches the given Wgit::Response from it.
+    #
+    # @param url [String] The url to browse to. Will call url#normalize if
+    # possible.
+    # @param response [Wgit::Response] The response to enrich. Modifies by
+    # reference.
+    # @raise [StandardError] If a response can't be obtained.
+    # @return [Wgit::Response] The enriched HTTP Wgit::Response object.
+    def get_browser_response(url, response)
+      url = url.normalize if url.respond_to?(:normalize)
+      browser = nil
+
+      crawl_time = Benchmark.measure { browser = browser_get(url) }.real
+      yield browser if block_given?
+
+      # Enrich the given Wgit::Response object (on top of Typhoeus response).
+      response.adapter_response = browser.network.response
+      response.status = browser.network.response.status
+      response.headers = browser.network.response.headers
+      response.body = browser.body
+      response.add_total_time(crawl_time)
+
+      # Log the request/response details.
+      log_net(:browser, response, crawl_time)

       # Handle a failed response.
-      raise "No response (within timeout: #{@
+      raise "No browser response (within timeout: #{@timeout} second(s))" \
         if response.failure?
     end

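
To exercise the new browser-backed path above, the crawler must be built with parse_javascript: true; Chrome or Chromium needs to be installed and on $PATH, and the URL below is illustrative:

require 'wgit'

crawler = Wgit::Crawler.new(parse_javascript: true, parse_javascript_delay: 2)
url     = Wgit::Url.new('https://spa.example.com/') # Illustrative SPA URL.

doc = crawler.crawl_url(url)
puts doc&.html&.size # HTML after the page's Javascript has settled.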
@@ -290,7 +351,7 @@ module Wgit
     def http_get(url)
       opts = {
         followlocation: false,
-        timeout: @
+        timeout: @timeout,
         accept_encoding: 'gzip',
         headers: {
           'User-Agent' => "wgit/#{Wgit::VERSION}",
@@ -299,34 +360,58 @@ module Wgit
       }

       # See https://rubydoc.info/gems/typhoeus for more info.
-      Typhoeus.get(url, opts)
+      Typhoeus.get(url, **opts)
+    end
+
+    # Performs a HTTP GET request in a web browser and parses the response JS
+    # before returning the HTML body of the fully rendered webpage. This allows
+    # Javascript (SPA apps etc.) to generate HTML dynamically.
+    #
+    # @param url [String] The url to browse to.
+    # @return [Ferrum::Browser] The browser response object.
+    def browser_get(url)
+      @browser ||= Ferrum::Browser.new(timeout: @timeout, process_timeout: 10)
+      @browser.goto(url)
+
+      # Wait for the page's JS to finish dynamically manipulating the DOM.
+      html = @browser.body
+      loop do
+        sleep @parse_javascript_delay
+        break if html.size == @browser.body.size
+
+        html = @browser.body
+      end
+
+      @browser
     end

     # Returns a doc's internal HTML page links in absolute form; used when
-    # crawling a site.
-    #
+    # crawling a site. By default, any `<a>` href returning HTML is returned;
+    # override this with `xpath:` if desired.
     #
-    #
-    #
-    #
-    # files containing <a> links can keep the crawl going beyond the base URL.
+    # Use the allow and disallow paths params to partially and selectively
+    # crawl a site; the glob syntax is supported e.g. `'wiki/\*'` etc. Note
+    # that each path should NOT start with a slash.
     #
     # @param doc [Wgit::Document] The document from which to extract it's
-    # internal page links.
+    # internal (absolute) page links.
+    # @param xpath [String] The xpath selecting links to be returned. Only
+    # links pointing to the doc.url domain are allowed. The :default is any
+    # <a> href returning HTML. The allow/disallow paths will be applied to
+    # the returned value.
     # @param allow_paths [String, Array<String>] Filters links by selecting
-    # them
+    # them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters links by rejecting
-    # them if their path
+    # them if their path `File.fnmatch?` one of disallow_paths.
     # @return [Array<Wgit::Url>] The internal page links from doc.
-    def
-
-
-
-
-
-
-
-    end
+    def next_internal_links(
+      doc, xpath: :default, allow_paths: nil, disallow_paths: nil
+    )
+      links = if xpath && xpath != :default
+                follow_xpath(doc, xpath)
+              else
+                follow_default(doc)
+              end

       return links if allow_paths.nil? && disallow_paths.nil?

@@ -335,40 +420,82 @@ module Wgit

     private

+    # Returns the next links used to continue crawling a site. The xpath value
+    # is used to obtain the links. Any valid URL Strings will be converted into
+    # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
+    # pointing to the site domain will raise an error.
+    def follow_xpath(doc, xpath)
+      links = doc.send(:extract_from_html, xpath, singleton: false) do |urls|
+        urls
+          .map { |url| Wgit::Url.parse?(url)&.make_absolute(doc) }
+          .compact
+      end
+
+      if links.any? { |link| link.to_domain != doc.url.to_domain }
+        raise 'The links to follow must be within the site domain'
+      end
+
+      links
+    end
+
+    # Returns the default set of links used to continue crawling a site.
+    # By default, any <a> href returning HTML and pointing to the same domain
+    # will get returned.
+    def follow_default(doc)
+      doc
+        .internal_absolute_links
+        .map(&:omit_fragment) # Because fragments don't alter content.
+        .uniq
+        .select do |link| # Whitelist only HTML content.
+          ext = link.to_extension
+          if ext
+            Wgit::Crawler.supported_file_extensions.include?(ext.downcase)
+          else
+            true # URLs without an extension are assumed HTML.
+          end
+        end
+    end
+
     # Validate and filter by the given URL paths.
     def process_paths(links, allow_paths, disallow_paths)
-
-
-
-      if allow_paths # White list.
-        filter_method = :select
-        paths = allow_paths
-      else # Black list.
-        filter_method = :reject
-        paths = disallow_paths
+      if allow_paths
+        paths = validate_paths(allow_paths)
+        filter_links(links, :select!, paths)
       end

-
-
-
-
-        .uniq
-        .map { |path| Wgit::Url.new(path).to_path }
+      if disallow_paths
+        paths = validate_paths(disallow_paths)
+        filter_links(links, :reject!, paths)
+      end

+      links
+    end
+
+    # Validate the paths are suitable for filtering.
+    def validate_paths(paths)
+      paths = *paths
+      raise 'The provided paths must all be Strings' \
+        unless paths.all? { |path| path.is_a?(String) }
+
+      Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?

-
+      paths.map do |path|
+        path = Wgit::Url.parse(path)
+        path.index? ? path : path.omit_slashes
+      end
     end

-    # Filters links by selecting
-
+    # Filters links by selecting/rejecting them based on their path.
+    # Uses File.fnmatch? so that globbing is supported.
+    def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
-
-
+        # Turn http://example.com into / meaning index.
+        link = link.to_endpoint.index? ? '/' : link.omit_base

         match = false
-        paths.each do |
-          match =
+        paths.each do |pattern|
+          match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
          break if match
        end

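
For clarity, the globbing used by the path filters above is plain File.fnmatch? against the link's path relative to the site base (extglob enabled), e.g.:

File.fnmatch?('wiki/*', 'wiki/Crawling-basics', File::FNM_EXTGLOB) # => true
File.fnmatch?('wiki/*', 'blog/2021/crawling', File::FNM_EXTGLOB)   # => false
File.fnmatch?('/', '/', File::FNM_EXTGLOB)                         # => true (index page)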
@@ -376,6 +503,35 @@ module Wgit
       end
     end

+    # Returns whether or not to follow redirects, and within what context e.g.
+    # :host, :domain etc.
+    def redirect?(follow_redirects)
+      return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
+
+      unless [true, false].include?(follow_redirects)
+        raise "follow_redirects: must be a Boolean or Symbol, not: \
+#{follow_redirects}"
+      end
+
+      [follow_redirects, nil]
+    end
+
+    # Log (at debug level) the network request/response details.
+    def log_net(client, response, duration)
+      resp_template = "[#{client}] Response: %s (%s bytes in %s seconds)"
+      log_status = (response.status || 0)
+      log_total_time = (duration || 0.0).truncate(3)
+
+      # The browsers request URL is the same so ignore it.
+      if client.to_sym == :http
+        Wgit.logger.debug("[#{client}] Request: #{response.url}")
+      end
+
+      Wgit.logger.debug(
+        format(resp_template, log_status, response.size, log_total_time)
+      )
+    end
+
     alias crawl crawl_urls
     alias crawl_pages crawl_urls
     alias crawl_page crawl_url