wgit 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/CHANGELOG.md +174 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +21 -0
- data/LICENSE.txt +21 -0
- data/README.md +399 -0
- data/lib/wgit/crawler.rb +135 -119
- data/lib/wgit/document.rb +45 -67
- data/lib/wgit/document_extensions.rb +1 -1
- data/lib/wgit/response.rb +6 -6
- data/lib/wgit/url.rb +23 -14
- data/lib/wgit/utils.rb +2 -2
- data/lib/wgit/version.rb +1 -1
- metadata +10 -5
data/lib/wgit/crawler.rb
CHANGED
@@ -5,6 +5,7 @@ require_relative 'document'
|
|
5
5
|
require_relative 'utils'
|
6
6
|
require_relative 'assertable'
|
7
7
|
require_relative 'response'
|
8
|
+
require 'set'
|
8
9
|
require 'typhoeus'
|
9
10
|
|
10
11
|
module Wgit
|
@@ -14,17 +15,26 @@ module Wgit
|
|
14
15
|
class Crawler
|
15
16
|
include Assertable
|
16
17
|
|
18
|
+
# The URL file extensions (from `<a>` hrefs) which will be crawled by
|
19
|
+
# `#crawl_site`. The idea is to omit anything that isn't HTML and therefore
|
20
|
+
# doesn't keep the crawl of the site going. All URLs without a file
|
21
|
+
# extension will be crawled, because they're assumed to be HTML.
|
22
|
+
SUPPORTED_FILE_EXTENSIONS = Set.new(%w[
|
23
|
+
asp aspx cfm cgi htm html htmlx jsp php
|
24
|
+
])
|
25
|
+
|
17
26
|
# The amount of allowed redirects before raising an error. Set to 0 to
|
18
|
-
# disable redirects completely
|
27
|
+
# disable redirects completely; or you can pass `follow_redirects: false`
|
28
|
+
# to any Wgit::Crawler.crawl_* method.
|
19
29
|
attr_accessor :redirect_limit
|
20
30
|
|
21
31
|
# The maximum amount of time (in seconds) a crawl request has to complete
|
22
32
|
# before raising an error. Set to 0 to disable time outs completely.
|
23
33
|
attr_accessor :time_out
|
24
34
|
|
25
|
-
# Whether or not to UTF-8 encode the
|
26
|
-
# crawling more than just HTML e.g. images
|
27
|
-
attr_accessor :
|
35
|
+
# Whether or not to UTF-8 encode the response body once crawled. Set to
|
36
|
+
# false if crawling more than just HTML e.g. images.
|
37
|
+
attr_accessor :encode
|
28
38
|
|
29
39
|
# The Wgit::Response of the most recently crawled URL.
|
30
40
|
attr_reader :last_response
|
@@ -36,18 +46,22 @@ module Wgit
|
|
36
46
|
# @param time_out [Integer, Float] The maximum amount of time (in seconds)
|
37
47
|
# a crawl request has to complete before raising an error. Set to 0 to
|
38
48
|
# disable time outs completely.
|
39
|
-
# @param
|
40
|
-
# crawled. Set to false if crawling more than just HTML e.g. images
|
41
|
-
def initialize(redirect_limit: 5, time_out: 5,
|
49
|
+
# @param encode [Boolean] Whether or not to UTF-8 encode the response body
|
50
|
+
# once crawled. Set to false if crawling more than just HTML e.g. images.
|
51
|
+
def initialize(redirect_limit: 5, time_out: 5, encode: true)
|
42
52
|
@redirect_limit = redirect_limit
|
43
53
|
@time_out = time_out
|
44
|
-
@
|
54
|
+
@encode = encode
|
45
55
|
end
|
46
56
|
|
47
57
|
# Crawls an entire website's HTML pages by recursively going through
|
48
|
-
# its internal
|
49
|
-
#
|
50
|
-
#
|
58
|
+
# its internal `<a>` links. Each crawled Document is yielded to a block.
|
59
|
+
# Use `doc.empty?` to determine if the crawled link is valid.
|
60
|
+
#
|
61
|
+
# Use the allow and disallow paths params to partially and selectively
|
62
|
+
# crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
|
63
|
+
# Note that each path must NOT start with a slash; the only exception being
|
64
|
+
# a `/` on its own with no other characters, referring to the index page.
|
51
65
|
#
|
52
66
|
# Only redirects to the same host are followed. For example, the Url
|
53
67
|
# 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
|
@@ -61,65 +75,64 @@ module Wgit
|
|
61
75
|
# It is recommended that this URL be the index page of the site to give a
|
62
76
|
# greater chance of finding all pages within that site/host.
|
63
77
|
# @param allow_paths [String, Array<String>] Filters links by selecting
|
64
|
-
# them
|
78
|
+
# them if their path `File.fnmatch?` one of allow_paths.
|
65
79
|
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
66
|
-
# them if their path
|
80
|
+
# them if their path `File.fnmatch?` one of disallow_paths.
|
67
81
|
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
68
82
|
# A block is the only way to interact with each crawled Document.
|
83
|
+
# Use `doc.empty?` to determine if the page is valid.
|
69
84
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
70
|
-
# from all of the site's pages or nil if the url could not be
|
85
|
+
# from all of the site's pages or nil if the given url could not be
|
71
86
|
# crawled successfully.
|
72
87
|
def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
|
73
88
|
doc = crawl_url(url, &block)
|
74
89
|
return nil if doc.nil?
|
75
90
|
|
76
|
-
|
77
|
-
link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
78
|
-
|
91
|
+
path_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
79
92
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
80
|
-
crawled = [url, alt_url]
|
81
|
-
externals = doc.external_links
|
82
|
-
internals = get_internal_links(doc, link_opts)
|
83
93
|
|
84
|
-
|
94
|
+
crawled = Set.new([url, alt_url])
|
95
|
+
externals = Set.new(doc.external_links)
|
96
|
+
internals = Set.new(get_internal_links(doc, path_opts))
|
85
97
|
|
86
|
-
|
87
|
-
crawled.uniq!
|
88
|
-
internals.uniq!
|
98
|
+
return externals.to_a if internals.empty?
|
89
99
|
|
100
|
+
loop do
|
90
101
|
links = internals - crawled
|
91
102
|
break if links.empty?
|
92
103
|
|
93
104
|
links.each do |link|
|
94
105
|
orig_link = link.dup
|
95
|
-
doc = crawl_url(link,
|
106
|
+
doc = crawl_url(link, follow_redirects: :host, &block)
|
96
107
|
|
97
|
-
crawled
|
108
|
+
crawled += [orig_link, link] # Push both links in case of redirects.
|
98
109
|
next if doc.nil?
|
99
110
|
|
100
|
-
internals
|
101
|
-
externals
|
111
|
+
internals += get_internal_links(doc, path_opts)
|
112
|
+
externals += doc.external_links
|
102
113
|
end
|
103
114
|
end
|
104
115
|
|
105
|
-
externals.
|
116
|
+
externals.to_a
|
106
117
|
end
|
107
118
|
|
108
119
|
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
109
120
|
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
|
110
121
|
#
|
111
122
|
# @param urls [*Wgit::Url] The Urls to crawl.
|
123
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
124
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
125
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
126
|
+
# :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
127
|
+
# This value will be used for all urls crawled.
|
112
128
|
# @yield [doc] Given each crawled page (Wgit::Document); this is the only
|
113
129
|
# way to interact with them.
|
114
130
|
# @raise [StandardError] If no urls are provided.
|
115
131
|
# @return [Wgit::Document] The last Document crawled.
|
116
|
-
def crawl_urls(*urls,
|
132
|
+
def crawl_urls(*urls, follow_redirects: true, &block)
|
117
133
|
raise 'You must provide at least one Url' if urls.empty?
|
118
134
|
|
119
|
-
opts = {
|
120
|
-
follow_external_redirects: follow_external_redirects,
|
121
|
-
host: host
|
122
|
-
}
|
135
|
+
opts = { follow_redirects: follow_redirects }
|
123
136
|
doc = nil
|
124
137
|
|
125
138
|
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
|
@@ -131,33 +144,22 @@ module Wgit
|
|
131
144
|
# occurs.
|
132
145
|
#
|
133
146
|
# @param url [Wgit::Url] The Url to crawl; which will likely be modified.
|
134
|
-
# @param
|
135
|
-
#
|
136
|
-
#
|
137
|
-
#
|
138
|
-
# @param host [Wgit::Url, String] Specify the host by which
|
139
|
-
# an absolute redirect is determined to be internal or not. Must be
|
140
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
141
|
-
# 'http://www.example.com' will only allow redirects for Url's with a
|
142
|
-
# `to_host` value of 'www.example.com'.
|
147
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
148
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
149
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
150
|
+
# :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
143
151
|
# @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
|
144
152
|
# crawl was successful or not. Therefore, Document#url etc. can be used.
|
145
153
|
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
146
154
|
# crawl was unsuccessful.
|
147
|
-
def crawl_url(url,
|
155
|
+
def crawl_url(url, follow_redirects: true)
|
148
156
|
# A String url isn't allowed because it's passed by value not reference,
|
149
157
|
# meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
|
150
158
|
assert_type(url, Wgit::Url)
|
151
|
-
raise 'host cannot be nil if follow_external_redirects is false' \
|
152
|
-
if !follow_external_redirects && host.nil?
|
153
159
|
|
154
|
-
html = fetch(
|
155
|
-
|
156
|
-
follow_external_redirects: follow_external_redirects,
|
157
|
-
host: host
|
158
|
-
)
|
160
|
+
html = fetch(url, follow_redirects: follow_redirects)
|
161
|
+
doc = Wgit::Document.new(url, html, encode: @encode)
|
159
162
|
|
160
|
-
doc = Wgit::Document.new(url, html, encode_html: @encode_html)
|
161
163
|
yield(doc) if block_given?
|
162
164
|
|
163
165
|
doc.empty? ? nil : doc
|
@@ -171,26 +173,18 @@ module Wgit
|
|
171
173
|
#
|
172
174
|
# @param url [Wgit::Url] The URL to fetch. This Url object is passed by
|
173
175
|
# reference and gets modified as a result of the fetch/crawl.
|
174
|
-
# @param
|
175
|
-
#
|
176
|
-
#
|
177
|
-
#
|
178
|
-
#
|
179
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
180
|
-
# 'http://www.example.com' will only allow redirects for Urls with a
|
181
|
-
# `to_host` value of 'www.example.com'.
|
176
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
177
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
178
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
179
|
+
# :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
180
|
+
# @raise [StandardError] If url isn't valid and absolute.
|
182
181
|
# @return [String, nil] The crawled HTML or nil if the crawl was
|
183
182
|
# unsuccessful.
|
184
|
-
def fetch(url,
|
183
|
+
def fetch(url, follow_redirects: true)
|
185
184
|
response = Wgit::Response.new
|
185
|
+
raise "Invalid url: #{url}" if url.invalid?
|
186
186
|
|
187
|
-
resolve(
|
188
|
-
url,
|
189
|
-
response,
|
190
|
-
follow_external_redirects: follow_external_redirects,
|
191
|
-
host: host
|
192
|
-
)
|
193
|
-
|
187
|
+
resolve(url, response, follow_redirects: follow_redirects)
|
194
188
|
response.body_or_nil
|
195
189
|
rescue StandardError => e
|
196
190
|
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
|
@@ -209,16 +203,15 @@ module Wgit
|
|
209
203
|
# @param url [Wgit::Url] The URL to GET and resolve.
|
210
204
|
# @param response [Wgit::Response] The response to enrich. Modifies by
|
211
205
|
# reference.
|
212
|
-
# @param
|
213
|
-
#
|
214
|
-
#
|
215
|
-
#
|
216
|
-
# an absolute redirect is determined to be internal or not. Must be
|
217
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
218
|
-
# 'http://www.example.com' will only allow redirects for Urls with a
|
219
|
-
# `to_host` value of 'www.example.com'.
|
206
|
+
# @param follow_redirects [Boolean, Symbol] Whether or not to follow
|
207
|
+
# redirects. Pass a Symbol to limit where the redirect is allowed to go
|
208
|
+
# e.g. :host only allows redirects within the same host. Choose from
|
209
|
+
# :base, :host, :domain or :brand. See Wgit::Url#relative? opts param.
|
220
210
|
# @raise [StandardError] If a redirect isn't allowed etc.
|
221
|
-
def resolve(url, response,
|
211
|
+
def resolve(url, response, follow_redirects: true)
|
212
|
+
orig_url_base = url.to_url.to_base # Recorded before any redirects.
|
213
|
+
follow_redirects, within = redirect?(follow_redirects)
|
214
|
+
|
222
215
|
loop do
|
223
216
|
get_response(url, response)
|
224
217
|
break unless response.redirect?
|
@@ -229,10 +222,11 @@ module Wgit
|
|
229
222
|
|
230
223
|
yield(url, response, location) if block_given?
|
231
224
|
|
232
|
-
# Validate redirect.
|
233
|
-
|
234
|
-
|
235
|
-
|
225
|
+
# Validate if the redirect is allowed.
|
226
|
+
raise "Redirect not allowed: #{location}" unless follow_redirects
|
227
|
+
|
228
|
+
if within && !location.relative?(within => orig_url_base)
|
229
|
+
raise "Redirect (outside of #{within}) is not allowed: '#{location}'"
|
236
230
|
end
|
237
231
|
|
238
232
|
raise "Too many redirects, exceeded: #{@redirect_limit}" \
|
@@ -268,15 +262,8 @@ module Wgit
|
|
268
262
|
response.ip_address = http_response.primary_ip
|
269
263
|
response.add_total_time(http_response.total_time)
|
270
264
|
|
271
|
-
# Log
|
272
|
-
|
273
|
-
log_status = (response.status || 0)
|
274
|
-
log_total_time = response.total_time.truncate(3)
|
275
|
-
|
276
|
-
Wgit.logger.debug("[http] Request: #{response.url}")
|
277
|
-
Wgit.logger.debug(
|
278
|
-
format(resp_template, log_status, response.size, log_total_time)
|
279
|
-
)
|
265
|
+
# Log the request/response details.
|
266
|
+
log_http(response)
|
280
267
|
|
281
268
|
# Handle a failed response.
|
282
269
|
raise "No response (within timeout: #{@time_out} second(s))" \
|
@@ -304,28 +291,29 @@ module Wgit
|
|
304
291
|
|
305
292
|
# Returns a doc's internal HTML page links in absolute form; used when
|
306
293
|
# crawling a site. Use the allow and disallow paths params to partially
|
307
|
-
# and selectively crawl a site.
|
294
|
+
# and selectively crawl a site; the glob syntax is supported e.g.
|
295
|
+
# `'wiki/\*'` etc. Note that each path should NOT start with a slash.
|
308
296
|
#
|
309
297
|
# Override this method in a subclass to change how a site
|
310
|
-
# is crawled
|
298
|
+
# is crawled, not what is extracted from each page (Document extensions
|
311
299
|
# should be used for this purpose instead). Just remember that only HTML
|
312
|
-
# files containing
|
300
|
+
# files containing `<a>` links keep the crawl going beyond the base URL.
|
313
301
|
#
|
314
302
|
# @param doc [Wgit::Document] The document from which to extract its
|
315
|
-
# internal page links.
|
303
|
+
# internal (absolute) page links.
|
316
304
|
# @param allow_paths [String, Array<String>] Filters links by selecting
|
317
|
-
# them
|
305
|
+
# them if their path `File.fnmatch?` one of allow_paths.
|
318
306
|
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
319
|
-
# them if their path
|
307
|
+
# them if their path `File.fnmatch?` one of disallow_paths.
|
320
308
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
321
309
|
def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
|
322
310
|
links = doc
|
323
311
|
.internal_absolute_links
|
324
312
|
.map(&:omit_fragment) # Because fragments don't alter content.
|
325
313
|
.uniq
|
326
|
-
.
|
314
|
+
.select do |link|
|
327
315
|
ext = link.to_extension
|
328
|
-
ext ?
|
316
|
+
ext ? SUPPORTED_FILE_EXTENSIONS.include?(ext.downcase) : true
|
329
317
|
end
|
330
318
|
|
331
319
|
return links if allow_paths.nil? && disallow_paths.nil?
|
@@ -335,40 +323,68 @@ module Wgit
|
|
335
323
|
|
336
324
|
private
|
337
325
|
|
326
|
+
# Returns whether or not to follow redirects, and within what context e.g.
|
327
|
+
# :host, :domain etc.
|
328
|
+
def redirect?(follow_redirects)
|
329
|
+
return [true, follow_redirects] if follow_redirects.is_a?(Symbol)
|
330
|
+
|
331
|
+
unless [true, false].include?(follow_redirects)
|
332
|
+
raise "follow_redirects: must be a Boolean or Symbol, not: \
|
333
|
+
#{follow_redirects}"
|
334
|
+
end
|
335
|
+
|
336
|
+
[follow_redirects, nil]
|
337
|
+
end
|
338
|
+
|
339
|
+
# Log (at debug level) the HTTP request/response details.
|
340
|
+
def log_http(response)
|
341
|
+
resp_template = '[http] Response: %s (%s bytes in %s seconds)'
|
342
|
+
log_status = (response.status || 0)
|
343
|
+
log_total_time = response.total_time.truncate(3)
|
344
|
+
|
345
|
+
Wgit.logger.debug("[http] Request: #{response.url}")
|
346
|
+
Wgit.logger.debug(
|
347
|
+
format(resp_template, log_status, response.size, log_total_time)
|
348
|
+
)
|
349
|
+
end
|
350
|
+
|
338
351
|
# Validate and filter by the given URL paths.
|
339
352
|
def process_paths(links, allow_paths, disallow_paths)
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
if allow_paths # White list.
|
344
|
-
filter_method = :select
|
345
|
-
paths = allow_paths
|
346
|
-
else # Black list.
|
347
|
-
filter_method = :reject
|
348
|
-
paths = disallow_paths
|
353
|
+
if allow_paths
|
354
|
+
paths = validate_paths(allow_paths)
|
355
|
+
filter_links(links, :select!, paths)
|
349
356
|
end
|
350
357
|
|
358
|
+
if disallow_paths
|
359
|
+
paths = validate_paths(disallow_paths)
|
360
|
+
filter_links(links, :reject!, paths)
|
361
|
+
end
|
362
|
+
|
363
|
+
links
|
364
|
+
end
|
365
|
+
|
366
|
+
# Validate the paths are suitable for filtering.
|
367
|
+
def validate_paths(paths)
|
351
368
|
paths = [paths] unless paths.is_a?(Array)
|
352
|
-
paths
|
353
|
-
|
354
|
-
.reject(&:empty?)
|
355
|
-
.uniq
|
356
|
-
.map { |path| Wgit::Url.new(path).to_path }
|
369
|
+
raise 'The provided paths must all be Strings' \
|
370
|
+
unless paths.all? { |path| path.is_a?(String) }
|
357
371
|
|
372
|
+
Wgit::Utils.process_arr(paths, encode: false)
|
358
373
|
raise 'The provided paths cannot be empty' if paths.empty?
|
359
374
|
|
360
|
-
|
375
|
+
paths
|
361
376
|
end
|
362
377
|
|
363
|
-
# Filters links by selecting
|
364
|
-
|
378
|
+
# Filters links by selecting/rejecting them based on their path.
|
379
|
+
# Uses File.fnmatch? so that globbing is supported.
|
380
|
+
def filter_links(links, filter_method, paths)
|
365
381
|
links.send(filter_method) do |link|
|
366
|
-
|
367
|
-
|
382
|
+
# Turn http://example.com into / meaning index.
|
383
|
+
link = link.to_endpoint == '/' ? '/' : link.omit_base
|
368
384
|
|
369
385
|
match = false
|
370
|
-
paths.each do |
|
371
|
-
match =
|
386
|
+
paths.each do |pattern|
|
387
|
+
match = File.fnmatch?(pattern, link, File::FNM_EXTGLOB)
|
372
388
|
break if match
|
373
389
|
end
|
374
390
|
|
data/lib/wgit/document.rb
CHANGED
@@ -7,41 +7,31 @@ require 'json'
|
|
7
7
|
module Wgit
|
8
8
|
# Class primarily modeling a HTML web document, although other MIME types
|
9
9
|
# will work e.g. images etc. Also doubles as a search result when
|
10
|
-
# loading Documents from the database via Wgit::Database#search
|
10
|
+
# loading Documents from the database via `Wgit::Database#search`.
|
11
11
|
#
|
12
12
|
# The initialize method dynamically initializes instance variables from the
|
13
13
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
14
14
|
# Document class can be easily extended allowing you to pull out the bits of
|
15
|
-
# a webpage that are important to you. See Wgit::Document.define_extension
|
15
|
+
# a webpage that are important to you. See `Wgit::Document.define_extension`.
|
16
16
|
class Document
|
17
17
|
include Assertable
|
18
18
|
|
19
19
|
# Regex for the allowed var names when defining an extension.
|
20
20
|
REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
|
21
21
|
|
22
|
-
# The
|
23
|
-
|
24
|
-
# See the README.md for how to add to this Array dynamically.
|
25
|
-
@text_elements = %i[
|
26
|
-
dd div dl dt figcaption figure hr li
|
27
|
-
main ol p pre span ul h1 h2 h3 h4 h5
|
28
|
-
]
|
29
|
-
|
30
|
-
class << self
|
31
|
-
# Class level instance reader method for @text_elements.
|
32
|
-
attr_reader :text_elements
|
33
|
-
end
|
22
|
+
# The xpath used to extract the visible text on a page.
|
23
|
+
TEXT_ELEMENTS_XPATH = '//*/text()'.freeze
|
34
24
|
|
35
25
|
# The URL of the webpage, an instance of Wgit::Url.
|
36
26
|
attr_reader :url
|
37
27
|
|
38
|
-
# The HTML of the
|
28
|
+
# The content/HTML of the document, an instance of String.
|
39
29
|
attr_reader :html
|
40
30
|
|
41
31
|
# The Nokogiri::HTML document object initialized from @html.
|
42
32
|
attr_reader :doc
|
43
33
|
|
44
|
-
# The score is only used following a Database#search and records matches.
|
34
|
+
# The score is only used following a `Database#search` and records matches.
|
45
35
|
attr_reader :score
|
46
36
|
|
47
37
|
# Initialize takes either two strings (representing the URL and HTML) or an
|
@@ -50,44 +40,29 @@ module Wgit
|
|
50
40
|
# pages retrieved from the database.
|
51
41
|
#
|
52
42
|
# During initialisation, the Document will call any private
|
53
|
-
#
|
43
|
+
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
54
44
|
# README.md and Wgit::Document.define_extension method for more details.
|
55
45
|
#
|
56
|
-
# @param url_or_obj [String, Wgit::Url,
|
46
|
+
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
57
47
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
58
48
|
# MongoDB collection object. The Object's :fetch method should support
|
59
49
|
# Strings as keys.
|
60
|
-
# @param html [String, NilClass] The crawled web page's HTML. This
|
61
|
-
# only used if url_or_obj is a String representing the web
|
62
|
-
# Otherwise, the HTML comes from the database object. A html
|
63
|
-
# be defaulted to an empty String.
|
64
|
-
|
50
|
+
# @param html [String, NilClass] The crawled web page's content/HTML. This
|
51
|
+
# param is only used if url_or_obj is a String representing the web
|
52
|
+
# page's URL. Otherwise, the HTML comes from the database object. A html
|
53
|
+
# of nil will be defaulted to an empty String.
|
54
|
+
# @param encode [Boolean] Whether or not to UTF-8 encode the html. Set to
|
55
|
+
# false if the Document content is an image etc.
|
56
|
+
def initialize(url_or_obj, html = '', encode: true)
|
65
57
|
if url_or_obj.is_a?(String)
|
66
|
-
init_from_strings(url_or_obj, html,
|
58
|
+
init_from_strings(url_or_obj, html, encode: encode)
|
67
59
|
else
|
68
|
-
init_from_object(url_or_obj,
|
60
|
+
init_from_object(url_or_obj, encode: encode)
|
69
61
|
end
|
70
62
|
end
|
71
63
|
|
72
64
|
### Document Class Methods ###
|
73
65
|
|
74
|
-
# Uses Document.text_elements to build an xpath String, used to obtain
|
75
|
-
# all of the combined text on a webpage.
|
76
|
-
#
|
77
|
-
# @return [String] An xpath String to obtain a webpage's text elements.
|
78
|
-
def self.text_elements_xpath
|
79
|
-
xpath = ''
|
80
|
-
return xpath if Wgit::Document.text_elements.empty?
|
81
|
-
|
82
|
-
el_xpath = '//%s/text()'
|
83
|
-
Wgit::Document.text_elements.each_with_index do |el, i|
|
84
|
-
xpath += ' | ' unless i.zero?
|
85
|
-
xpath += format(el_xpath, el)
|
86
|
-
end
|
87
|
-
|
88
|
-
xpath
|
89
|
-
end
|
90
|
-
|
91
66
|
# Defines an extension, which is a way to serialise HTML elements into
|
92
67
|
# instance variables upon Document initialization. See the default
|
93
68
|
# extensions defined in 'document_extensions.rb' as examples.
|
@@ -105,35 +80,36 @@ module Wgit
|
|
105
80
|
# a default will be used. The default value is: `singleton ? nil : []`.
|
106
81
|
#
|
107
82
|
# @param var [Symbol] The name of the variable to be initialised.
|
108
|
-
# @param xpath [String,
|
83
|
+
# @param xpath [String, #call] The xpath used to find the element(s)
|
109
84
|
# of the webpage. Only used when initializing from HTML.
|
110
85
|
#
|
111
86
|
# Pass a callable object (proc etc.) if you want the
|
112
87
|
# xpath value to be derived on Document initialisation (instead of when
|
113
88
|
# the extension is defined). The call method must return a valid xpath
|
114
89
|
# String.
|
115
|
-
# @param
|
90
|
+
# @param opts [Hash] The options to define an extension with. The
|
116
91
|
# options are only used when initializing from HTML, not the database.
|
117
|
-
# @option
|
92
|
+
# @option opts [Boolean] :singleton The singleton option determines
|
118
93
|
# whether or not the result(s) should be in an Array. If multiple
|
119
94
|
# results are found and singleton is true then the first result will be
|
120
95
|
# used. Defaults to true.
|
121
|
-
# @option
|
96
|
+
# @option opts [Boolean] :text_content_only The text_content_only option
|
122
97
|
# if true will use the text content of the Nokogiri result object,
|
123
98
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
124
|
-
# @
|
125
|
-
#
|
126
|
-
#
|
127
|
-
#
|
128
|
-
#
|
129
|
-
# Return nil if you want to inspect but not change the
|
130
|
-
# block is executed when a Wgit::Document is initialized
|
99
|
+
# @yieldparam value [Object] The value to be assigned to the new var.
|
100
|
+
# @yieldparam source [Wgit::Document, Object] The source of the value.
|
101
|
+
# @yieldparam type [Symbol] The source type, either :document or (DB)
|
102
|
+
# :object.
|
103
|
+
# @yieldreturn [Object] The return value of the block becomes the new var
|
104
|
+
# value, unless nil. Return nil if you want to inspect but not change the
|
105
|
+
# var value. The block is executed when a Wgit::Document is initialized,
|
106
|
+
# regardless of the source.
|
131
107
|
# @raise [StandardError] If the var param isn't valid.
|
132
|
-
# @return [Symbol] The given var Symbol.
|
133
|
-
def self.define_extension(var, xpath,
|
108
|
+
# @return [Symbol] The given var Symbol if successful.
|
109
|
+
def self.define_extension(var, xpath, opts = {}, &block)
|
134
110
|
var = var.to_sym
|
135
|
-
|
136
|
-
|
111
|
+
defaults = { singleton: true, text_content_only: true }
|
112
|
+
opts = defaults.merge(opts)
|
137
113
|
|
138
114
|
raise "var must match #{REGEX_EXTENSION_NAME}" unless \
|
139
115
|
var =~ REGEX_EXTENSION_NAME
|
@@ -141,7 +117,7 @@ module Wgit
|
|
141
117
|
# Define the private init_*_from_html method for HTML.
|
142
118
|
# Gets the HTML's xpath value and creates a var for it.
|
143
119
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
144
|
-
result = find_in_html(xpath,
|
120
|
+
result = find_in_html(xpath, opts, &block)
|
145
121
|
init_var(var, result)
|
146
122
|
end
|
147
123
|
Document.send :private, func_name
|
@@ -149,7 +125,7 @@ module Wgit
|
|
149
125
|
# Define the private init_*_from_object method for a Database object.
|
150
126
|
# Gets the Object's 'key' value and creates a var for it.
|
151
127
|
func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
|
152
|
-
result = find_in_object(obj, var.to_s, singleton:
|
128
|
+
result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
|
153
129
|
init_var(var, result)
|
154
130
|
end
|
155
131
|
Document.send :private, func_name
|
@@ -381,7 +357,7 @@ module Wgit
|
|
381
357
|
# original sentence, whichever is less. The algorithm obviously ensures
|
382
358
|
# that the search query is visible somewhere in the sentence.
|
383
359
|
#
|
384
|
-
# @param query [String,
|
360
|
+
# @param query [String, #to_s] The value to search the document's
|
385
361
|
# @text for.
|
386
362
|
# @param case_sensitive [Boolean] Whether character case must match.
|
387
363
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
@@ -401,10 +377,12 @@ module Wgit
|
|
401
377
|
results = {}
|
402
378
|
|
403
379
|
@text.each do |sentence|
|
380
|
+
sentence = sentence.strip
|
381
|
+
next if results[sentence]
|
382
|
+
|
404
383
|
hits = sentence.scan(regex).count
|
405
384
|
next unless hits.positive?
|
406
385
|
|
407
|
-
sentence.strip!
|
408
386
|
index = sentence.index(regex) # Index of first match.
|
409
387
|
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
410
388
|
|
@@ -422,7 +400,7 @@ module Wgit
|
|
422
400
|
# functionality. The original text is returned; no other reference to it
|
423
401
|
# is kept thereafter.
|
424
402
|
#
|
425
|
-
# @param query [String,
|
403
|
+
# @param query [String, #to_s] The value to search the document's
|
426
404
|
# @text for.
|
427
405
|
# @param case_sensitive [Boolean] Whether character case must match.
|
428
406
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
@@ -499,7 +477,7 @@ module Wgit
|
|
499
477
|
|
500
478
|
# Returns a value from the obj using the given key via obj#fetch.
|
501
479
|
#
|
502
|
-
# @param obj [
|
480
|
+
# @param obj [#fetch] The object containing the key/value.
|
503
481
|
# @param key [String] Used to find the value in the obj.
|
504
482
|
# @param singleton [Boolean] True if a single value, false otherwise.
|
505
483
|
# @yield [value, source] Given the value (String/Object) before it's set as
|
@@ -527,7 +505,7 @@ module Wgit
|
|
527
505
|
private
|
528
506
|
|
529
507
|
# Initialise the Document from URL and HTML Strings.
|
530
|
-
def init_from_strings(url, html,
|
508
|
+
def init_from_strings(url, html, encode: true)
|
531
509
|
assert_types(html, [String, NilClass])
|
532
510
|
|
533
511
|
# We already know url.is_a?(String) so parse into Url unless already so.
|
@@ -539,7 +517,7 @@ module Wgit
|
|
539
517
|
@doc = init_nokogiri
|
540
518
|
@score = 0.0
|
541
519
|
|
542
|
-
Wgit::Utils.process_str(@html, encode:
|
520
|
+
Wgit::Utils.process_str(@html, encode: encode)
|
543
521
|
|
544
522
|
# Dynamically run the init_*_from_html methods.
|
545
523
|
Document.private_instance_methods(false).each do |method|
|
@@ -552,7 +530,7 @@ module Wgit
|
|
552
530
|
|
553
531
|
# Initialise the Document from a Hash like Object containing Strings as
|
554
532
|
# keys e.g. database collection object or Hash.
|
555
|
-
def init_from_object(obj,
|
533
|
+
def init_from_object(obj, encode: true)
|
556
534
|
assert_respond_to(obj, :fetch)
|
557
535
|
|
558
536
|
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
@@ -560,7 +538,7 @@ module Wgit
|
|
560
538
|
@doc = init_nokogiri
|
561
539
|
@score = obj.fetch('score', 0.0)
|
562
540
|
|
563
|
-
Wgit::Utils.process_str(@html, encode:
|
541
|
+
Wgit::Utils.process_str(@html, encode: encode)
|
564
542
|
|
565
543
|
# Dynamically run the init_*_from_object methods.
|
566
544
|
Document.private_instance_methods(false).each do |method|
|