wgit 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +155 -66
- data/lib/wgit/database/database.rb +9 -8
- data/lib/wgit/database/model.rb +2 -2
- data/lib/wgit/document.rb +55 -62
- data/lib/wgit/document_extensions.rb +2 -2
- data/lib/wgit/indexer.rb +27 -15
- data/lib/wgit/response.rb +144 -0
- data/lib/wgit/url.rb +149 -85
- data/lib/wgit/utils.rb +6 -3
- data/lib/wgit/version.rb +7 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3e5c6b85b0ac78d234674d6003f8624b266c09668b4cfd78945106a917f78078
|
4
|
+
data.tar.gz: 3fc90cf5c132804f12e54f2b5f446143591923fff0677accc2ab907295ba34c4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f39df81391a07b344678a2b8d443b945391728d215e142ed73a55ef80cfc9c9a8407db9e4faa60c3e43e5b8e65bf8e84c3a343ff962b3c0276eed920639f3870
|
7
|
+
data.tar.gz: 1690895b56def00cbed58e485b23f5158ada0adb89f1c0e87bff3c638332648761dbac81b8f08e6c9c6ee911f4cbf9df72f3bfbce5d8abc2207d434edfde61ee
|
data/lib/wgit/crawler.rb
CHANGED
@@ -4,11 +4,13 @@ require_relative 'url'
|
|
4
4
|
require_relative 'document'
|
5
5
|
require_relative 'utils'
|
6
6
|
require_relative 'assertable'
|
7
|
+
require_relative 'response'
|
7
8
|
require 'typhoeus'
|
8
9
|
|
9
10
|
module Wgit
|
10
11
|
# The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
|
11
|
-
# serialising their HTML into Wgit::Document instances.
|
12
|
+
# serialising their HTML into Wgit::Document instances. This is the only Wgit
|
13
|
+
# class which contains network logic e.g. request/response handling.
|
12
14
|
class Crawler
|
13
15
|
include Assertable
|
14
16
|
|
@@ -20,8 +22,11 @@ module Wgit
|
|
20
22
|
# before raising an error. Set to 0 to disable time outs completely.
|
21
23
|
attr_accessor :time_out
|
22
24
|
|
23
|
-
#
|
24
|
-
#
|
25
|
+
# Whether or not to UTF-8 encode the HTML once crawled. Set to false if
|
26
|
+
# crawling more than just HTML e.g. images etc.
|
27
|
+
attr_accessor :encode_html
|
28
|
+
|
29
|
+
# The Wgit::Response of the most recently crawled URL.
|
25
30
|
attr_reader :last_response
|
26
31
|
|
27
32
|
# Initializes and returns a Wgit::Crawler instance.
|
@@ -31,13 +36,18 @@ module Wgit
|
|
31
36
|
# @param time_out [Integer, Float] The maximum amount of time (in seconds)
|
32
37
|
# a crawl request has to complete before raising an error. Set to 0 to
|
33
38
|
# disable time outs completely.
|
34
|
-
|
39
|
+
# @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
|
40
|
+
# crawled. Set to false if crawling more than just HTML e.g. images etc.
|
41
|
+
def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
|
35
42
|
@redirect_limit = redirect_limit
|
36
43
|
@time_out = time_out
|
44
|
+
@encode_html = encode_html
|
37
45
|
end
|
38
46
|
|
39
47
|
# Crawls an entire website's HTML pages by recursively going through
|
40
|
-
# its internal links. Each crawled Document is yielded to a block.
|
48
|
+
# its internal <a> links. Each crawled Document is yielded to a block. Use
|
49
|
+
# the allow and disallow paths params to partially and selectively crawl a
|
50
|
+
# site.
|
41
51
|
#
|
42
52
|
# Only redirects to the same host are followed. For example, the Url
|
43
53
|
# 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
|
@@ -50,20 +60,26 @@ module Wgit
|
|
50
60
|
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
51
61
|
# It is recommended that this URL be the index page of the site to give a
|
52
62
|
# greater chance of finding all pages within that site/host.
|
63
|
+
# @param allow_paths [String, Array<String>] Filters links by selecting
|
64
|
+
# them only if their path starts with one of allow_paths.
|
65
|
+
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
66
|
+
# them if their path starts with one of disallow_paths.
|
53
67
|
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
54
68
|
# A block is the only way to interact with each crawled Document.
|
55
69
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
56
70
|
# from all of the site's pages or nil if the url could not be
|
57
71
|
# crawled successfully.
|
58
|
-
def crawl_site(url, &block)
|
72
|
+
def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
|
59
73
|
doc = crawl_url(url, &block)
|
60
74
|
return nil if doc.nil?
|
61
75
|
|
62
|
-
|
76
|
+
crawl_opts = { follow_external_redirects: false, host: url.to_base }
|
77
|
+
link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
78
|
+
|
63
79
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
64
80
|
crawled = [url, alt_url]
|
65
81
|
externals = doc.external_links
|
66
|
-
internals = get_internal_links(doc)
|
82
|
+
internals = get_internal_links(doc, link_opts)
|
67
83
|
|
68
84
|
return doc.external_links.uniq if internals.empty?
|
69
85
|
|
@@ -76,12 +92,12 @@ module Wgit
|
|
76
92
|
|
77
93
|
links.each do |link|
|
78
94
|
orig_link = link.dup
|
79
|
-
doc = crawl_url(link,
|
95
|
+
doc = crawl_url(link, crawl_opts, &block)
|
80
96
|
|
81
97
|
crawled.push(orig_link, link) # Push both in case of redirects.
|
82
98
|
next if doc.nil?
|
83
99
|
|
84
|
-
internals.concat(get_internal_links(doc))
|
100
|
+
internals.concat(get_internal_links(doc, link_opts))
|
85
101
|
externals.concat(doc.external_links)
|
86
102
|
end
|
87
103
|
end
|
@@ -141,7 +157,7 @@ module Wgit
|
|
141
157
|
host: host
|
142
158
|
)
|
143
159
|
|
144
|
-
doc = Wgit::Document.new(url, html)
|
160
|
+
doc = Wgit::Document.new(url, html, encode_html: @encode_html)
|
145
161
|
yield(doc) if block_given?
|
146
162
|
|
147
163
|
doc.empty? ? nil : doc
|
@@ -149,7 +165,7 @@ module Wgit
|
|
149
165
|
|
150
166
|
protected
|
151
167
|
|
152
|
-
#
|
168
|
+
# Returns the url HTML String or nil. Handles any errors that arise
|
153
169
|
# and sets the @last_response. Errors or any HTTP response that doesn't
|
154
170
|
# return a HTML body will be ignored, returning nil.
|
155
171
|
#
|
@@ -166,31 +182,33 @@ module Wgit
|
|
166
182
|
# @return [String, nil] The crawled HTML or nil if the crawl was
|
167
183
|
# unsuccessful.
|
168
184
|
def fetch(url, follow_external_redirects: true, host: nil)
|
169
|
-
response
|
170
|
-
crawl_duration = nil
|
185
|
+
response = Wgit::Response.new
|
171
186
|
|
172
|
-
|
187
|
+
resolve(
|
173
188
|
url,
|
189
|
+
response,
|
174
190
|
follow_external_redirects: follow_external_redirects,
|
175
191
|
host: host
|
176
192
|
)
|
177
|
-
crawl_duration = response.total_time
|
178
193
|
|
179
|
-
response.
|
194
|
+
response.body_or_nil
|
180
195
|
rescue StandardError => e
|
181
|
-
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e
|
196
|
+
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
|
182
197
|
|
183
198
|
nil
|
184
199
|
ensure
|
185
|
-
url.crawled = true #
|
186
|
-
url.crawl_duration =
|
187
|
-
|
200
|
+
url.crawled = true # Sets date_crawled underneath.
|
201
|
+
url.crawl_duration = response.total_time
|
202
|
+
|
203
|
+
@last_response = response
|
188
204
|
end
|
189
205
|
|
190
|
-
#
|
191
|
-
#
|
206
|
+
# GETs the given url, resolving any redirects. The given response object
|
207
|
+
# will be enriched.
|
192
208
|
#
|
193
|
-
# @param url [Wgit::Url] The URL to resolve.
|
209
|
+
# @param url [Wgit::Url] The URL to GET and resolve.
|
210
|
+
# @param response [Wgit::Response] The response to enrich. Modifies by
|
211
|
+
# reference.
|
194
212
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
195
213
|
# an external redirect. If false, you must also provide a `host:`
|
196
214
|
# parameter.
|
@@ -200,91 +218,162 @@ module Wgit
|
|
200
218
|
# 'http://www.example.com' will only allow redirects for Urls with a
|
201
219
|
# `to_host` value of 'www.example.com'.
|
202
220
|
# @raise [StandardError] If a redirect isn't allowed etc.
|
203
|
-
|
204
|
-
def resolve(url, follow_external_redirects: true, host: nil)
|
205
|
-
response = nil
|
206
|
-
redirect_count = 0
|
207
|
-
total_net_time = 0.0
|
208
|
-
|
221
|
+
def resolve(url, response, follow_external_redirects: true, host: nil)
|
209
222
|
loop do
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
# Break unless it's a redirect.
|
214
|
-
break unless (response.code >= 300) && (response.code < 400)
|
223
|
+
get_response(url, response)
|
224
|
+
break unless response.redirect?
|
215
225
|
|
216
226
|
# Handle response 'Location' header.
|
217
|
-
location = Wgit::
|
218
|
-
location = Wgit::Url.new(location)
|
227
|
+
location = Wgit::Url.new(response.headers.fetch(:location, ''))
|
219
228
|
raise 'Encountered redirect without Location header' if location.empty?
|
220
229
|
|
221
230
|
yield(url, response, location) if block_given?
|
222
231
|
|
223
|
-
#
|
232
|
+
# Validate redirect.
|
224
233
|
if !follow_external_redirects && !location.relative?(host: host)
|
225
234
|
raise "External redirect not allowed - Redirected to: \
|
226
235
|
'#{location}', which is outside of host: '#{host}'"
|
227
236
|
end
|
228
237
|
|
229
|
-
raise "Too many redirects, exceeded: #{
|
230
|
-
if redirect_count >= @redirect_limit
|
231
|
-
|
232
|
-
redirect_count += 1
|
238
|
+
raise "Too many redirects, exceeded: #{@redirect_limit}" \
|
239
|
+
if response.redirect_count >= @redirect_limit
|
233
240
|
|
234
241
|
# Process the location to be crawled next.
|
235
242
|
location = url.to_base.concat(location) if location.relative?
|
243
|
+
response.redirections[url.to_s] = location.to_s
|
236
244
|
url.replace(location) # Update the url on redirect.
|
237
245
|
end
|
238
|
-
|
239
|
-
response.options[:redirect_count] = redirect_count
|
240
|
-
response.options[:total_time] = total_net_time
|
241
|
-
|
242
|
-
response
|
243
246
|
end
|
244
247
|
|
245
|
-
#
|
248
|
+
# Makes a HTTP request and enriches the given Wgit::Response from it.
|
246
249
|
#
|
247
250
|
# @param url [String] The url to GET. Will call url#normalize if possible.
|
251
|
+
# @param response [Wgit::Response] The response to enrich. Modifies by
|
252
|
+
# reference.
|
248
253
|
# @raise [StandardError] If a response can't be obtained.
|
249
|
-
# @return [
|
250
|
-
def get_response(url)
|
251
|
-
|
254
|
+
# @return [nil] Nothing useful; the given Wgit::Response is enriched in place.
|
255
|
+
def get_response(url, response)
|
256
|
+
# Perform a HTTP GET request.
|
257
|
+
orig_url = url.to_s
|
258
|
+
url = url.normalize if url.respond_to?(:normalize)
|
259
|
+
|
260
|
+
http_response = http_get(url)
|
261
|
+
|
262
|
+
# Enrich the given Wgit::Response object.
|
263
|
+
response.adapter_response = http_response
|
264
|
+
response.url = orig_url
|
265
|
+
response.status = http_response.code
|
266
|
+
response.headers = http_response.headers
|
267
|
+
response.body = http_response.body
|
268
|
+
response.ip_address = http_response.primary_ip
|
269
|
+
response.add_total_time(http_response.total_time)
|
270
|
+
|
271
|
+
# Log (debug) the request/response details.
|
272
|
+
resp_template = '[http] Response: %s (%s bytes in %s seconds)'
|
273
|
+
log_status = (response.status || 0)
|
274
|
+
log_total_time = response.total_time.truncate(3)
|
275
|
+
|
276
|
+
Wgit.logger.debug("[http] Request: #{response.url}")
|
277
|
+
Wgit.logger.debug(
|
278
|
+
format(resp_template, log_status, response.size, log_total_time)
|
279
|
+
)
|
252
280
|
|
281
|
+
# Handle a failed response.
|
282
|
+
raise "No response (within timeout: #{@time_out} second(s))" \
|
283
|
+
if response.failure?
|
284
|
+
end
|
285
|
+
|
286
|
+
# Performs a HTTP GET request and returns the response.
|
287
|
+
#
|
288
|
+
# @param url [String] The url to GET.
|
289
|
+
# @return [Typhoeus::Response] The HTTP response object.
|
290
|
+
def http_get(url)
|
253
291
|
opts = {
|
254
292
|
followlocation: false,
|
255
293
|
timeout: @time_out,
|
256
294
|
accept_encoding: 'gzip',
|
257
295
|
headers: {
|
258
296
|
'User-Agent' => "wgit/#{Wgit::VERSION}",
|
259
|
-
'Accept'
|
297
|
+
'Accept' => 'text/html'
|
260
298
|
}
|
261
299
|
}
|
262
300
|
|
263
|
-
|
264
|
-
|
265
|
-
# Handle response status code.
|
266
|
-
raise "No response (within timeout: #{@time_out} second(s))" \
|
267
|
-
if response.code.zero?
|
268
|
-
|
269
|
-
response
|
301
|
+
# See https://rubydoc.info/gems/typhoeus for more info.
|
302
|
+
Typhoeus.get(url, opts)
|
270
303
|
end
|
271
304
|
|
272
305
|
# Returns a doc's internal HTML page links in absolute form; used when
|
273
|
-
# crawling a site.
|
306
|
+
# crawling a site. Use the allow and disallow paths params to partially
|
307
|
+
# and selectively crawl a site.
|
308
|
+
#
|
309
|
+
# Override this method in a subclass to change how a site
|
274
310
|
# is crawled; not what is extracted from each page (Document extensions
|
275
|
-
# should be used for this purpose instead).
|
311
|
+
# should be used for this purpose instead). Just remember that only HTML
|
312
|
+
# files containing <a> links can keep the crawl going beyond the base URL.
|
276
313
|
#
|
277
314
|
# @param doc [Wgit::Document] The document from which to extract its
|
278
315
|
# internal page links.
|
316
|
+
# @param allow_paths [String, Array<String>] Filters links by selecting
|
317
|
+
# them only if their path starts with one of allow_paths.
|
318
|
+
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
319
|
+
# them if their path starts with one of disallow_paths.
|
279
320
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
280
|
-
def get_internal_links(doc)
|
281
|
-
doc
|
282
|
-
|
283
|
-
|
284
|
-
|
321
|
+
def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
|
322
|
+
links = doc
|
323
|
+
.internal_absolute_links
|
324
|
+
.map(&:omit_fragment) # Because fragments don't alter content.
|
325
|
+
.uniq
|
326
|
+
.reject do |link|
|
285
327
|
ext = link.to_extension
|
286
328
|
ext ? !%w[htm html].include?(ext.downcase) : false
|
287
329
|
end
|
330
|
+
|
331
|
+
return links if allow_paths.nil? && disallow_paths.nil?
|
332
|
+
|
333
|
+
process_paths(links, allow_paths, disallow_paths)
|
334
|
+
end
|
335
|
+
|
336
|
+
private
|
337
|
+
|
338
|
+
# Validate and filter by the given URL paths.
|
339
|
+
def process_paths(links, allow_paths, disallow_paths)
|
340
|
+
raise "You can't provide both allow_paths: and disallow_paths: params" \
|
341
|
+
if allow_paths && disallow_paths
|
342
|
+
|
343
|
+
if allow_paths # White list.
|
344
|
+
filter_method = :select
|
345
|
+
paths = allow_paths
|
346
|
+
else # Black list.
|
347
|
+
filter_method = :reject
|
348
|
+
paths = disallow_paths
|
349
|
+
end
|
350
|
+
|
351
|
+
paths = [paths] unless paths.is_a?(Array)
|
352
|
+
paths = paths
|
353
|
+
.compact
|
354
|
+
.reject(&:empty?)
|
355
|
+
.uniq
|
356
|
+
.map { |path| Wgit::Url.new(path).to_path }
|
357
|
+
|
358
|
+
raise 'The provided paths cannot be empty' if paths.empty?
|
359
|
+
|
360
|
+
filter_links_by_path(links, filter_method, paths)
|
361
|
+
end
|
362
|
+
|
363
|
+
# Filters links by selecting or rejecting them based on their path.
|
364
|
+
def filter_links_by_path(links, filter_method, paths)
|
365
|
+
links.send(filter_method) do |link|
|
366
|
+
link_path = link.to_path
|
367
|
+
next(false) unless link_path
|
368
|
+
|
369
|
+
match = false
|
370
|
+
paths.each do |path|
|
371
|
+
match = link_path.start_with?(path)
|
372
|
+
break if match
|
373
|
+
end
|
374
|
+
|
375
|
+
match
|
376
|
+
end
|
288
377
|
end
|
289
378
|
|
290
379
|
alias crawl crawl_urls
|
@@ -220,19 +220,20 @@ module Wgit
|
|
220
220
|
# @param url [Wgit::Url] The Url to search the DB for.
|
221
221
|
# @return [Boolean] True if url exists, otherwise false.
|
222
222
|
def url?(url)
|
223
|
-
|
224
|
-
|
223
|
+
assert_type(url, String) # This includes Wgit::Url's.
|
224
|
+
hash = { 'url' => url }
|
225
|
+
@client[:urls].find(hash).any?
|
225
226
|
end
|
226
227
|
|
227
|
-
# Returns whether or not a record with the given doc 'url' field
|
228
|
-
# unique) exists in the database's 'documents' collection.
|
228
|
+
# Returns whether or not a record with the given doc 'url.url' field
|
229
|
+
# (which is unique) exists in the database's 'documents' collection.
|
229
230
|
#
|
230
231
|
# @param doc [Wgit::Document] The Document to search the DB for.
|
231
232
|
# @return [Boolean] True if doc exists, otherwise false.
|
232
233
|
def doc?(doc)
|
233
|
-
|
234
|
-
|
235
|
-
@client[:documents].find(
|
234
|
+
assert_type(doc, Wgit::Document)
|
235
|
+
hash = { 'url.url' => doc.url }
|
236
|
+
@client[:documents].find(hash).any?
|
236
237
|
end
|
237
238
|
|
238
239
|
### Update Data ###
|
@@ -309,7 +310,7 @@ module Wgit
|
|
309
310
|
# @return [Integer] The number of updated records.
|
310
311
|
def update_doc(doc)
|
311
312
|
assert_type(doc, Wgit::Document)
|
312
|
-
selection = { url
|
313
|
+
selection = { 'url.url' => doc.url }
|
313
314
|
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
314
315
|
update = { '$set' => doc_hash }
|
315
316
|
mutate(true, :documents, selection, update)
|
data/lib/wgit/database/model.rb
CHANGED
@@ -26,7 +26,7 @@ module Wgit
|
|
26
26
|
raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
|
27
27
|
|
28
28
|
model = doc.to_h(include_html: false, include_score: false)
|
29
|
-
model['url'] =
|
29
|
+
model['url'] = url(doc.url) # Expand Url String into full object.
|
30
30
|
|
31
31
|
Wgit::Utils.remove_non_bson_types(model)
|
32
32
|
end
|
@@ -36,7 +36,7 @@ module Wgit
|
|
36
36
|
# @return [Hash] Insertion fields common to all models.
|
37
37
|
def self.common_insert_data
|
38
38
|
{
|
39
|
-
date_added:
|
39
|
+
date_added: Wgit::Utils.time_stamp,
|
40
40
|
date_modified: Wgit::Utils.time_stamp
|
41
41
|
}
|
42
42
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -5,7 +5,8 @@ require 'nokogiri'
|
|
5
5
|
require 'json'
|
6
6
|
|
7
7
|
module Wgit
|
8
|
-
# Class modeling a HTML web document
|
8
|
+
# Class primarily modeling a HTML web document, although other MIME types
|
9
|
+
# will work e.g. images etc. Also doubles as a search result when
|
9
10
|
# loading Documents from the database via Wgit::Database#search.
|
10
11
|
#
|
11
12
|
# The initialize method dynamically initializes instance variables from the
|
@@ -60,11 +61,11 @@ module Wgit
|
|
60
61
|
# only used if url_or_obj is a String representing the web page's URL.
|
61
62
|
# Otherwise, the HTML comes from the database object. A html of nil will
|
62
63
|
# be defaulted to an empty String.
|
63
|
-
def initialize(url_or_obj, html = '')
|
64
|
+
def initialize(url_or_obj, html = '', encode_html: true)
|
64
65
|
if url_or_obj.is_a?(String)
|
65
|
-
init_from_strings(url_or_obj, html)
|
66
|
+
init_from_strings(url_or_obj, html, encode_html: encode_html)
|
66
67
|
else
|
67
|
-
init_from_object(url_or_obj)
|
68
|
+
init_from_object(url_or_obj, encode_html: encode_html)
|
68
69
|
end
|
69
70
|
end
|
70
71
|
|
@@ -91,25 +92,28 @@ module Wgit
|
|
91
92
|
# instance variables upon Document initialization. See the default
|
92
93
|
# extensions defined in 'document_extensions.rb' as examples.
|
93
94
|
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
#
|
95
|
+
# Note that defined extensions work for both Documents initialized from
|
96
|
+
# HTML (via Wgit::Crawler methods) and from database objects.
|
97
|
+
# An extension once defined, initializes a private instance variable with
|
98
|
+
# the xpath or database object result(s).
|
99
|
+
#
|
100
|
+
# When initialising from HTML, a singleton value of true will only
|
101
|
+
# ever return one result; otherwise all xpath results are returned in an
|
97
102
|
# Array. When initialising from a database object, the value is taken as
|
98
103
|
# is and singleton is only used to define the default empty value.
|
99
104
|
# If a value cannot be found (in either the HTML or database object), then
|
100
|
-
# a default will be used. The default value is: singleton ? nil : []
|
101
|
-
#
|
102
|
-
# Note that defined extensions work for both documents initialized from
|
103
|
-
# the WWW (via Wgit::Crawler methods) and from database objects. This
|
104
|
-
# effectively implements ORM like behavior using this class.
|
105
|
+
# a default will be used. The default value is: `singleton ? nil : []`.
|
105
106
|
#
|
106
107
|
# @param var [Symbol] The name of the variable to be initialised.
|
107
108
|
# @param xpath [String, Object#call] The xpath used to find the element(s)
|
108
|
-
# of the webpage.
|
109
|
+
# of the webpage. Only used when initializing from HTML.
|
110
|
+
#
|
111
|
+
# Pass a callable object (proc etc.) if you want the
|
109
112
|
# xpath value to be derived on Document initialisation (instead of when
|
110
113
|
# the extension is defined). The call method must return a valid xpath
|
111
114
|
# String.
|
112
|
-
# @param options [Hash] The options to define an extension with.
|
115
|
+
# @param options [Hash] The options to define an extension with. The
|
116
|
+
# options are only used when initializing from HTML, not the database.
|
113
117
|
# @option options [Boolean] :singleton The singleton option determines
|
114
118
|
# whether or not the result(s) should be in an Array. If multiple
|
115
119
|
# results are found and singleton is true then the first result will be
|
@@ -117,16 +121,17 @@ module Wgit
|
|
117
121
|
# @option options [Boolean] :text_content_only The text_content_only option
|
118
122
|
# if true will use the text content of the Nokogiri result object,
|
119
123
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
120
|
-
# @yield [value, source] Yields the value (Object) about to be
|
121
|
-
# the new var
|
122
|
-
#
|
123
|
-
#
|
124
|
-
# value
|
125
|
-
#
|
124
|
+
# @yield [value, source, type] Yields the value (Object) about to be
|
125
|
+
# assigned to the new var, the source of the value (Wgit::Document or DB
|
126
|
+
# Object) and the source type (Symbol of either :document or :object).
|
127
|
+
#
|
128
|
+
# The return value of the block becomes the new var value, unless nil.
|
129
|
+
# Return nil if you want to inspect but not change the var value. The
|
130
|
+
# block is executed when a Wgit::Document is initialized.
|
126
131
|
# @raise [StandardError] If the var param isn't valid.
|
127
|
-
# @return [Symbol] The
|
128
|
-
# if var == "title" then :init_title is returned.
|
132
|
+
# @return [Symbol] The given var Symbol.
|
129
133
|
def self.define_extension(var, xpath, options = {}, &block)
|
134
|
+
var = var.to_sym
|
130
135
|
default_options = { singleton: true, text_content_only: true }
|
131
136
|
options = default_options.merge(options)
|
132
137
|
|
@@ -149,7 +154,7 @@ module Wgit
|
|
149
154
|
end
|
150
155
|
Document.send :private, func_name
|
151
156
|
|
152
|
-
|
157
|
+
var
|
153
158
|
end
|
154
159
|
|
155
160
|
# Removes the init_* methods created when an extension is defined.
|
@@ -189,55 +194,48 @@ module Wgit
|
|
189
194
|
@html[range]
|
190
195
|
end
|
191
196
|
|
192
|
-
# Returns the timestamp of when this Document was crawled.
|
193
|
-
#
|
194
|
-
# @return [Time] Time of when this Document was crawled.
|
195
|
-
def date_crawled
|
196
|
-
@url.date_crawled
|
197
|
-
end
|
198
|
-
|
199
|
-
# Returns the duration of the crawl for this Document (in seconds).
|
200
|
-
#
|
201
|
-
# @return [Float] The duration of the crawl for this Document.
|
202
|
-
def crawl_duration
|
203
|
-
@url.crawl_duration
|
204
|
-
end
|
205
|
-
|
206
197
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
207
198
|
# <base> element's href value or @url (if @base is nil). If @base is
|
208
199
|
# present and relative, then @url.to_base + @base is returned. This method
|
209
200
|
# should be used instead of `doc.url.to_base` etc. when manually building
|
210
|
-
# absolute links from relative links.
|
201
|
+
# absolute links from relative links; or use `link.prefix_base(doc)`.
|
211
202
|
#
|
212
203
|
# Provide the `link:` parameter to get the correct base URL for that type
|
213
204
|
# of link. For example, a link of `#top` would always return @url because
|
214
205
|
# it applies to that page, not a different one. Query strings work in the
|
215
206
|
# same way. Use this parameter if manually concatting Url's e.g.
|
216
207
|
#
|
217
|
-
# relative_link = Wgit::Url.new
|
208
|
+
# relative_link = Wgit::Url.new('?q=hello')
|
218
209
|
# absolute_link = doc.base_url(link: relative_link).concat(relative_link)
|
219
210
|
#
|
220
211
|
# This is similar to how Wgit::Document#internal_absolute_links works.
|
221
212
|
#
|
222
213
|
# @param link [Wgit::Url, String] The link to obtain the correct base URL
|
223
|
-
# for.
|
214
|
+
# for; must be relative, not absolute.
|
215
|
+
# @raise [StandardError] If link isn't relative or if a base URL can't be
|
216
|
+
# established e.g. the doc @url is relative and <base> is nil.
|
224
217
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
225
218
|
# 'http://example.com/public'.
|
226
219
|
def base_url(link: nil)
|
220
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
221
|
+
if @url.relative? && @base.nil?
|
222
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
|
223
|
+
if @url.relative? && @base&.relative?
|
224
|
+
|
227
225
|
get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
|
228
226
|
|
229
227
|
if link
|
230
228
|
link = Wgit::Url.new(link)
|
231
229
|
raise "link must be relative: #{link}" unless link.relative?
|
232
230
|
|
233
|
-
if link.
|
231
|
+
if link.is_fragment? || link.is_query?
|
234
232
|
base_url = @base ? get_base.call : @url
|
235
|
-
return base_url.
|
233
|
+
return base_url.omit_fragment.omit_query
|
236
234
|
end
|
237
235
|
end
|
238
236
|
|
239
|
-
base_url = @base ? get_base.call : @url.
|
240
|
-
base_url.
|
237
|
+
base_url = @base ? get_base.call : @url.to_base
|
238
|
+
base_url.omit_fragment.omit_query
|
241
239
|
end
|
242
240
|
|
243
241
|
# Returns a Hash containing this Document's instance vars.
|
@@ -340,7 +338,7 @@ module Wgit
|
|
340
338
|
|
341
339
|
links = @links
|
342
340
|
.select { |link| link.relative?(host: @url.to_base) }
|
343
|
-
.map(&:
|
341
|
+
.map(&:omit_base)
|
344
342
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
345
343
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
346
344
|
end
|
@@ -354,7 +352,7 @@ module Wgit
|
|
354
352
|
#
|
355
353
|
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
356
354
|
def internal_absolute_links
|
357
|
-
internal_links.map { |link|
|
355
|
+
internal_links.map { |link| link.prefix_base(self) }
|
358
356
|
end
|
359
357
|
|
360
358
|
# Returns all external links from this Document in absolute form. External
|
@@ -366,7 +364,7 @@ module Wgit
|
|
366
364
|
|
367
365
|
links = @links
|
368
366
|
.reject { |link| link.relative?(host: @url.to_base) }
|
369
|
-
.map(&:
|
367
|
+
.map(&:omit_trailing_slash)
|
370
368
|
|
371
369
|
Wgit::Utils.process_arr(links)
|
372
370
|
end
|
@@ -438,7 +436,7 @@ module Wgit
|
|
438
436
|
orig_text = @text
|
439
437
|
@text = search(
|
440
438
|
query, case_sensitive: case_sensitive,
|
441
|
-
|
439
|
+
whole_sentence: whole_sentence, sentence_limit: sentence_limit
|
442
440
|
)
|
443
441
|
|
444
442
|
orig_text
|
@@ -473,7 +471,7 @@ module Wgit
|
|
473
471
|
# @yield [value, source, type] Given the value (String/Object) before it's set as
|
474
472
|
# an instance variable so that you can inspect/alter the value if
|
475
473
|
# desired. Return nil from the block if you don't want to override the
|
476
|
-
# value. Also given the source (Symbol) which is always :
|
474
|
+
# value. Also given the source (self) and its type (Symbol), always :document.
|
477
475
|
# @return [String, Object] The value found in the html or the default value
|
478
476
|
# (singleton ? nil : []).
|
479
477
|
def find_in_html(xpath, singleton: true, text_content_only: true)
|
@@ -492,7 +490,7 @@ module Wgit
|
|
492
490
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
493
491
|
|
494
492
|
if block_given?
|
495
|
-
new_result = yield(result, :
|
493
|
+
new_result = yield(result, self, :document)
|
496
494
|
result = new_result unless new_result.nil?
|
497
495
|
end
|
498
496
|
|
@@ -519,7 +517,7 @@ module Wgit
|
|
519
517
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
520
518
|
|
521
519
|
if block_given?
|
522
|
-
new_result = yield(result, :object)
|
520
|
+
new_result = yield(result, obj, :object)
|
523
521
|
result = new_result unless new_result.nil?
|
524
522
|
end
|
525
523
|
|
@@ -529,19 +527,19 @@ module Wgit
|
|
529
527
|
private
|
530
528
|
|
531
529
|
# Initialise the Document from URL and HTML Strings.
|
532
|
-
def init_from_strings(url, html)
|
530
|
+
def init_from_strings(url, html, encode_html: true)
|
533
531
|
assert_types(html, [String, NilClass])
|
534
532
|
|
535
533
|
# We already know url.is_a?(String) so parse into Url unless already so.
|
536
534
|
url = Wgit::Url.parse(url)
|
537
|
-
url.crawled = true unless url.crawled # Avoid overriding date_crawled.
|
535
|
+
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
538
536
|
|
539
537
|
@url = url
|
540
538
|
@html = html || ''
|
541
539
|
@doc = init_nokogiri
|
542
540
|
@score = 0.0
|
543
541
|
|
544
|
-
|
542
|
+
Wgit::Utils.process_str(@html, encode: encode_html)
|
545
543
|
|
546
544
|
# Dynamically run the init_*_from_html methods.
|
547
545
|
Document.private_instance_methods(false).each do |method|
|
@@ -554,7 +552,7 @@ module Wgit
|
|
554
552
|
|
555
553
|
# Initialise the Document from a Hash like Object containing Strings as
|
556
554
|
# keys e.g. database collection object or Hash.
|
557
|
-
def init_from_object(obj)
|
555
|
+
def init_from_object(obj, encode_html: true)
|
558
556
|
assert_respond_to(obj, :fetch)
|
559
557
|
|
560
558
|
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
@@ -562,7 +560,7 @@ module Wgit
|
|
562
560
|
@doc = init_nokogiri
|
563
561
|
@score = obj.fetch('score', 0.0)
|
564
562
|
|
565
|
-
|
563
|
+
Wgit::Utils.process_str(@html, encode: encode_html)
|
566
564
|
|
567
565
|
# Dynamically run the init_*_from_object methods.
|
568
566
|
Document.private_instance_methods(false).each do |method|
|
@@ -573,12 +571,6 @@ module Wgit
|
|
573
571
|
end
|
574
572
|
end
|
575
573
|
|
576
|
-
# Ensure the @url and @html Strings are correctly encoded etc.
|
577
|
-
def process_url_and_html
|
578
|
-
@url = Wgit::Utils.process_str(@url)
|
579
|
-
@html = Wgit::Utils.process_str(@html)
|
580
|
-
end
|
581
|
-
|
582
574
|
# Initialises an instance variable and defines a getter method for it.
|
583
575
|
#
|
584
576
|
# @param var [Symbol] The name of the variable to be initialized.
|
@@ -597,6 +589,7 @@ module Wgit
|
|
597
589
|
end
|
598
590
|
end
|
599
591
|
|
592
|
+
alias content html
|
600
593
|
alias statistics stats
|
601
594
|
alias internal_urls internal_links
|
602
595
|
alias internal_absolute_urls internal_absolute_links
|