wgit 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit/crawler.rb +155 -66
- data/lib/wgit/database/database.rb +9 -8
- data/lib/wgit/database/model.rb +2 -2
- data/lib/wgit/document.rb +55 -62
- data/lib/wgit/document_extensions.rb +2 -2
- data/lib/wgit/indexer.rb +27 -15
- data/lib/wgit/response.rb +144 -0
- data/lib/wgit/url.rb +149 -85
- data/lib/wgit/utils.rb +6 -3
- data/lib/wgit/version.rb +7 -2
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3e5c6b85b0ac78d234674d6003f8624b266c09668b4cfd78945106a917f78078
|
4
|
+
data.tar.gz: 3fc90cf5c132804f12e54f2b5f446143591923fff0677accc2ab907295ba34c4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f39df81391a07b344678a2b8d443b945391728d215e142ed73a55ef80cfc9c9a8407db9e4faa60c3e43e5b8e65bf8e84c3a343ff962b3c0276eed920639f3870
|
7
|
+
data.tar.gz: 1690895b56def00cbed58e485b23f5158ada0adb89f1c0e87bff3c638332648761dbac81b8f08e6c9c6ee911f4cbf9df72f3bfbce5d8abc2207d434edfde61ee
|
data/lib/wgit/crawler.rb
CHANGED
@@ -4,11 +4,13 @@ require_relative 'url'
|
|
4
4
|
require_relative 'document'
|
5
5
|
require_relative 'utils'
|
6
6
|
require_relative 'assertable'
|
7
|
+
require_relative 'response'
|
7
8
|
require 'typhoeus'
|
8
9
|
|
9
10
|
module Wgit
|
10
11
|
# The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
|
11
|
-
# serialising their HTML into Wgit::Document instances.
|
12
|
+
# serialising their HTML into Wgit::Document instances. This is the only Wgit
|
13
|
+
# class which contains network logic e.g. request/response handling.
|
12
14
|
class Crawler
|
13
15
|
include Assertable
|
14
16
|
|
@@ -20,8 +22,11 @@ module Wgit
|
|
20
22
|
# before raising an error. Set to 0 to disable time outs completely.
|
21
23
|
attr_accessor :time_out
|
22
24
|
|
23
|
-
#
|
24
|
-
#
|
25
|
+
# Whether or not to UTF-8 encode the HTML once crawled. Set to false if
|
26
|
+
# crawling more than just HTML e.g. images etc.
|
27
|
+
attr_accessor :encode_html
|
28
|
+
|
29
|
+
# The Wgit::Response of the most recently crawled URL.
|
25
30
|
attr_reader :last_response
|
26
31
|
|
27
32
|
# Initializes and returns a Wgit::Crawler instance.
|
@@ -31,13 +36,18 @@ module Wgit
|
|
31
36
|
# @param time_out [Integer, Float] The maximum amount of time (in seconds)
|
32
37
|
# a crawl request has to complete before raising an error. Set to 0 to
|
33
38
|
# disable time outs completely.
|
34
|
-
|
39
|
+
# @param encode_html [Boolean] Whether or not to UTF-8 encode the HTML once
|
40
|
+
# crawled. Set to false if crawling more than just HTML e.g. images etc.
|
41
|
+
def initialize(redirect_limit: 5, time_out: 5, encode_html: true)
|
35
42
|
@redirect_limit = redirect_limit
|
36
43
|
@time_out = time_out
|
44
|
+
@encode_html = encode_html
|
37
45
|
end
|
38
46
|
|
39
47
|
# Crawls an entire website's HTML pages by recursively going through
|
40
|
-
# its internal links. Each crawled Document is yielded to a block.
|
48
|
+
# its internal <a> links. Each crawled Document is yielded to a block. Use
|
49
|
+
# the allow and disallow paths params to partially and selectively crawl a
|
50
|
+
# site.
|
41
51
|
#
|
42
52
|
# Only redirects to the same host are followed. For example, the Url
|
43
53
|
# 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
|
@@ -50,20 +60,26 @@ module Wgit
|
|
50
60
|
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
51
61
|
# It is recommended that this URL be the index page of the site to give a
|
52
62
|
# greater chance of finding all pages within that site/host.
|
63
|
+
# @param allow_paths [String, Array<String>] Filters links by selecting
|
64
|
+
# them only if their path starts with one of allow_paths.
|
65
|
+
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
66
|
+
# them if their path starts with one of disallow_paths.
|
53
67
|
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
54
68
|
# A block is the only way to interact with each crawled Document.
|
55
69
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
56
70
|
# from all of the site's pages or nil if the url could not be
|
57
71
|
# crawled successfully.
|
58
|
-
def crawl_site(url, &block)
|
72
|
+
def crawl_site(url, allow_paths: nil, disallow_paths: nil, &block)
|
59
73
|
doc = crawl_url(url, &block)
|
60
74
|
return nil if doc.nil?
|
61
75
|
|
62
|
-
|
76
|
+
crawl_opts = { follow_external_redirects: false, host: url.to_base }
|
77
|
+
link_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
|
78
|
+
|
63
79
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
64
80
|
crawled = [url, alt_url]
|
65
81
|
externals = doc.external_links
|
66
|
-
internals = get_internal_links(doc)
|
82
|
+
internals = get_internal_links(doc, link_opts)
|
67
83
|
|
68
84
|
return doc.external_links.uniq if internals.empty?
|
69
85
|
|
@@ -76,12 +92,12 @@ module Wgit
|
|
76
92
|
|
77
93
|
links.each do |link|
|
78
94
|
orig_link = link.dup
|
79
|
-
doc = crawl_url(link,
|
95
|
+
doc = crawl_url(link, crawl_opts, &block)
|
80
96
|
|
81
97
|
crawled.push(orig_link, link) # Push both in case of redirects.
|
82
98
|
next if doc.nil?
|
83
99
|
|
84
|
-
internals.concat(get_internal_links(doc))
|
100
|
+
internals.concat(get_internal_links(doc, link_opts))
|
85
101
|
externals.concat(doc.external_links)
|
86
102
|
end
|
87
103
|
end
|
@@ -141,7 +157,7 @@ module Wgit
|
|
141
157
|
host: host
|
142
158
|
)
|
143
159
|
|
144
|
-
doc = Wgit::Document.new(url, html)
|
160
|
+
doc = Wgit::Document.new(url, html, encode_html: @encode_html)
|
145
161
|
yield(doc) if block_given?
|
146
162
|
|
147
163
|
doc.empty? ? nil : doc
|
@@ -149,7 +165,7 @@ module Wgit
|
|
149
165
|
|
150
166
|
protected
|
151
167
|
|
152
|
-
#
|
168
|
+
# Returns the url HTML String or nil. Handles any errors that arise
|
153
169
|
# and sets the @last_response. Errors or any HTTP response that doesn't
|
154
170
|
# return a HTML body will be ignored, returning nil.
|
155
171
|
#
|
@@ -166,31 +182,33 @@ module Wgit
|
|
166
182
|
# @return [String, nil] The crawled HTML or nil if the crawl was
|
167
183
|
# unsuccessful.
|
168
184
|
def fetch(url, follow_external_redirects: true, host: nil)
|
169
|
-
response
|
170
|
-
crawl_duration = nil
|
185
|
+
response = Wgit::Response.new
|
171
186
|
|
172
|
-
|
187
|
+
resolve(
|
173
188
|
url,
|
189
|
+
response,
|
174
190
|
follow_external_redirects: follow_external_redirects,
|
175
191
|
host: host
|
176
192
|
)
|
177
|
-
crawl_duration = response.total_time
|
178
193
|
|
179
|
-
response.
|
194
|
+
response.body_or_nil
|
180
195
|
rescue StandardError => e
|
181
|
-
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e
|
196
|
+
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e}")
|
182
197
|
|
183
198
|
nil
|
184
199
|
ensure
|
185
|
-
url.crawled = true #
|
186
|
-
url.crawl_duration =
|
187
|
-
|
200
|
+
url.crawled = true # Sets date_crawled underneath.
|
201
|
+
url.crawl_duration = response.total_time
|
202
|
+
|
203
|
+
@last_response = response
|
188
204
|
end
|
189
205
|
|
190
|
-
#
|
191
|
-
#
|
206
|
+
# GETs the given url, resolving any redirects. The given response object
|
207
|
+
# will be enriched.
|
192
208
|
#
|
193
|
-
# @param url [Wgit::Url] The URL to resolve.
|
209
|
+
# @param url [Wgit::Url] The URL to GET and resolve.
|
210
|
+
# @param response [Wgit::Response] The response to enrich. Modifies by
|
211
|
+
# reference.
|
194
212
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
195
213
|
# an external redirect. If false, you must also provide a `host:`
|
196
214
|
# parameter.
|
@@ -200,91 +218,162 @@ module Wgit
|
|
200
218
|
# 'http://www.example.com' will only allow redirects for Urls with a
|
201
219
|
# `to_host` value of 'www.example.com'.
|
202
220
|
# @raise [StandardError] If a redirect isn't allowed etc.
|
203
|
-
|
204
|
-
def resolve(url, follow_external_redirects: true, host: nil)
|
205
|
-
response = nil
|
206
|
-
redirect_count = 0
|
207
|
-
total_net_time = 0.0
|
208
|
-
|
221
|
+
def resolve(url, response, follow_external_redirects: true, host: nil)
|
209
222
|
loop do
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
# Break unless it's a redirect.
|
214
|
-
break unless (response.code >= 300) && (response.code < 400)
|
223
|
+
get_response(url, response)
|
224
|
+
break unless response.redirect?
|
215
225
|
|
216
226
|
# Handle response 'Location' header.
|
217
|
-
location = Wgit::
|
218
|
-
location = Wgit::Url.new(location)
|
227
|
+
location = Wgit::Url.new(response.headers.fetch(:location, ''))
|
219
228
|
raise 'Encountered redirect without Location header' if location.empty?
|
220
229
|
|
221
230
|
yield(url, response, location) if block_given?
|
222
231
|
|
223
|
-
#
|
232
|
+
# Validate redirect.
|
224
233
|
if !follow_external_redirects && !location.relative?(host: host)
|
225
234
|
raise "External redirect not allowed - Redirected to: \
|
226
235
|
'#{location}', which is outside of host: '#{host}'"
|
227
236
|
end
|
228
237
|
|
229
|
-
raise "Too many redirects, exceeded: #{
|
230
|
-
if redirect_count >= @redirect_limit
|
231
|
-
|
232
|
-
redirect_count += 1
|
238
|
+
raise "Too many redirects, exceeded: #{@redirect_limit}" \
|
239
|
+
if response.redirect_count >= @redirect_limit
|
233
240
|
|
234
241
|
# Process the location to be crawled next.
|
235
242
|
location = url.to_base.concat(location) if location.relative?
|
243
|
+
response.redirections[url.to_s] = location.to_s
|
236
244
|
url.replace(location) # Update the url on redirect.
|
237
245
|
end
|
238
|
-
|
239
|
-
response.options[:redirect_count] = redirect_count
|
240
|
-
response.options[:total_time] = total_net_time
|
241
|
-
|
242
|
-
response
|
243
246
|
end
|
244
247
|
|
245
|
-
#
|
248
|
+
# Makes a HTTP request and enriches the given Wgit::Response from it.
|
246
249
|
#
|
247
250
|
# @param url [String] The url to GET. Will call url#normalize if possible.
|
251
|
+
# @param response [Wgit::Response] The response to enrich. Modifies by
|
252
|
+
# reference.
|
248
253
|
# @raise [StandardError] If a response can't be obtained.
|
249
|
-
# @return [
|
250
|
-
def get_response(url)
|
251
|
-
|
254
|
+
# @return [Wgit::Response] The enriched HTTP Wgit::Response object.
|
255
|
+
def get_response(url, response)
|
256
|
+
# Perform a HTTP GET request.
|
257
|
+
orig_url = url.to_s
|
258
|
+
url = url.normalize if url.respond_to?(:normalize)
|
259
|
+
|
260
|
+
http_response = http_get(url)
|
261
|
+
|
262
|
+
# Enrich the given Wgit::Response object.
|
263
|
+
response.adapter_response = http_response
|
264
|
+
response.url = orig_url
|
265
|
+
response.status = http_response.code
|
266
|
+
response.headers = http_response.headers
|
267
|
+
response.body = http_response.body
|
268
|
+
response.ip_address = http_response.primary_ip
|
269
|
+
response.add_total_time(http_response.total_time)
|
270
|
+
|
271
|
+
# Log (debug) the request/response details.
|
272
|
+
resp_template = '[http] Response: %s (%s bytes in %s seconds)'
|
273
|
+
log_status = (response.status || 0)
|
274
|
+
log_total_time = response.total_time.truncate(3)
|
275
|
+
|
276
|
+
Wgit.logger.debug("[http] Request: #{response.url}")
|
277
|
+
Wgit.logger.debug(
|
278
|
+
format(resp_template, log_status, response.size, log_total_time)
|
279
|
+
)
|
252
280
|
|
281
|
+
# Handle a failed response.
|
282
|
+
raise "No response (within timeout: #{@time_out} second(s))" \
|
283
|
+
if response.failure?
|
284
|
+
end
|
285
|
+
|
286
|
+
# Performs a HTTP GET request and returns the response.
|
287
|
+
#
|
288
|
+
# @param url [String] The url to GET.
|
289
|
+
# @return [Typhoeus::Response] The HTTP response object.
|
290
|
+
def http_get(url)
|
253
291
|
opts = {
|
254
292
|
followlocation: false,
|
255
293
|
timeout: @time_out,
|
256
294
|
accept_encoding: 'gzip',
|
257
295
|
headers: {
|
258
296
|
'User-Agent' => "wgit/#{Wgit::VERSION}",
|
259
|
-
'Accept'
|
297
|
+
'Accept' => 'text/html'
|
260
298
|
}
|
261
299
|
}
|
262
300
|
|
263
|
-
|
264
|
-
|
265
|
-
# Handle response status code.
|
266
|
-
raise "No response (within timeout: #{@time_out} second(s))" \
|
267
|
-
if response.code.zero?
|
268
|
-
|
269
|
-
response
|
301
|
+
# See https://rubydoc.info/gems/typhoeus for more info.
|
302
|
+
Typhoeus.get(url, opts)
|
270
303
|
end
|
271
304
|
|
272
305
|
# Returns a doc's internal HTML page links in absolute form; used when
|
273
|
-
# crawling a site.
|
306
|
+
# crawling a site. Use the allow and disallow paths params to partially
|
307
|
+
# and selectively crawl a site.
|
308
|
+
#
|
309
|
+
# Override this method in a subclass to change how a site
|
274
310
|
# is crawled; not what is extracted from each page (Document extensions
|
275
|
-
# should be used for this purpose instead).
|
311
|
+
# should be used for this purpose instead). Just remember that only HTML
|
312
|
+
# files containing <a> links can keep the crawl going beyond the base URL.
|
276
313
|
#
|
277
314
|
# @param doc [Wgit::Document] The document from which to extract its
|
278
315
|
# internal page links.
|
316
|
+
# @param allow_paths [String, Array<String>] Filters links by selecting
|
317
|
+
# them only if their path starts with one of allow_paths.
|
318
|
+
# @param disallow_paths [String, Array<String>] Filters links by rejecting
|
319
|
+
# them if their path starts with one of disallow_paths.
|
279
320
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
280
|
-
def get_internal_links(doc)
|
281
|
-
doc
|
282
|
-
|
283
|
-
|
284
|
-
|
321
|
+
def get_internal_links(doc, allow_paths: nil, disallow_paths: nil)
|
322
|
+
links = doc
|
323
|
+
.internal_absolute_links
|
324
|
+
.map(&:omit_fragment) # Because fragments don't alter content.
|
325
|
+
.uniq
|
326
|
+
.reject do |link|
|
285
327
|
ext = link.to_extension
|
286
328
|
ext ? !%w[htm html].include?(ext.downcase) : false
|
287
329
|
end
|
330
|
+
|
331
|
+
return links if allow_paths.nil? && disallow_paths.nil?
|
332
|
+
|
333
|
+
process_paths(links, allow_paths, disallow_paths)
|
334
|
+
end
|
335
|
+
|
336
|
+
private
|
337
|
+
|
338
|
+
# Validate and filter by the given URL paths.
|
339
|
+
def process_paths(links, allow_paths, disallow_paths)
|
340
|
+
raise "You can't provide both allow_paths: and disallow_paths: params" \
|
341
|
+
if allow_paths && disallow_paths
|
342
|
+
|
343
|
+
if allow_paths # White list.
|
344
|
+
filter_method = :select
|
345
|
+
paths = allow_paths
|
346
|
+
else # Black list.
|
347
|
+
filter_method = :reject
|
348
|
+
paths = disallow_paths
|
349
|
+
end
|
350
|
+
|
351
|
+
paths = [paths] unless paths.is_a?(Array)
|
352
|
+
paths = paths
|
353
|
+
.compact
|
354
|
+
.reject(&:empty?)
|
355
|
+
.uniq
|
356
|
+
.map { |path| Wgit::Url.new(path).to_path }
|
357
|
+
|
358
|
+
raise 'The provided paths cannot be empty' if paths.empty?
|
359
|
+
|
360
|
+
filter_links_by_path(links, filter_method, paths)
|
361
|
+
end
|
362
|
+
|
363
|
+
# Filters links by selecting or rejecting them based on their path.
|
364
|
+
def filter_links_by_path(links, filter_method, paths)
|
365
|
+
links.send(filter_method) do |link|
|
366
|
+
link_path = link.to_path
|
367
|
+
next(false) unless link_path
|
368
|
+
|
369
|
+
match = false
|
370
|
+
paths.each do |path|
|
371
|
+
match = link_path.start_with?(path)
|
372
|
+
break if match
|
373
|
+
end
|
374
|
+
|
375
|
+
match
|
376
|
+
end
|
288
377
|
end
|
289
378
|
|
290
379
|
alias crawl crawl_urls
|
@@ -220,19 +220,20 @@ module Wgit
|
|
220
220
|
# @param url [Wgit::Url] The Url to search the DB for.
|
221
221
|
# @return [Boolean] True if url exists, otherwise false.
|
222
222
|
def url?(url)
|
223
|
-
|
224
|
-
|
223
|
+
assert_type(url, String) # This includes Wgit::Url's.
|
224
|
+
hash = { 'url' => url }
|
225
|
+
@client[:urls].find(hash).any?
|
225
226
|
end
|
226
227
|
|
227
|
-
# Returns whether or not a record with the given doc 'url' field
|
228
|
-
# unique) exists in the database's 'documents' collection.
|
228
|
+
# Returns whether or not a record with the given doc 'url.url' field
|
229
|
+
# (which is unique) exists in the database's 'documents' collection.
|
229
230
|
#
|
230
231
|
# @param doc [Wgit::Document] The Document to search the DB for.
|
231
232
|
# @return [Boolean] True if doc exists, otherwise false.
|
232
233
|
def doc?(doc)
|
233
|
-
|
234
|
-
|
235
|
-
@client[:documents].find(
|
234
|
+
assert_type(doc, Wgit::Document)
|
235
|
+
hash = { 'url.url' => doc.url }
|
236
|
+
@client[:documents].find(hash).any?
|
236
237
|
end
|
237
238
|
|
238
239
|
### Update Data ###
|
@@ -309,7 +310,7 @@ module Wgit
|
|
309
310
|
# @return [Integer] The number of updated records.
|
310
311
|
def update_doc(doc)
|
311
312
|
assert_type(doc, Wgit::Document)
|
312
|
-
selection = { url
|
313
|
+
selection = { 'url.url' => doc.url }
|
313
314
|
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
314
315
|
update = { '$set' => doc_hash }
|
315
316
|
mutate(true, :documents, selection, update)
|
data/lib/wgit/database/model.rb
CHANGED
@@ -26,7 +26,7 @@ module Wgit
|
|
26
26
|
raise 'doc must respond_to? :to_h' unless doc.respond_to?(:to_h)
|
27
27
|
|
28
28
|
model = doc.to_h(include_html: false, include_score: false)
|
29
|
-
model['url'] =
|
29
|
+
model['url'] = url(doc.url) # Expand Url String into full object.
|
30
30
|
|
31
31
|
Wgit::Utils.remove_non_bson_types(model)
|
32
32
|
end
|
@@ -36,7 +36,7 @@ module Wgit
|
|
36
36
|
# @return [Hash] Insertion fields common to all models.
|
37
37
|
def self.common_insert_data
|
38
38
|
{
|
39
|
-
date_added:
|
39
|
+
date_added: Wgit::Utils.time_stamp,
|
40
40
|
date_modified: Wgit::Utils.time_stamp
|
41
41
|
}
|
42
42
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -5,7 +5,8 @@ require 'nokogiri'
|
|
5
5
|
require 'json'
|
6
6
|
|
7
7
|
module Wgit
|
8
|
-
# Class modeling a HTML web document
|
8
|
+
# Class primarily modeling a HTML web document, although other MIME types
|
9
|
+
# will work e.g. images etc. Also doubles as a search result when
|
9
10
|
# loading Documents from the database via Wgit::Database#search.
|
10
11
|
#
|
11
12
|
# The initialize method dynamically initializes instance variables from the
|
@@ -60,11 +61,11 @@ module Wgit
|
|
60
61
|
# only used if url_or_obj is a String representing the web page's URL.
|
61
62
|
# Otherwise, the HTML comes from the database object. A html of nil will
|
62
63
|
# be defaulted to an empty String.
|
63
|
-
def initialize(url_or_obj, html = '')
|
64
|
+
def initialize(url_or_obj, html = '', encode_html: true)
|
64
65
|
if url_or_obj.is_a?(String)
|
65
|
-
init_from_strings(url_or_obj, html)
|
66
|
+
init_from_strings(url_or_obj, html, encode_html: encode_html)
|
66
67
|
else
|
67
|
-
init_from_object(url_or_obj)
|
68
|
+
init_from_object(url_or_obj, encode_html: encode_html)
|
68
69
|
end
|
69
70
|
end
|
70
71
|
|
@@ -91,25 +92,28 @@ module Wgit
|
|
91
92
|
# instance variables upon Document initialization. See the default
|
92
93
|
# extensions defined in 'document_extensions.rb' as examples.
|
93
94
|
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
#
|
95
|
+
# Note that defined extensions work for both Documents initialized from
|
96
|
+
# HTML (via Wgit::Crawler methods) and from database objects.
|
97
|
+
# An extension once defined, initializes a private instance variable with
|
98
|
+
# the xpath or database object result(s).
|
99
|
+
#
|
100
|
+
# When initialising from HTML, a singleton value of true will only
|
101
|
+
# ever return one result; otherwise all xpath results are returned in an
|
97
102
|
# Array. When initialising from a database object, the value is taken as
|
98
103
|
# is and singleton is only used to define the default empty value.
|
99
104
|
# If a value cannot be found (in either the HTML or database object), then
|
100
|
-
# a default will be used. The default value is: singleton ? nil : []
|
101
|
-
#
|
102
|
-
# Note that defined extensions work for both documents initialized from
|
103
|
-
# the WWW (via Wgit::Crawler methods) and from database objects. This
|
104
|
-
# effectively implements ORM like behavior using this class.
|
105
|
+
# a default will be used. The default value is: `singleton ? nil : []`.
|
105
106
|
#
|
106
107
|
# @param var [Symbol] The name of the variable to be initialised.
|
107
108
|
# @param xpath [String, Object#call] The xpath used to find the element(s)
|
108
|
-
# of the webpage.
|
109
|
+
# of the webpage. Only used when initializing from HTML.
|
110
|
+
#
|
111
|
+
# Pass a callable object (proc etc.) if you want the
|
109
112
|
# xpath value to be derived on Document initialisation (instead of when
|
110
113
|
# the extension is defined). The call method must return a valid xpath
|
111
114
|
# String.
|
112
|
-
# @param options [Hash] The options to define an extension with.
|
115
|
+
# @param options [Hash] The options to define an extension with. The
|
116
|
+
# options are only used when initializing from HTML, not the database.
|
113
117
|
# @option options [Boolean] :singleton The singleton option determines
|
114
118
|
# whether or not the result(s) should be in an Array. If multiple
|
115
119
|
# results are found and singleton is true then the first result will be
|
@@ -117,16 +121,17 @@ module Wgit
|
|
117
121
|
# @option options [Boolean] :text_content_only The text_content_only option
|
118
122
|
# if true will use the text content of the Nokogiri result object,
|
119
123
|
# otherwise the Nokogiri object itself is returned. Defaults to true.
|
120
|
-
# @yield [value, source] Yields the value (Object) about to be
|
121
|
-
# the new var
|
122
|
-
#
|
123
|
-
#
|
124
|
-
# value
|
125
|
-
#
|
124
|
+
# @yield [value, source, type] Yields the value (Object) about to be
|
125
|
+
# assigned to the new var, the source of the value (Wgit::Document or DB
|
126
|
+
# Object) and the source type (Symbol of either :document or :object).
|
127
|
+
#
|
128
|
+
# The return value of the block becomes the new var value, unless nil.
|
129
|
+
# Return nil if you want to inspect but not change the var value. The
|
130
|
+
# block is executed when a Wgit::Document is initialized.
|
126
131
|
# @raise [StandardError] If the var param isn't valid.
|
127
|
-
# @return [Symbol] The
|
128
|
-
# if var == "title" then :init_title is returned.
|
132
|
+
# @return [Symbol] The given var Symbol.
|
129
133
|
def self.define_extension(var, xpath, options = {}, &block)
|
134
|
+
var = var.to_sym
|
130
135
|
default_options = { singleton: true, text_content_only: true }
|
131
136
|
options = default_options.merge(options)
|
132
137
|
|
@@ -149,7 +154,7 @@ module Wgit
|
|
149
154
|
end
|
150
155
|
Document.send :private, func_name
|
151
156
|
|
152
|
-
|
157
|
+
var
|
153
158
|
end
|
154
159
|
|
155
160
|
# Removes the init_* methods created when an extension is defined.
|
@@ -189,55 +194,48 @@ module Wgit
|
|
189
194
|
@html[range]
|
190
195
|
end
|
191
196
|
|
192
|
-
# Returns the timestamp of when this Document was crawled.
|
193
|
-
#
|
194
|
-
# @return [Time] Time of when this Document was crawled.
|
195
|
-
def date_crawled
|
196
|
-
@url.date_crawled
|
197
|
-
end
|
198
|
-
|
199
|
-
# Returns the duration of the crawl for this Document (in seconds).
|
200
|
-
#
|
201
|
-
# @return [Float] The duration of the crawl for this Document.
|
202
|
-
def crawl_duration
|
203
|
-
@url.crawl_duration
|
204
|
-
end
|
205
|
-
|
206
197
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
207
198
|
# <base> element's href value or @url (if @base is nil). If @base is
|
208
199
|
# present and relative, then @url.to_base + @base is returned. This method
|
209
200
|
# should be used instead of `doc.url.to_base` etc. when manually building
|
210
|
-
# absolute links from relative links.
|
201
|
+
# absolute links from relative links; or use `link.prefix_base(doc)`.
|
211
202
|
#
|
212
203
|
# Provide the `link:` parameter to get the correct base URL for that type
|
213
204
|
# of link. For example, a link of `#top` would always return @url because
|
214
205
|
# it applies to that page, not a different one. Query strings work in the
|
215
206
|
# same way. Use this parameter if manually concatting Url's e.g.
|
216
207
|
#
|
217
|
-
# relative_link = Wgit::Url.new
|
208
|
+
# relative_link = Wgit::Url.new('?q=hello')
|
218
209
|
# absolute_link = doc.base_url(link: relative_link).concat(relative_link)
|
219
210
|
#
|
220
211
|
# This is similar to how Wgit::Document#internal_absolute_links works.
|
221
212
|
#
|
222
213
|
# @param link [Wgit::Url, String] The link to obtain the correct base URL
|
223
|
-
# for.
|
214
|
+
# for; must be relative, not absolute.
|
215
|
+
# @raise [StandardError] If link is absolute or if a base URL can't be
|
216
|
+
# established e.g. the doc @url is relative and <base> is nil.
|
224
217
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
225
218
|
# 'http://example.com/public'.
|
226
219
|
def base_url(link: nil)
|
220
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
221
|
+
if @url.relative? && @base.nil?
|
222
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
|
223
|
+
if @url.relative? && @base&.relative?
|
224
|
+
|
227
225
|
get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
|
228
226
|
|
229
227
|
if link
|
230
228
|
link = Wgit::Url.new(link)
|
231
229
|
raise "link must be relative: #{link}" unless link.relative?
|
232
230
|
|
233
|
-
if link.
|
231
|
+
if link.is_fragment? || link.is_query?
|
234
232
|
base_url = @base ? get_base.call : @url
|
235
|
-
return base_url.
|
233
|
+
return base_url.omit_fragment.omit_query
|
236
234
|
end
|
237
235
|
end
|
238
236
|
|
239
|
-
base_url = @base ? get_base.call : @url.
|
240
|
-
base_url.
|
237
|
+
base_url = @base ? get_base.call : @url.to_base
|
238
|
+
base_url.omit_fragment.omit_query
|
241
239
|
end
|
242
240
|
|
243
241
|
# Returns a Hash containing this Document's instance vars.
|
@@ -340,7 +338,7 @@ module Wgit
|
|
340
338
|
|
341
339
|
links = @links
|
342
340
|
.select { |link| link.relative?(host: @url.to_base) }
|
343
|
-
.map(&:
|
341
|
+
.map(&:omit_base)
|
344
342
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
345
343
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
346
344
|
end
|
@@ -354,7 +352,7 @@ module Wgit
|
|
354
352
|
#
|
355
353
|
# @return [Array<Wgit::Url>] Self's internal Url's in absolute form.
|
356
354
|
def internal_absolute_links
|
357
|
-
internal_links.map { |link|
|
355
|
+
internal_links.map { |link| link.prefix_base(self) }
|
358
356
|
end
|
359
357
|
|
360
358
|
# Returns all external links from this Document in absolute form. External
|
@@ -366,7 +364,7 @@ module Wgit
|
|
366
364
|
|
367
365
|
links = @links
|
368
366
|
.reject { |link| link.relative?(host: @url.to_base) }
|
369
|
-
.map(&:
|
367
|
+
.map(&:omit_trailing_slash)
|
370
368
|
|
371
369
|
Wgit::Utils.process_arr(links)
|
372
370
|
end
|
@@ -438,7 +436,7 @@ module Wgit
|
|
438
436
|
orig_text = @text
|
439
437
|
@text = search(
|
440
438
|
query, case_sensitive: case_sensitive,
|
441
|
-
|
439
|
+
whole_sentence: whole_sentence, sentence_limit: sentence_limit
|
442
440
|
)
|
443
441
|
|
444
442
|
orig_text
|
@@ -473,7 +471,7 @@ module Wgit
|
|
473
471
|
# @yield [value, source] Given the value (String/Object) before it's set as
|
474
472
|
# an instance variable so that you can inspect/alter the value if
|
475
473
|
# desired. Return nil from the block if you don't want to override the
|
476
|
-
# value. Also given the source (Symbol) which is always :
|
474
|
+
# value. Also given the source (self) and the source type (Symbol), which is always :document.
|
477
475
|
# @return [String, Object] The value found in the html or the default value
|
478
476
|
# (singleton ? nil : []).
|
479
477
|
def find_in_html(xpath, singleton: true, text_content_only: true)
|
@@ -492,7 +490,7 @@ module Wgit
|
|
492
490
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
493
491
|
|
494
492
|
if block_given?
|
495
|
-
new_result = yield(result, :
|
493
|
+
new_result = yield(result, self, :document)
|
496
494
|
result = new_result unless new_result.nil?
|
497
495
|
end
|
498
496
|
|
@@ -519,7 +517,7 @@ module Wgit
|
|
519
517
|
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
520
518
|
|
521
519
|
if block_given?
|
522
|
-
new_result = yield(result, :object)
|
520
|
+
new_result = yield(result, obj, :object)
|
523
521
|
result = new_result unless new_result.nil?
|
524
522
|
end
|
525
523
|
|
@@ -529,19 +527,19 @@ module Wgit
|
|
529
527
|
private
|
530
528
|
|
531
529
|
# Initialise the Document from URL and HTML Strings.
|
532
|
-
def init_from_strings(url, html)
|
530
|
+
def init_from_strings(url, html, encode_html: true)
|
533
531
|
assert_types(html, [String, NilClass])
|
534
532
|
|
535
533
|
# We already know url.is_a?(String) so parse into Url unless already so.
|
536
534
|
url = Wgit::Url.parse(url)
|
537
|
-
url.crawled = true unless url.crawled # Avoid overriding date_crawled.
|
535
|
+
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
538
536
|
|
539
537
|
@url = url
|
540
538
|
@html = html || ''
|
541
539
|
@doc = init_nokogiri
|
542
540
|
@score = 0.0
|
543
541
|
|
544
|
-
|
542
|
+
Wgit::Utils.process_str(@html, encode: encode_html)
|
545
543
|
|
546
544
|
# Dynamically run the init_*_from_html methods.
|
547
545
|
Document.private_instance_methods(false).each do |method|
|
@@ -554,7 +552,7 @@ module Wgit
|
|
554
552
|
|
555
553
|
# Initialise the Document from a Hash like Object containing Strings as
|
556
554
|
# keys e.g. database collection object or Hash.
|
557
|
-
def init_from_object(obj)
|
555
|
+
def init_from_object(obj, encode_html: true)
|
558
556
|
assert_respond_to(obj, :fetch)
|
559
557
|
|
560
558
|
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
@@ -562,7 +560,7 @@ module Wgit
|
|
562
560
|
@doc = init_nokogiri
|
563
561
|
@score = obj.fetch('score', 0.0)
|
564
562
|
|
565
|
-
|
563
|
+
Wgit::Utils.process_str(@html, encode: encode_html)
|
566
564
|
|
567
565
|
# Dynamically run the init_*_from_object methods.
|
568
566
|
Document.private_instance_methods(false).each do |method|
|
@@ -573,12 +571,6 @@ module Wgit
|
|
573
571
|
end
|
574
572
|
end
|
575
573
|
|
576
|
-
# Ensure the @url and @html Strings are correctly encoded etc.
|
577
|
-
def process_url_and_html
|
578
|
-
@url = Wgit::Utils.process_str(@url)
|
579
|
-
@html = Wgit::Utils.process_str(@html)
|
580
|
-
end
|
581
|
-
|
582
574
|
# Initialises an instance variable and defines a getter method for it.
|
583
575
|
#
|
584
576
|
# @param var [Symbol] The name of the variable to be initialized.
|
@@ -597,6 +589,7 @@ module Wgit
|
|
597
589
|
end
|
598
590
|
end
|
599
591
|
|
592
|
+
alias content html
|
600
593
|
alias statistics stats
|
601
594
|
alias internal_urls internal_links
|
602
595
|
alias internal_absolute_urls internal_absolute_links
|