wgit 0.8.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +39 -0
- data/LICENSE.txt +1 -1
- data/README.md +118 -323
- data/bin/wgit +9 -5
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +138 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +133 -31
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- metadata +26 -14
data/lib/wgit/database/model.rb
CHANGED
@@ -14,8 +14,7 @@ module Wgit
|
|
14
14
|
raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
|
15
15
|
|
16
16
|
model = url.to_h
|
17
|
-
|
18
|
-
Wgit::Utils.remove_non_bson_types(model)
|
17
|
+
select_bson_types(model)
|
19
18
|
end
|
20
19
|
|
21
20
|
# The data model for a Wgit::Document collection object.
|
@@ -28,7 +27,7 @@ module Wgit
|
|
28
27
|
model = doc.to_h(include_html: false, include_score: false)
|
29
28
|
model['url'] = url(doc.url) # Expand Url String into full object.
|
30
29
|
|
31
|
-
|
30
|
+
select_bson_types(model)
|
32
31
|
end
|
33
32
|
|
34
33
|
# Common fields when inserting a record into the DB.
|
@@ -49,5 +48,13 @@ module Wgit
|
|
49
48
|
date_modified: Wgit::Utils.time_stamp
|
50
49
|
}
|
51
50
|
end
|
51
|
+
|
52
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
53
|
+
#
|
54
|
+
# @param model_hash [Hash] The model Hash to sanitize.
|
55
|
+
# @return [Hash] The model Hash with non bson types removed.
|
56
|
+
def self.select_bson_types(model_hash)
|
57
|
+
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
58
|
+
end
|
52
59
|
end
|
53
60
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -6,19 +6,19 @@ require 'json'
|
|
6
6
|
require 'set'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
10
10
|
# will work e.g. images etc. Also doubles as a search result when
|
11
11
|
# loading Documents from the database via `Wgit::Database#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
15
|
-
# Document class can be easily extended allowing you to
|
16
|
-
# a webpage that are important to you. See `Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
17
17
|
class Document
|
18
18
|
include Assertable
|
19
19
|
|
20
|
-
# Regex for the allowed var names when defining an
|
21
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
23
|
# Set of text elements used to build Document#text.
|
24
24
|
@text_elements = Set.new(%i[
|
@@ -29,8 +29,8 @@ module Wgit
|
|
29
29
|
summary sup td textarea th time u ul var wbr
|
30
30
|
])
|
31
31
|
|
32
|
-
# Set of Symbols representing the defined Document
|
33
|
-
@
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
34
34
|
|
35
35
|
class << self
|
36
36
|
# Set of HTML elements that make up the visible text on a page. These
|
@@ -38,9 +38,9 @@ module Wgit
|
|
38
38
|
# README.md for how to add to this Set dynamically.
|
39
39
|
attr_reader :text_elements
|
40
40
|
|
41
|
-
# Set of Symbols representing the defined Document
|
42
|
-
# read-only. Use Wgit::Document.
|
43
|
-
attr_reader :
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
44
44
|
end
|
45
45
|
|
46
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
@@ -50,7 +50,7 @@ module Wgit
|
|
50
50
|
attr_reader :html
|
51
51
|
|
52
52
|
# The Nokogiri::HTML document object initialized from @html.
|
53
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
54
54
|
|
55
55
|
# The score is only used following a `Database#search` and records matches.
|
56
56
|
attr_reader :score
|
@@ -62,7 +62,7 @@ module Wgit
|
|
62
62
|
#
|
63
63
|
# During initialisation, the Document will call any private
|
64
64
|
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
65
|
-
#
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
66
66
|
#
|
67
67
|
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
68
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
@@ -101,13 +101,16 @@ module Wgit
|
|
101
101
|
xpath
|
102
102
|
end
|
103
103
|
|
104
|
-
# Defines
|
105
|
-
# instance variables upon Document initialization. See the default
|
106
|
-
#
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
107
110
|
#
|
108
|
-
# Note that defined
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
109
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
110
|
-
# An
|
113
|
+
# An extractor, once defined, initializes a private instance variable with
|
111
114
|
# the xpath or database object result(s).
|
112
115
|
#
|
113
116
|
# When initialising from HTML, a singleton value of true will only
|
@@ -118,15 +121,17 @@ module Wgit
|
|
118
121
|
# object), then a default will be used. The default value is:
|
119
122
|
# `singleton ? nil : []`.
|
120
123
|
#
|
121
|
-
# @param var [Symbol] The name of the variable to be initialised
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. Getter and setter methods are defined
|
126
|
+
# for the initialised variable.
|
122
127
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
123
128
|
# of the webpage. Only used when initializing from HTML.
|
124
129
|
#
|
125
130
|
# Pass a callable object (proc etc.) if you want the
|
126
131
|
# xpath value to be derived on Document initialisation (instead of when
|
127
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
128
133
|
# String.
|
129
|
-
# @param opts [Hash] The options to define an
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
130
135
|
# options are only used when initializing from HTML, not the database.
|
131
136
|
# @option opts [Boolean] :singleton The singleton option determines
|
132
137
|
# whether or not the result(s) should be in an Array. If multiple
|
@@ -147,46 +152,50 @@ module Wgit
|
|
147
152
|
# value. Return the block's value param unchanged if you want to inspect.
|
148
153
|
# @raise [StandardError] If the var param isn't valid.
|
149
154
|
# @return [Symbol] The given var Symbol if successful.
|
150
|
-
def self.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
151
156
|
var = var.to_sym
|
152
157
|
defaults = { singleton: true, text_content_only: true }
|
153
158
|
opts = defaults.merge(opts)
|
154
159
|
|
155
|
-
raise "var must match #{
|
156
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
157
162
|
|
158
163
|
# Define the private init_*_from_html method for HTML.
|
159
164
|
# Gets the HTML's xpath value and creates a var for it.
|
160
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
161
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
162
167
|
init_var(var, result)
|
163
168
|
end
|
164
169
|
Document.send(:private, func_name)
|
165
170
|
|
166
171
|
# Define the private init_*_from_object method for a Database object.
|
167
172
|
# Gets the Object's 'key' value and creates a var for it.
|
168
|
-
func_name = Document.send(
|
169
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
170
179
|
init_var(var, result)
|
171
180
|
end
|
172
181
|
Document.send(:private, func_name)
|
173
182
|
|
174
|
-
@
|
183
|
+
@extractors << var
|
175
184
|
var
|
176
185
|
end
|
177
186
|
|
178
|
-
# Removes the `init_*` methods created when an
|
179
|
-
# Therefore, this is the opposing method to `Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
180
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
181
190
|
#
|
182
|
-
# @param var [Symbol] The
|
183
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
184
193
|
# otherwise false.
|
185
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
186
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
187
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
188
197
|
|
189
|
-
@
|
198
|
+
@extractors.delete(var.to_sym)
|
190
199
|
true
|
191
200
|
rescue NameError
|
192
201
|
false
|
@@ -215,9 +224,9 @@ module Wgit
|
|
215
224
|
|
216
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
217
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
218
|
-
# present and relative, then @url.
|
219
|
-
# should be used instead of `doc.url.
|
220
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
221
230
|
#
|
222
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
223
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -236,12 +245,16 @@ module Wgit
|
|
236
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
237
246
|
# 'http://example.com/public'.
|
238
247
|
def base_url(link: nil)
|
239
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
240
248
|
if @url.relative? && @base.nil?
|
241
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
242
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
243
256
|
|
244
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
245
258
|
|
246
259
|
if link
|
247
260
|
link = Wgit::Url.new(link)
|
@@ -253,7 +266,7 @@ module Wgit
|
|
253
266
|
end
|
254
267
|
end
|
255
268
|
|
256
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
257
270
|
base_url.omit_fragment.omit_query
|
258
271
|
end
|
259
272
|
|
@@ -267,7 +280,7 @@ module Wgit
|
|
267
280
|
def to_h(include_html: false, include_score: true)
|
268
281
|
ignore = include_html ? [] : ['@html']
|
269
282
|
ignore << '@score' unless include_score
|
270
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
271
284
|
|
272
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
273
286
|
end
|
@@ -284,7 +297,7 @@ module Wgit
|
|
284
297
|
|
285
298
|
# Returns a Hash containing this Document's instance variables and
|
286
299
|
# their #length (if they respond to it). Works dynamically so that any
|
287
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
288
301
|
# the returned Hash as well. The number of text snippets as well as total
|
289
302
|
# number of textual bytes are always included in the returned Hash.
|
290
303
|
#
|
@@ -324,21 +337,39 @@ module Wgit
|
|
324
337
|
end
|
325
338
|
|
326
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
327
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
328
341
|
#
|
329
342
|
# @param xpath [String] The xpath to search the @html with.
|
330
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
331
344
|
def xpath(xpath)
|
332
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
333
346
|
end
|
334
347
|
|
335
|
-
# Uses Nokogiri's
|
336
|
-
# results.
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
355
|
+
end
|
356
|
+
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
337
359
|
#
|
338
360
|
# @param selector [String] The CSS selector to search the @html with.
|
339
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
340
362
|
def css(selector)
|
341
|
-
@
|
363
|
+
@parser.css(selector)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
342
373
|
end
|
343
374
|
|
344
375
|
# Returns all unique internal links from this Document in relative form.
|
@@ -356,13 +387,13 @@ module Wgit
|
|
356
387
|
return [] if @links.empty?
|
357
388
|
|
358
389
|
links = @links
|
359
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
360
391
|
.map(&:omit_base)
|
361
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
362
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
363
394
|
end
|
364
395
|
|
365
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
366
397
|
end
|
367
398
|
|
368
399
|
# Returns all unique internal links from this Document in absolute form by
|
@@ -371,7 +402,7 @@ module Wgit
|
|
371
402
|
#
|
372
403
|
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
373
404
|
def internal_absolute_links
|
374
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
375
406
|
end
|
376
407
|
|
377
408
|
# Returns all unique external links from this Document in absolute form.
|
@@ -382,10 +413,10 @@ module Wgit
|
|
382
413
|
return [] if @links.empty?
|
383
414
|
|
384
415
|
links = @links
|
385
|
-
.reject { |link| link.relative?(host: @url.
|
416
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
386
417
|
.map(&:omit_trailing_slash)
|
387
418
|
|
388
|
-
Wgit::Utils.
|
419
|
+
Wgit::Utils.sanitize(links)
|
389
420
|
end
|
390
421
|
|
391
422
|
# Searches the @text for the given query and returns the results.
|
@@ -400,8 +431,8 @@ module Wgit
|
|
400
431
|
# original sentence, whichever is less. The algorithm obviously ensures
|
401
432
|
# that the search query is visible somewhere in the sentence.
|
402
433
|
#
|
403
|
-
# @param query [
|
404
|
-
# @text for.
|
434
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
435
|
+
# document's @text for.
|
405
436
|
# @param case_sensitive [Boolean] Whether character case must match.
|
406
437
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
407
438
|
# for separately.
|
@@ -411,12 +442,16 @@ module Wgit
|
|
411
442
|
def search(
|
412
443
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
413
444
|
)
|
414
|
-
query = query.to_s
|
415
|
-
raise 'A search query must be provided' if query.empty?
|
416
445
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
417
446
|
|
418
|
-
|
419
|
-
|
447
|
+
if query.is_a?(Regexp)
|
448
|
+
regex = query
|
449
|
+
else # respond_to? #to_s == true
|
450
|
+
query = query.to_s
|
451
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
452
|
+
regex = Regexp.new(query, !case_sensitive)
|
453
|
+
end
|
454
|
+
|
420
455
|
results = {}
|
421
456
|
|
422
457
|
@text.each do |sentence|
|
@@ -443,8 +478,8 @@ module Wgit
|
|
443
478
|
# functionality. The original text is returned; no other reference to it
|
444
479
|
# is kept thereafter.
|
445
480
|
#
|
446
|
-
# @param query [
|
447
|
-
# @text for.
|
481
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
482
|
+
# document's @text for.
|
448
483
|
# @param case_sensitive [Boolean] Whether character case must match.
|
449
484
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
450
485
|
# for separately.
|
@@ -463,13 +498,31 @@ module Wgit
|
|
463
498
|
orig_text
|
464
499
|
end
|
465
500
|
|
501
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
502
|
+
# parameter.
|
503
|
+
#
|
504
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
505
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
506
|
+
# Object) : results (Array).
|
507
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
508
|
+
# (String) : result (Nokogiri Object).
|
509
|
+
# @return [String, Object] The value found in the html or the default value
|
510
|
+
# (singleton ? nil : []).
|
511
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
512
|
+
send(
|
513
|
+
:extract_from_html, xpath,
|
514
|
+
singleton: singleton, text_content_only: text_content_only
|
515
|
+
)
|
516
|
+
end
|
517
|
+
|
466
518
|
protected
|
467
519
|
|
468
520
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
469
521
|
# Override this method to custom configure the Nokogiri object returned.
|
470
522
|
# Gets called from Wgit::Document.new upon initialization.
|
471
523
|
#
|
472
|
-
# @yield [config] The given block is passed to Nokogiri::HTML for
|
524
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
525
|
+
# initialisation.
|
473
526
|
# @raise [StandardError] If @html isn't set.
|
474
527
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
475
528
|
def init_nokogiri(&block)
|
@@ -481,7 +534,7 @@ module Wgit
|
|
481
534
|
# Extracts a value/object from this Document's @html using the given xpath
|
482
535
|
# parameter.
|
483
536
|
#
|
484
|
-
# @param xpath [String] Used to find the value/object in @html.
|
537
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
485
538
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
486
539
|
# Object) : results (Array).
|
487
540
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
@@ -497,23 +550,16 @@ module Wgit
|
|
497
550
|
# the block's `value` param unchanged if you simply want to inspect it.
|
498
551
|
# @return [String, Object] The value found in the html or the default value
|
499
552
|
# (singleton ? nil : []).
|
500
|
-
def
|
501
|
-
|
502
|
-
|
503
|
-
results = @doc.xpath(xpath)
|
504
|
-
|
505
|
-
return default if results.nil? || results.empty?
|
553
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
554
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
555
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
506
556
|
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
text_content_only ? results.map(&:content) : results
|
511
|
-
end
|
512
|
-
|
513
|
-
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
557
|
+
if text_content_only
|
558
|
+
result = singleton ? result&.content : result.map(&:content)
|
559
|
+
end
|
514
560
|
|
561
|
+
Wgit::Utils.sanitize(result)
|
515
562
|
result = yield(result, self, :document) if block_given?
|
516
|
-
|
517
563
|
result
|
518
564
|
end
|
519
565
|
|
@@ -533,16 +579,14 @@ module Wgit
|
|
533
579
|
# the block's `value` param unchanged if you simply want to inspect it.
|
534
580
|
# @return [String, Object] The value found in the obj or the default value
|
535
581
|
# (singleton ? nil : []).
|
536
|
-
def
|
582
|
+
def extract_from_object(obj, key, singleton: true)
|
537
583
|
assert_respond_to(obj, :fetch)
|
538
584
|
|
539
585
|
default = singleton ? nil : []
|
540
586
|
result = obj.fetch(key.to_s, default)
|
541
587
|
|
542
|
-
|
543
|
-
|
588
|
+
Wgit::Utils.sanitize(result)
|
544
589
|
result = yield(result, obj, :object) if block_given?
|
545
|
-
|
546
590
|
result
|
547
591
|
end
|
548
592
|
|
@@ -556,12 +600,12 @@ module Wgit
|
|
556
600
|
url = Wgit::Url.parse(url)
|
557
601
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
558
602
|
|
559
|
-
@url
|
560
|
-
@html
|
561
|
-
@
|
562
|
-
@score
|
603
|
+
@url = url
|
604
|
+
@html = html || ''
|
605
|
+
@parser = init_nokogiri
|
606
|
+
@score = 0.0
|
563
607
|
|
564
|
-
Wgit::Utils.
|
608
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
565
609
|
|
566
610
|
# Dynamically run the init_*_from_html methods.
|
567
611
|
Document.private_instance_methods(false).each do |method|
|
@@ -577,12 +621,12 @@ module Wgit
|
|
577
621
|
def init_from_object(obj, encode: true)
|
578
622
|
assert_respond_to(obj, :fetch)
|
579
623
|
|
580
|
-
@url
|
581
|
-
@html
|
582
|
-
@
|
583
|
-
@score
|
624
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
625
|
+
@html = obj.fetch('html', '')
|
626
|
+
@parser = init_nokogiri
|
627
|
+
@score = obj.fetch('score', 0.0)
|
584
628
|
|
585
|
-
Wgit::Utils.
|
629
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
586
630
|
|
587
631
|
# Dynamically run the init_*_from_object methods.
|
588
632
|
Document.private_instance_methods(false).each do |method|
|
@@ -593,11 +637,11 @@ module Wgit
|
|
593
637
|
end
|
594
638
|
end
|
595
639
|
|
596
|
-
# Initialises an instance variable and defines
|
640
|
+
# Initialises an instance variable and defines an accessor method for it.
|
597
641
|
#
|
598
642
|
# @param var [Symbol] The name of the variable to be initialized.
|
599
643
|
# @param value [Object] The newly initialized variable's value.
|
600
|
-
# @return [Symbol] The name of the
|
644
|
+
# @return [Symbol] The name of the defined getter method.
|
601
645
|
def init_var(var, value)
|
602
646
|
# instance_var_name starts with @, var_name doesn't.
|
603
647
|
var = var.to_s
|
@@ -605,10 +649,9 @@ module Wgit
|
|
605
649
|
instance_var_name = "@#{var_name}".to_sym
|
606
650
|
|
607
651
|
instance_variable_set(instance_var_name, value)
|
652
|
+
Wgit::Document.attr_accessor(var_name)
|
608
653
|
|
609
|
-
|
610
|
-
instance_variable_get(instance_var_name)
|
611
|
-
end
|
654
|
+
var_name
|
612
655
|
end
|
613
656
|
|
614
657
|
alias content html
|