wgit 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +39 -0
- data/LICENSE.txt +1 -1
- data/README.md +118 -323
- data/bin/wgit +9 -5
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +30 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +138 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +65 -162
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +133 -31
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- metadata +26 -14
data/lib/wgit/database/model.rb
CHANGED
@@ -14,8 +14,7 @@ module Wgit
|
|
14
14
|
raise 'url must respond_to? :to_h' unless url.respond_to?(:to_h)
|
15
15
|
|
16
16
|
model = url.to_h
|
17
|
-
|
18
|
-
Wgit::Utils.remove_non_bson_types(model)
|
17
|
+
select_bson_types(model)
|
19
18
|
end
|
20
19
|
|
21
20
|
# The data model for a Wgit::Document collection object.
|
@@ -28,7 +27,7 @@ module Wgit
|
|
28
27
|
model = doc.to_h(include_html: false, include_score: false)
|
29
28
|
model['url'] = url(doc.url) # Expand Url String into full object.
|
30
29
|
|
31
|
-
|
30
|
+
select_bson_types(model)
|
32
31
|
end
|
33
32
|
|
34
33
|
# Common fields when inserting a record into the DB.
|
@@ -49,5 +48,13 @@ module Wgit
|
|
49
48
|
date_modified: Wgit::Utils.time_stamp
|
50
49
|
}
|
51
50
|
end
|
51
|
+
|
52
|
+
# Returns the model having removed non bson types (for use with MongoDB).
|
53
|
+
#
|
54
|
+
# @param model_hash [Hash] The model Hash to sanitize.
|
55
|
+
# @return [Hash] The model Hash with non bson types removed.
|
56
|
+
def self.select_bson_types(model_hash)
|
57
|
+
model_hash.select { |_k, v| v.respond_to?(:bson_type) }
|
58
|
+
end
|
52
59
|
end
|
53
60
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -6,19 +6,19 @@ require 'json'
|
|
6
6
|
require 'set'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
10
10
|
# will work e.g. images etc. Also doubles as a search result when
|
11
11
|
# loading Documents from the database via `Wgit::Database#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
15
|
-
# Document class can be easily extended allowing you to
|
16
|
-
# a webpage that are important to you. See `Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
17
17
|
class Document
|
18
18
|
include Assertable
|
19
19
|
|
20
|
-
# Regex for the allowed var names when defining an
|
21
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
23
|
# Set of text elements used to build Document#text.
|
24
24
|
@text_elements = Set.new(%i[
|
@@ -29,8 +29,8 @@ module Wgit
|
|
29
29
|
summary sup td textarea th time u ul var wbr
|
30
30
|
])
|
31
31
|
|
32
|
-
# Set of Symbols representing the defined Document
|
33
|
-
@
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
34
34
|
|
35
35
|
class << self
|
36
36
|
# Set of HTML elements that make up the visible text on a page. These
|
@@ -38,9 +38,9 @@ module Wgit
|
|
38
38
|
# README.md for how to add to this Set dynamically.
|
39
39
|
attr_reader :text_elements
|
40
40
|
|
41
|
-
# Set of Symbols representing the defined Document
|
42
|
-
# read-only. Use Wgit::Document.
|
43
|
-
attr_reader :
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
44
44
|
end
|
45
45
|
|
46
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
@@ -50,7 +50,7 @@ module Wgit
|
|
50
50
|
attr_reader :html
|
51
51
|
|
52
52
|
# The Nokogiri::HTML document object initialized from @html.
|
53
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
54
54
|
|
55
55
|
# The score is only used following a `Database#search` and records matches.
|
56
56
|
attr_reader :score
|
@@ -62,7 +62,7 @@ module Wgit
|
|
62
62
|
#
|
63
63
|
# During initialisation, the Document will call any private
|
64
64
|
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
65
|
-
#
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
66
66
|
#
|
67
67
|
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
68
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
@@ -101,13 +101,16 @@ module Wgit
|
|
101
101
|
xpath
|
102
102
|
end
|
103
103
|
|
104
|
-
# Defines
|
105
|
-
# instance variables upon Document initialization. See the default
|
106
|
-
#
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
107
110
|
#
|
108
|
-
# Note that defined
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
109
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
110
|
-
# An
|
113
|
+
# An extractor once defined, initializes a private instance variable with
|
111
114
|
# the xpath or database object result(s).
|
112
115
|
#
|
113
116
|
# When initialising from HTML, a singleton value of true will only
|
@@ -118,15 +121,17 @@ module Wgit
|
|
118
121
|
# object), then a default will be used. The default value is:
|
119
122
|
# `singleton ? nil : []`.
|
120
123
|
#
|
121
|
-
# @param var [Symbol] The name of the variable to be initialised
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. Getter and setter methods are defined
|
126
|
+
# for the initialised variable.
|
122
127
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
123
128
|
# of the webpage. Only used when initializing from HTML.
|
124
129
|
#
|
125
130
|
# Pass a callable object (proc etc.) if you want the
|
126
131
|
# xpath value to be derived on Document initialisation (instead of when
|
127
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
128
133
|
# String.
|
129
|
-
# @param opts [Hash] The options to define an
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
130
135
|
# options are only used when initializing from HTML, not the database.
|
131
136
|
# @option opts [Boolean] :singleton The singleton option determines
|
132
137
|
# whether or not the result(s) should be in an Array. If multiple
|
@@ -147,46 +152,50 @@ module Wgit
|
|
147
152
|
# value. Return the block's value param unchanged if you want to inspect.
|
148
153
|
# @raise [StandardError] If the var param isn't valid.
|
149
154
|
# @return [Symbol] The given var Symbol if successful.
|
150
|
-
def self.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
151
156
|
var = var.to_sym
|
152
157
|
defaults = { singleton: true, text_content_only: true }
|
153
158
|
opts = defaults.merge(opts)
|
154
159
|
|
155
|
-
raise "var must match #{
|
156
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
157
162
|
|
158
163
|
# Define the private init_*_from_html method for HTML.
|
159
164
|
# Gets the HTML's xpath value and creates a var for it.
|
160
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
161
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
162
167
|
init_var(var, result)
|
163
168
|
end
|
164
169
|
Document.send(:private, func_name)
|
165
170
|
|
166
171
|
# Define the private init_*_from_object method for a Database object.
|
167
172
|
# Gets the Object's 'key' value and creates a var for it.
|
168
|
-
func_name = Document.send(
|
169
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
170
179
|
init_var(var, result)
|
171
180
|
end
|
172
181
|
Document.send(:private, func_name)
|
173
182
|
|
174
|
-
@
|
183
|
+
@extractors << var
|
175
184
|
var
|
176
185
|
end
|
177
186
|
|
178
|
-
# Removes the `init_*` methods created when an
|
179
|
-
# Therefore, this is the opposing method to `Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
180
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
181
190
|
#
|
182
|
-
# @param var [Symbol] The
|
183
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
184
193
|
# otherwise false.
|
185
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
186
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
187
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
188
197
|
|
189
|
-
@
|
198
|
+
@extractors.delete(var.to_sym)
|
190
199
|
true
|
191
200
|
rescue NameError
|
192
201
|
false
|
@@ -215,9 +224,9 @@ module Wgit
|
|
215
224
|
|
216
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
217
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
218
|
-
# present and relative, then @url.
|
219
|
-
# should be used instead of `doc.url.
|
220
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
221
230
|
#
|
222
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
223
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -236,12 +245,16 @@ module Wgit
|
|
236
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
237
246
|
# 'http://example.com/public'.
|
238
247
|
def base_url(link: nil)
|
239
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
240
248
|
if @url.relative? && @base.nil?
|
241
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
242
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
243
256
|
|
244
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
245
258
|
|
246
259
|
if link
|
247
260
|
link = Wgit::Url.new(link)
|
@@ -253,7 +266,7 @@ module Wgit
|
|
253
266
|
end
|
254
267
|
end
|
255
268
|
|
256
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
257
270
|
base_url.omit_fragment.omit_query
|
258
271
|
end
|
259
272
|
|
@@ -267,7 +280,7 @@ module Wgit
|
|
267
280
|
def to_h(include_html: false, include_score: true)
|
268
281
|
ignore = include_html ? [] : ['@html']
|
269
282
|
ignore << '@score' unless include_score
|
270
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
271
284
|
|
272
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
273
286
|
end
|
@@ -284,7 +297,7 @@ module Wgit
|
|
284
297
|
|
285
298
|
# Returns a Hash containing this Document's instance variables and
|
286
299
|
# their #length (if they respond to it). Works dynamically so that any
|
287
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
288
301
|
# the returned Hash as well. The number of text snippets as well as total
|
289
302
|
# number of textual bytes are always included in the returned Hash.
|
290
303
|
#
|
@@ -324,21 +337,39 @@ module Wgit
|
|
324
337
|
end
|
325
338
|
|
326
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
327
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
328
341
|
#
|
329
342
|
# @param xpath [String] The xpath to search the @html with.
|
330
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
331
344
|
def xpath(xpath)
|
332
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
333
346
|
end
|
334
347
|
|
335
|
-
# Uses Nokogiri's
|
336
|
-
# results.
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
355
|
+
end
|
356
|
+
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
337
359
|
#
|
338
360
|
# @param selector [String] The CSS selector to search the @html with.
|
339
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
340
362
|
def css(selector)
|
341
|
-
@
|
363
|
+
@parser.css(selector)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
342
373
|
end
|
343
374
|
|
344
375
|
# Returns all unique internal links from this Document in relative form.
|
@@ -356,13 +387,13 @@ module Wgit
|
|
356
387
|
return [] if @links.empty?
|
357
388
|
|
358
389
|
links = @links
|
359
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
360
391
|
.map(&:omit_base)
|
361
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
362
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
363
394
|
end
|
364
395
|
|
365
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
366
397
|
end
|
367
398
|
|
368
399
|
# Returns all unique internal links from this Document in absolute form by
|
@@ -371,7 +402,7 @@ module Wgit
|
|
371
402
|
#
|
372
403
|
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
373
404
|
def internal_absolute_links
|
374
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
375
406
|
end
|
376
407
|
|
377
408
|
# Returns all unique external links from this Document in absolute form.
|
@@ -382,10 +413,10 @@ module Wgit
|
|
382
413
|
return [] if @links.empty?
|
383
414
|
|
384
415
|
links = @links
|
385
|
-
.reject { |link| link.relative?(host: @url.
|
416
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
386
417
|
.map(&:omit_trailing_slash)
|
387
418
|
|
388
|
-
Wgit::Utils.
|
419
|
+
Wgit::Utils.sanitize(links)
|
389
420
|
end
|
390
421
|
|
391
422
|
# Searches the @text for the given query and returns the results.
|
@@ -400,8 +431,8 @@ module Wgit
|
|
400
431
|
# original sentence, whichever is less. The algorithm obviously ensures
|
401
432
|
# that the search query is visible somewhere in the sentence.
|
402
433
|
#
|
403
|
-
# @param query [
|
404
|
-
# @text for.
|
434
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
435
|
+
# document's @text for.
|
405
436
|
# @param case_sensitive [Boolean] Whether character case must match.
|
406
437
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
407
438
|
# for separately.
|
@@ -411,12 +442,16 @@ module Wgit
|
|
411
442
|
def search(
|
412
443
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
413
444
|
)
|
414
|
-
query = query.to_s
|
415
|
-
raise 'A search query must be provided' if query.empty?
|
416
445
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
417
446
|
|
418
|
-
|
419
|
-
|
447
|
+
if query.is_a?(Regexp)
|
448
|
+
regex = query
|
449
|
+
else # respond_to? #to_s == true
|
450
|
+
query = query.to_s
|
451
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
452
|
+
regex = Regexp.new(query, !case_sensitive)
|
453
|
+
end
|
454
|
+
|
420
455
|
results = {}
|
421
456
|
|
422
457
|
@text.each do |sentence|
|
@@ -443,8 +478,8 @@ module Wgit
|
|
443
478
|
# functionality. The original text is returned; no other reference to it
|
444
479
|
# is kept thereafter.
|
445
480
|
#
|
446
|
-
# @param query [
|
447
|
-
# @text for.
|
481
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
482
|
+
# document's @text for.
|
448
483
|
# @param case_sensitive [Boolean] Whether character case must match.
|
449
484
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
450
485
|
# for separately.
|
@@ -463,13 +498,31 @@ module Wgit
|
|
463
498
|
orig_text
|
464
499
|
end
|
465
500
|
|
501
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
502
|
+
# parameter.
|
503
|
+
#
|
504
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
505
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
506
|
+
# Object) : results (Array).
|
507
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
508
|
+
# (String) : result (Nokogiri Object).
|
509
|
+
# @return [String, Object] The value found in the html or the default value
|
510
|
+
# (singleton ? nil : []).
|
511
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
512
|
+
send(
|
513
|
+
:extract_from_html, xpath,
|
514
|
+
singleton: singleton, text_content_only: text_content_only
|
515
|
+
)
|
516
|
+
end
|
517
|
+
|
466
518
|
protected
|
467
519
|
|
468
520
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
469
521
|
# Override this method to custom configure the Nokogiri object returned.
|
470
522
|
# Gets called from Wgit::Document.new upon initialization.
|
471
523
|
#
|
472
|
-
# @yield [config] The given block is passed to Nokogiri::HTML for
|
524
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
525
|
+
# initialisation.
|
473
526
|
# @raise [StandardError] If @html isn't set.
|
474
527
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
475
528
|
def init_nokogiri(&block)
|
@@ -481,7 +534,7 @@ module Wgit
|
|
481
534
|
# Extracts a value/object from this Document's @html using the given xpath
|
482
535
|
# parameter.
|
483
536
|
#
|
484
|
-
# @param xpath [String] Used to find the value/object in @html.
|
537
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
485
538
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
486
539
|
# Object) : results (Array).
|
487
540
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
@@ -497,23 +550,16 @@ module Wgit
|
|
497
550
|
# the block's `value` param unchanged if you simply want to inspect it.
|
498
551
|
# @return [String, Object] The value found in the html or the default value
|
499
552
|
# (singleton ? nil : []).
|
500
|
-
def
|
501
|
-
|
502
|
-
|
503
|
-
results = @doc.xpath(xpath)
|
504
|
-
|
505
|
-
return default if results.nil? || results.empty?
|
553
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
554
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
555
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
506
556
|
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
text_content_only ? results.map(&:content) : results
|
511
|
-
end
|
512
|
-
|
513
|
-
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
557
|
+
if text_content_only
|
558
|
+
result = singleton ? result&.content : result.map(&:content)
|
559
|
+
end
|
514
560
|
|
561
|
+
Wgit::Utils.sanitize(result)
|
515
562
|
result = yield(result, self, :document) if block_given?
|
516
|
-
|
517
563
|
result
|
518
564
|
end
|
519
565
|
|
@@ -533,16 +579,14 @@ module Wgit
|
|
533
579
|
# the block's `value` param unchanged if you simply want to inspect it.
|
534
580
|
# @return [String, Object] The value found in the obj or the default value
|
535
581
|
# (singleton ? nil : []).
|
536
|
-
def
|
582
|
+
def extract_from_object(obj, key, singleton: true)
|
537
583
|
assert_respond_to(obj, :fetch)
|
538
584
|
|
539
585
|
default = singleton ? nil : []
|
540
586
|
result = obj.fetch(key.to_s, default)
|
541
587
|
|
542
|
-
|
543
|
-
|
588
|
+
Wgit::Utils.sanitize(result)
|
544
589
|
result = yield(result, obj, :object) if block_given?
|
545
|
-
|
546
590
|
result
|
547
591
|
end
|
548
592
|
|
@@ -556,12 +600,12 @@ module Wgit
|
|
556
600
|
url = Wgit::Url.parse(url)
|
557
601
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
558
602
|
|
559
|
-
@url
|
560
|
-
@html
|
561
|
-
@
|
562
|
-
@score
|
603
|
+
@url = url
|
604
|
+
@html = html || ''
|
605
|
+
@parser = init_nokogiri
|
606
|
+
@score = 0.0
|
563
607
|
|
564
|
-
Wgit::Utils.
|
608
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
565
609
|
|
566
610
|
# Dynamically run the init_*_from_html methods.
|
567
611
|
Document.private_instance_methods(false).each do |method|
|
@@ -577,12 +621,12 @@ module Wgit
|
|
577
621
|
def init_from_object(obj, encode: true)
|
578
622
|
assert_respond_to(obj, :fetch)
|
579
623
|
|
580
|
-
@url
|
581
|
-
@html
|
582
|
-
@
|
583
|
-
@score
|
624
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
625
|
+
@html = obj.fetch('html', '')
|
626
|
+
@parser = init_nokogiri
|
627
|
+
@score = obj.fetch('score', 0.0)
|
584
628
|
|
585
|
-
Wgit::Utils.
|
629
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
586
630
|
|
587
631
|
# Dynamically run the init_*_from_object methods.
|
588
632
|
Document.private_instance_methods(false).each do |method|
|
@@ -593,11 +637,11 @@ module Wgit
|
|
593
637
|
end
|
594
638
|
end
|
595
639
|
|
596
|
-
# Initialises an instance variable and defines
|
640
|
+
# Initialises an instance variable and defines an accessor method for it.
|
597
641
|
#
|
598
642
|
# @param var [Symbol] The name of the variable to be initialized.
|
599
643
|
# @param value [Object] The newly initialized variable's value.
|
600
|
-
# @return [Symbol] The name of the
|
644
|
+
# @return [Symbol] The name of the defined getter method.
|
601
645
|
def init_var(var, value)
|
602
646
|
# instance_var_name starts with @, var_name doesn't.
|
603
647
|
var = var.to_s
|
@@ -605,10 +649,9 @@ module Wgit
|
|
605
649
|
instance_var_name = "@#{var_name}".to_sym
|
606
650
|
|
607
651
|
instance_variable_set(instance_var_name, value)
|
652
|
+
Wgit::Document.attr_accessor(var_name)
|
608
653
|
|
609
|
-
|
610
|
-
instance_variable_get(instance_var_name)
|
611
|
-
end
|
654
|
+
var_name
|
612
655
|
end
|
613
656
|
|
614
657
|
alias content html
|