wgit 0.8.0 → 0.10.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/document.rb
CHANGED
@@ -6,19 +6,19 @@ require 'json'
|
|
6
6
|
require 'set'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
10
10
|
# will work e.g. images etc. Also doubles as a search result when
|
11
11
|
# loading Documents from the database via `Wgit::Database#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
15
|
-
# Document class can be easily extended allowing you to
|
16
|
-
# a webpage that are important to you. See `Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
17
17
|
class Document
|
18
18
|
include Assertable
|
19
19
|
|
20
|
-
# Regex for the allowed var names when defining an
|
21
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
23
|
# Set of text elements used to build Document#text.
|
24
24
|
@text_elements = Set.new(%i[
|
@@ -29,8 +29,8 @@ module Wgit
|
|
29
29
|
summary sup td textarea th time u ul var wbr
|
30
30
|
])
|
31
31
|
|
32
|
-
# Set of Symbols representing the defined Document
|
33
|
-
@
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
34
34
|
|
35
35
|
class << self
|
36
36
|
# Set of HTML elements that make up the visible text on a page. These
|
@@ -38,9 +38,9 @@ module Wgit
|
|
38
38
|
# README.md for how to add to this Set dynamically.
|
39
39
|
attr_reader :text_elements
|
40
40
|
|
41
|
-
# Set of Symbols representing the defined Document
|
42
|
-
# read-only. Use Wgit::Document.
|
43
|
-
attr_reader :
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
44
44
|
end
|
45
45
|
|
46
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
@@ -50,7 +50,7 @@ module Wgit
|
|
50
50
|
attr_reader :html
|
51
51
|
|
52
52
|
# The Nokogiri::HTML document object initialized from @html.
|
53
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
54
54
|
|
55
55
|
# The score is only used following a `Database#search` and records matches.
|
56
56
|
attr_reader :score
|
@@ -62,7 +62,7 @@ module Wgit
|
|
62
62
|
#
|
63
63
|
# During initialisation, the Document will call any private
|
64
64
|
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
65
|
-
#
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
66
66
|
#
|
67
67
|
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
68
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
@@ -101,13 +101,16 @@ module Wgit
|
|
101
101
|
xpath
|
102
102
|
end
|
103
103
|
|
104
|
-
# Defines
|
105
|
-
# instance variables upon Document initialization. See the default
|
106
|
-
#
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
107
110
|
#
|
108
|
-
# Note that defined
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
109
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
110
|
-
# An
|
113
|
+
# An extractor, once defined, initializes a private instance variable with
|
111
114
|
# the xpath or database object result(s).
|
112
115
|
#
|
113
116
|
# When initialising from HTML, a singleton value of true will only
|
@@ -118,15 +121,17 @@ module Wgit
|
|
118
121
|
# object), then a default will be used. The default value is:
|
119
122
|
# `singleton ? nil : []`.
|
120
123
|
#
|
121
|
-
# @param var [Symbol] The name of the variable to be initialised
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. A getter and setter method is defined
|
126
|
+
# for the initialised variable.
|
122
127
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
123
128
|
# of the webpage. Only used when initializing from HTML.
|
124
129
|
#
|
125
130
|
# Pass a callable object (proc etc.) if you want the
|
126
131
|
# xpath value to be derived on Document initialisation (instead of when
|
127
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
128
133
|
# String.
|
129
|
-
# @param opts [Hash] The options to define an
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
130
135
|
# options are only used when initializing from HTML, not the database.
|
131
136
|
# @option opts [Boolean] :singleton The singleton option determines
|
132
137
|
# whether or not the result(s) should be in an Array. If multiple
|
@@ -147,46 +152,50 @@ module Wgit
|
|
147
152
|
# value. Return the block's value param unchanged if you want to inspect.
|
148
153
|
# @raise [StandardError] If the var param isn't valid.
|
149
154
|
# @return [Symbol] The given var Symbol if successful.
|
150
|
-
def self.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
151
156
|
var = var.to_sym
|
152
157
|
defaults = { singleton: true, text_content_only: true }
|
153
158
|
opts = defaults.merge(opts)
|
154
159
|
|
155
|
-
raise "var must match #{
|
156
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
157
162
|
|
158
163
|
# Define the private init_*_from_html method for HTML.
|
159
164
|
# Gets the HTML's xpath value and creates a var for it.
|
160
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
161
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
162
167
|
init_var(var, result)
|
163
168
|
end
|
164
169
|
Document.send(:private, func_name)
|
165
170
|
|
166
171
|
# Define the private init_*_from_object method for a Database object.
|
167
172
|
# Gets the Object's 'key' value and creates a var for it.
|
168
|
-
func_name = Document.send(
|
169
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
170
179
|
init_var(var, result)
|
171
180
|
end
|
172
181
|
Document.send(:private, func_name)
|
173
182
|
|
174
|
-
@
|
183
|
+
@extractors << var
|
175
184
|
var
|
176
185
|
end
|
177
186
|
|
178
|
-
# Removes the `init_*` methods created when an
|
179
|
-
# Therefore, this is the opposing method to `Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
180
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
181
190
|
#
|
182
|
-
# @param var [Symbol] The
|
183
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
184
193
|
# otherwise false.
|
185
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
186
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
187
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
188
197
|
|
189
|
-
@
|
198
|
+
@extractors.delete(var.to_sym)
|
190
199
|
true
|
191
200
|
rescue NameError
|
192
201
|
false
|
@@ -215,9 +224,9 @@ module Wgit
|
|
215
224
|
|
216
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
217
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
218
|
-
# present and relative, then @url.
|
219
|
-
# should be used instead of `doc.url.
|
220
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
221
230
|
#
|
222
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
223
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -236,12 +245,16 @@ module Wgit
|
|
236
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
237
246
|
# 'http://example.com/public'.
|
238
247
|
def base_url(link: nil)
|
239
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
240
248
|
if @url.relative? && @base.nil?
|
241
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
242
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
243
256
|
|
244
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
245
258
|
|
246
259
|
if link
|
247
260
|
link = Wgit::Url.new(link)
|
@@ -253,7 +266,7 @@ module Wgit
|
|
253
266
|
end
|
254
267
|
end
|
255
268
|
|
256
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
257
270
|
base_url.omit_fragment.omit_query
|
258
271
|
end
|
259
272
|
|
@@ -267,7 +280,7 @@ module Wgit
|
|
267
280
|
def to_h(include_html: false, include_score: true)
|
268
281
|
ignore = include_html ? [] : ['@html']
|
269
282
|
ignore << '@score' unless include_score
|
270
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
271
284
|
|
272
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
273
286
|
end
|
@@ -284,7 +297,7 @@ module Wgit
|
|
284
297
|
|
285
298
|
# Returns a Hash containing this Document's instance variables and
|
286
299
|
# their #length (if they respond to it). Works dynamically so that any
|
287
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
288
301
|
# the returned Hash as well. The number of text snippets as well as total
|
289
302
|
# number of textual bytes are always included in the returned Hash.
|
290
303
|
#
|
@@ -324,21 +337,39 @@ module Wgit
|
|
324
337
|
end
|
325
338
|
|
326
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
327
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
328
341
|
#
|
329
342
|
# @param xpath [String] The xpath to search the @html with.
|
330
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
331
344
|
def xpath(xpath)
|
332
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
333
346
|
end
|
334
347
|
|
335
|
-
# Uses Nokogiri's
|
336
|
-
# results.
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
355
|
+
end
|
356
|
+
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
337
359
|
#
|
338
360
|
# @param selector [String] The CSS selector to search the @html with.
|
339
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
340
362
|
def css(selector)
|
341
|
-
@
|
363
|
+
@parser.css(selector)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
342
373
|
end
|
343
374
|
|
344
375
|
# Returns all unique internal links from this Document in relative form.
|
@@ -356,13 +387,13 @@ module Wgit
|
|
356
387
|
return [] if @links.empty?
|
357
388
|
|
358
389
|
links = @links
|
359
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
360
391
|
.map(&:omit_base)
|
361
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
362
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
363
394
|
end
|
364
395
|
|
365
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
366
397
|
end
|
367
398
|
|
368
399
|
# Returns all unique internal links from this Document in absolute form by
|
@@ -371,7 +402,7 @@ module Wgit
|
|
371
402
|
#
|
372
403
|
# @return [Array<Wgit::Url>] Self's unique internal Urls in absolute form.
|
373
404
|
def internal_absolute_links
|
374
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
375
406
|
end
|
376
407
|
|
377
408
|
# Returns all unique external links from this Document in absolute form.
|
@@ -382,10 +413,17 @@ module Wgit
|
|
382
413
|
return [] if @links.empty?
|
383
414
|
|
384
415
|
links = @links
|
385
|
-
.
|
416
|
+
.map do |link|
|
417
|
+
if link.scheme_relative?
|
418
|
+
link.prefix_scheme(@url.to_scheme.to_sym)
|
419
|
+
else
|
420
|
+
link
|
421
|
+
end
|
422
|
+
end
|
423
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
386
424
|
.map(&:omit_trailing_slash)
|
387
425
|
|
388
|
-
Wgit::Utils.
|
426
|
+
Wgit::Utils.sanitize(links)
|
389
427
|
end
|
390
428
|
|
391
429
|
# Searches the @text for the given query and returns the results.
|
@@ -400,8 +438,8 @@ module Wgit
|
|
400
438
|
# original sentence, whichever is less. The algorithm obviously ensures
|
401
439
|
# that the search query is visible somewhere in the sentence.
|
402
440
|
#
|
403
|
-
# @param query [
|
404
|
-
# @text for.
|
441
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
442
|
+
# document's @text for.
|
405
443
|
# @param case_sensitive [Boolean] Whether character case must match.
|
406
444
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
407
445
|
# for separately.
|
@@ -411,12 +449,16 @@ module Wgit
|
|
411
449
|
def search(
|
412
450
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
413
451
|
)
|
414
|
-
query = query.to_s
|
415
|
-
raise 'A search query must be provided' if query.empty?
|
416
452
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
417
453
|
|
418
|
-
|
419
|
-
|
454
|
+
if query.is_a?(Regexp)
|
455
|
+
regex = query
|
456
|
+
else # respond_to? #to_s == true
|
457
|
+
query = query.to_s
|
458
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
459
|
+
regex = Regexp.new(query, !case_sensitive)
|
460
|
+
end
|
461
|
+
|
420
462
|
results = {}
|
421
463
|
|
422
464
|
@text.each do |sentence|
|
@@ -443,8 +485,8 @@ module Wgit
|
|
443
485
|
# functionality. The original text is returned; no other reference to it
|
444
486
|
# is kept thereafter.
|
445
487
|
#
|
446
|
-
# @param query [
|
447
|
-
# @text for.
|
488
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
489
|
+
# document's @text for.
|
448
490
|
# @param case_sensitive [Boolean] Whether character case must match.
|
449
491
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
450
492
|
# for separately.
|
@@ -463,13 +505,31 @@ module Wgit
|
|
463
505
|
orig_text
|
464
506
|
end
|
465
507
|
|
508
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
509
|
+
# parameter.
|
510
|
+
#
|
511
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
513
|
+
# Object) : results (Array).
|
514
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
|
+
# (String) : result (Nokogiri Object).
|
516
|
+
# @return [String, Object] The value found in the html or the default value
|
517
|
+
# (singleton ? nil : []).
|
518
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
519
|
+
send(
|
520
|
+
:extract_from_html, xpath,
|
521
|
+
singleton: singleton, text_content_only: text_content_only
|
522
|
+
)
|
523
|
+
end
|
524
|
+
|
466
525
|
protected
|
467
526
|
|
468
527
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
469
528
|
# Override this method to custom configure the Nokogiri object returned.
|
470
529
|
# Gets called from Wgit::Document.new upon initialization.
|
471
530
|
#
|
472
|
-
# @yield [config] The given block is passed to Nokogiri::HTML for
|
531
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
532
|
+
# initialisation.
|
473
533
|
# @raise [StandardError] If @html isn't set.
|
474
534
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
475
535
|
def init_nokogiri(&block)
|
@@ -481,7 +541,7 @@ module Wgit
|
|
481
541
|
# Extracts a value/object from this Document's @html using the given xpath
|
482
542
|
# parameter.
|
483
543
|
#
|
484
|
-
# @param xpath [String] Used to find the value/object in @html.
|
544
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
485
545
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
486
546
|
# Object) : results (Array).
|
487
547
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
@@ -497,23 +557,16 @@ module Wgit
|
|
497
557
|
# the block's `value` param unchanged if you simply want to inspect it.
|
498
558
|
# @return [String, Object] The value found in the html or the default value
|
499
559
|
# (singleton ? nil : []).
|
500
|
-
def
|
501
|
-
|
502
|
-
|
503
|
-
results = @doc.xpath(xpath)
|
504
|
-
|
505
|
-
return default if results.nil? || results.empty?
|
560
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
506
563
|
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
text_content_only ? results.map(&:content) : results
|
511
|
-
end
|
512
|
-
|
513
|
-
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
564
|
+
if text_content_only
|
565
|
+
result = singleton ? result&.content : result.map(&:content)
|
566
|
+
end
|
514
567
|
|
568
|
+
Wgit::Utils.sanitize(result)
|
515
569
|
result = yield(result, self, :document) if block_given?
|
516
|
-
|
517
570
|
result
|
518
571
|
end
|
519
572
|
|
@@ -533,16 +586,14 @@ module Wgit
|
|
533
586
|
# the block's `value` param unchanged if you simply want to inspect it.
|
534
587
|
# @return [String, Object] The value found in the obj or the default value
|
535
588
|
# (singleton ? nil : []).
|
536
|
-
def
|
589
|
+
def extract_from_object(obj, key, singleton: true)
|
537
590
|
assert_respond_to(obj, :fetch)
|
538
591
|
|
539
592
|
default = singleton ? nil : []
|
540
593
|
result = obj.fetch(key.to_s, default)
|
541
594
|
|
542
|
-
|
543
|
-
|
595
|
+
Wgit::Utils.sanitize(result)
|
544
596
|
result = yield(result, obj, :object) if block_given?
|
545
|
-
|
546
597
|
result
|
547
598
|
end
|
548
599
|
|
@@ -556,12 +607,12 @@ module Wgit
|
|
556
607
|
url = Wgit::Url.parse(url)
|
557
608
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
558
609
|
|
559
|
-
@url
|
560
|
-
@html
|
561
|
-
@
|
562
|
-
@score
|
610
|
+
@url = url
|
611
|
+
@html = html || ''
|
612
|
+
@parser = init_nokogiri
|
613
|
+
@score = 0.0
|
563
614
|
|
564
|
-
Wgit::Utils.
|
615
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
565
616
|
|
566
617
|
# Dynamically run the init_*_from_html methods.
|
567
618
|
Document.private_instance_methods(false).each do |method|
|
@@ -577,12 +628,12 @@ module Wgit
|
|
577
628
|
def init_from_object(obj, encode: true)
|
578
629
|
assert_respond_to(obj, :fetch)
|
579
630
|
|
580
|
-
@url
|
581
|
-
@html
|
582
|
-
@
|
583
|
-
@score
|
631
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
632
|
+
@html = obj.fetch('html', '')
|
633
|
+
@parser = init_nokogiri
|
634
|
+
@score = obj.fetch('score', 0.0)
|
584
635
|
|
585
|
-
Wgit::Utils.
|
636
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
586
637
|
|
587
638
|
# Dynamically run the init_*_from_object methods.
|
588
639
|
Document.private_instance_methods(false).each do |method|
|
@@ -593,11 +644,11 @@ module Wgit
|
|
593
644
|
end
|
594
645
|
end
|
595
646
|
|
596
|
-
# Initialises an instance variable and defines
|
647
|
+
# Initialises an instance variable and defines an accessor method for it.
|
597
648
|
#
|
598
649
|
# @param var [Symbol] The name of the variable to be initialized.
|
599
650
|
# @param value [Object] The newly initialized variable's value.
|
600
|
-
# @return [Symbol] The name of the
|
651
|
+
# @return [Symbol] The name of the defined getter method.
|
601
652
|
def init_var(var, value)
|
602
653
|
# instance_var_name starts with @, var_name doesn't.
|
603
654
|
var = var.to_s
|
@@ -605,10 +656,9 @@ module Wgit
|
|
605
656
|
instance_var_name = "@#{var_name}".to_sym
|
606
657
|
|
607
658
|
instance_variable_set(instance_var_name, value)
|
659
|
+
Wgit::Document.attr_accessor(var_name)
|
608
660
|
|
609
|
-
|
610
|
-
instance_variable_get(instance_var_name)
|
611
|
-
end
|
661
|
+
var_name
|
612
662
|
end
|
613
663
|
|
614
664
|
alias content html
|
@@ -1,19 +1,19 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
### Default Document
|
3
|
+
### Default Document Extractors ###
|
4
4
|
|
5
5
|
# Base.
|
6
|
-
Wgit::Document.
|
6
|
+
Wgit::Document.define_extractor(
|
7
7
|
:base,
|
8
8
|
'//base/@href',
|
9
9
|
singleton: true,
|
10
10
|
text_content_only: true
|
11
11
|
) do |base|
|
12
|
-
Wgit::Url.
|
12
|
+
Wgit::Url.parse?(base) if base
|
13
13
|
end
|
14
14
|
|
15
15
|
# Title.
|
16
|
-
Wgit::Document.
|
16
|
+
Wgit::Document.define_extractor(
|
17
17
|
:title,
|
18
18
|
'//title',
|
19
19
|
singleton: true,
|
@@ -21,7 +21,7 @@ Wgit::Document.define_extension(
|
|
21
21
|
)
|
22
22
|
|
23
23
|
# Description.
|
24
|
-
Wgit::Document.
|
24
|
+
Wgit::Document.define_extractor(
|
25
25
|
:description,
|
26
26
|
'//meta[@name="description"]/@content',
|
27
27
|
singleton: true,
|
@@ -29,7 +29,7 @@ Wgit::Document.define_extension(
|
|
29
29
|
)
|
30
30
|
|
31
31
|
# Author.
|
32
|
-
Wgit::Document.
|
32
|
+
Wgit::Document.define_extractor(
|
33
33
|
:author,
|
34
34
|
'//meta[@name="author"]/@content',
|
35
35
|
singleton: true,
|
@@ -37,7 +37,7 @@ Wgit::Document.define_extension(
|
|
37
37
|
)
|
38
38
|
|
39
39
|
# Keywords.
|
40
|
-
Wgit::Document.
|
40
|
+
Wgit::Document.define_extractor(
|
41
41
|
:keywords,
|
42
42
|
'//meta[@name="keywords"]/@content',
|
43
43
|
singleton: true,
|
@@ -45,25 +45,25 @@ Wgit::Document.define_extension(
|
|
45
45
|
) do |keywords, _source, type|
|
46
46
|
if keywords && (type == :document)
|
47
47
|
keywords = keywords.split(',')
|
48
|
-
Wgit::Utils.
|
48
|
+
Wgit::Utils.sanitize(keywords)
|
49
49
|
end
|
50
50
|
keywords
|
51
51
|
end
|
52
52
|
|
53
53
|
# Links.
|
54
|
-
Wgit::Document.
|
54
|
+
Wgit::Document.define_extractor(
|
55
55
|
:links,
|
56
56
|
'//a/@href',
|
57
57
|
singleton: false,
|
58
58
|
text_content_only: true
|
59
59
|
) do |links|
|
60
60
|
links
|
61
|
-
.map { |link| Wgit::Url.
|
61
|
+
.map { |link| Wgit::Url.parse?(link) }
|
62
62
|
.compact # Remove unparsable links.
|
63
63
|
end
|
64
64
|
|
65
65
|
# Text.
|
66
|
-
Wgit::Document.
|
66
|
+
Wgit::Document.define_extractor(
|
67
67
|
:text,
|
68
68
|
proc { Wgit::Document.text_elements_xpath },
|
69
69
|
singleton: false,
|