wgit 0.8.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +68 -2
- data/LICENSE.txt +1 -1
- data/README.md +114 -326
- data/bin/wgit +9 -5
- data/lib/wgit/assertable.rb +3 -3
- data/lib/wgit/base.rb +39 -0
- data/lib/wgit/crawler.rb +206 -76
- data/lib/wgit/database/database.rb +309 -134
- data/lib/wgit/database/model.rb +10 -3
- data/lib/wgit/document.rb +145 -95
- data/lib/wgit/{document_extensions.rb → document_extractors.rb} +11 -11
- data/lib/wgit/dsl.rb +324 -0
- data/lib/wgit/indexer.rb +66 -163
- data/lib/wgit/response.rb +5 -2
- data/lib/wgit/url.rb +177 -63
- data/lib/wgit/utils.rb +32 -20
- data/lib/wgit/version.rb +2 -1
- data/lib/wgit.rb +3 -1
- metadata +34 -19
data/lib/wgit/document.rb
CHANGED
@@ -6,19 +6,19 @@ require 'json'
|
|
6
6
|
require 'set'
|
7
7
|
|
8
8
|
module Wgit
|
9
|
-
# Class
|
9
|
+
# Class modeling/serialising a HTML web document, although other MIME types
|
10
10
|
# will work e.g. images etc. Also doubles as a search result when
|
11
11
|
# loading Documents from the database via `Wgit::Database#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
15
|
-
# Document class can be easily extended allowing you to
|
16
|
-
# a webpage that are important to you. See `Wgit::Document.
|
15
|
+
# Document class can be easily extended allowing you to extract the bits of
|
16
|
+
# a webpage that are important to you. See `Wgit::Document.define_extractor`.
|
17
17
|
class Document
|
18
18
|
include Assertable
|
19
19
|
|
20
|
-
# Regex for the allowed var names when defining an
|
21
|
-
|
20
|
+
# Regex for the allowed var names when defining an extractor.
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
|
22
22
|
|
23
23
|
# Set of text elements used to build Document#text.
|
24
24
|
@text_elements = Set.new(%i[
|
@@ -29,8 +29,8 @@ module Wgit
|
|
29
29
|
summary sup td textarea th time u ul var wbr
|
30
30
|
])
|
31
31
|
|
32
|
-
# Set of Symbols representing the defined Document
|
33
|
-
@
|
32
|
+
# Set of Symbols representing the defined Document extractors.
|
33
|
+
@extractors = Set.new
|
34
34
|
|
35
35
|
class << self
|
36
36
|
# Set of HTML elements that make up the visible text on a page. These
|
@@ -38,9 +38,9 @@ module Wgit
|
|
38
38
|
# README.md for how to add to this Set dynamically.
|
39
39
|
attr_reader :text_elements
|
40
40
|
|
41
|
-
# Set of Symbols representing the defined Document
|
42
|
-
# read-only. Use Wgit::Document.
|
43
|
-
attr_reader :
|
41
|
+
# Set of Symbols representing the defined Document extractors. Is
|
42
|
+
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
|
+
attr_reader :extractors
|
44
44
|
end
|
45
45
|
|
46
46
|
# The URL of the webpage, an instance of Wgit::Url.
|
@@ -50,7 +50,7 @@ module Wgit
|
|
50
50
|
attr_reader :html
|
51
51
|
|
52
52
|
# The Nokogiri::HTML document object initialized from @html.
|
53
|
-
attr_reader :
|
53
|
+
attr_reader :parser
|
54
54
|
|
55
55
|
# The score is only used following a `Database#search` and records matches.
|
56
56
|
attr_reader :score
|
@@ -62,7 +62,7 @@ module Wgit
|
|
62
62
|
#
|
63
63
|
# During initialisation, the Document will call any private
|
64
64
|
# `init_*_from_html` and `init_*_from_object` methods it can find. See the
|
65
|
-
#
|
65
|
+
# Wgit::Document.define_extractor method for more details.
|
66
66
|
#
|
67
67
|
# @param url_or_obj [String, Wgit::Url, #fetch] Either a String
|
68
68
|
# representing a URL or a Hash-like object responding to :fetch. e.g. a
|
@@ -101,13 +101,16 @@ module Wgit
|
|
101
101
|
xpath
|
102
102
|
end
|
103
103
|
|
104
|
-
# Defines
|
105
|
-
# instance variables upon Document initialization. See the default
|
106
|
-
#
|
104
|
+
# Defines a content extractor, which extracts HTML elements/content
|
105
|
+
# into instance variables upon Document initialization. See the default
|
106
|
+
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
107
|
+
# extractor means that every subsequently crawled/initialized document
|
108
|
+
# will attempt to extract the xpath's content. Use `#xpath` for a one off
|
109
|
+
# content extraction.
|
107
110
|
#
|
108
|
-
# Note that defined
|
111
|
+
# Note that defined extractors work for both Documents initialized from
|
109
112
|
# HTML (via Wgit::Crawler methods) and from database objects.
|
110
|
-
# An
|
113
|
+
# An extractor once defined, initializes a private instance variable with
|
111
114
|
# the xpath or database object result(s).
|
112
115
|
#
|
113
116
|
# When initialising from HTML, a singleton value of true will only
|
@@ -118,15 +121,17 @@ module Wgit
|
|
118
121
|
# object), then a default will be used. The default value is:
|
119
122
|
# `singleton ? nil : []`.
|
120
123
|
#
|
121
|
-
# @param var [Symbol] The name of the variable to be initialised
|
124
|
+
# @param var [Symbol] The name of the variable to be initialised, that will
|
125
|
+
# contain the extracted content. A getter and setter method is defined
|
126
|
+
# for the initialised variable.
|
122
127
|
# @param xpath [String, #call] The xpath used to find the element(s)
|
123
128
|
# of the webpage. Only used when initializing from HTML.
|
124
129
|
#
|
125
130
|
# Pass a callable object (proc etc.) if you want the
|
126
131
|
# xpath value to be derived on Document initialisation (instead of when
|
127
|
-
# the
|
132
|
+
# the extractor is defined). The call method must return a valid xpath
|
128
133
|
# String.
|
129
|
-
# @param opts [Hash] The options to define an
|
134
|
+
# @param opts [Hash] The options to define an extractor with. The
|
130
135
|
# options are only used when initializing from HTML, not the database.
|
131
136
|
# @option opts [Boolean] :singleton The singleton option determines
|
132
137
|
# whether or not the result(s) should be in an Array. If multiple
|
@@ -147,46 +152,50 @@ module Wgit
|
|
147
152
|
# value. Return the block's value param unchanged if you want to inspect.
|
148
153
|
# @raise [StandardError] If the var param isn't valid.
|
149
154
|
# @return [Symbol] The given var Symbol if successful.
|
150
|
-
def self.
|
155
|
+
def self.define_extractor(var, xpath, opts = {}, &block)
|
151
156
|
var = var.to_sym
|
152
157
|
defaults = { singleton: true, text_content_only: true }
|
153
158
|
opts = defaults.merge(opts)
|
154
159
|
|
155
|
-
raise "var must match #{
|
156
|
-
var =~
|
160
|
+
raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
|
161
|
+
var =~ REGEX_EXTRACTOR_NAME
|
157
162
|
|
158
163
|
# Define the private init_*_from_html method for HTML.
|
159
164
|
# Gets the HTML's xpath value and creates a var for it.
|
160
165
|
func_name = Document.send(:define_method, "init_#{var}_from_html") do
|
161
|
-
result =
|
166
|
+
result = extract_from_html(xpath, **opts, &block)
|
162
167
|
init_var(var, result)
|
163
168
|
end
|
164
169
|
Document.send(:private, func_name)
|
165
170
|
|
166
171
|
# Define the private init_*_from_object method for a Database object.
|
167
172
|
# Gets the Object's 'key' value and creates a var for it.
|
168
|
-
func_name = Document.send(
|
169
|
-
|
173
|
+
func_name = Document.send(
|
174
|
+
:define_method, "init_#{var}_from_object"
|
175
|
+
) do |obj|
|
176
|
+
result = extract_from_object(
|
177
|
+
obj, var.to_s, singleton: opts[:singleton], &block
|
178
|
+
)
|
170
179
|
init_var(var, result)
|
171
180
|
end
|
172
181
|
Document.send(:private, func_name)
|
173
182
|
|
174
|
-
@
|
183
|
+
@extractors << var
|
175
184
|
var
|
176
185
|
end
|
177
186
|
|
178
|
-
# Removes the `init_*` methods created when an
|
179
|
-
# Therefore, this is the opposing method to `Document.
|
187
|
+
# Removes the `init_*` methods created when an extractor is defined.
|
188
|
+
# Therefore, this is the opposing method to `Document.define_extractor`.
|
180
189
|
# Returns true if successful or false if the method(s) cannot be found.
|
181
190
|
#
|
182
|
-
# @param var [Symbol] The
|
183
|
-
# @return [Boolean] True if the
|
191
|
+
# @param var [Symbol] The extractor variable to remove.
|
192
|
+
# @return [Boolean] True if the extractor `var` was found and removed;
|
184
193
|
# otherwise false.
|
185
|
-
def self.
|
194
|
+
def self.remove_extractor(var)
|
186
195
|
Document.send(:remove_method, "init_#{var}_from_html")
|
187
196
|
Document.send(:remove_method, "init_#{var}_from_object")
|
188
197
|
|
189
|
-
@
|
198
|
+
@extractors.delete(var.to_sym)
|
190
199
|
true
|
191
200
|
rescue NameError
|
192
201
|
false
|
@@ -215,9 +224,9 @@ module Wgit
|
|
215
224
|
|
216
225
|
# Returns the base URL of this Wgit::Document. The base URL is either the
|
217
226
|
# <base> element's href value or @url (if @base is nil). If @base is
|
218
|
-
# present and relative, then @url.
|
219
|
-
# should be used instead of `doc.url.
|
220
|
-
# absolute links from relative links; or use `link.
|
227
|
+
# present and relative, then @url.to_origin + @base is returned. This method
|
228
|
+
# should be used instead of `doc.url.to_origin` etc. when manually building
|
229
|
+
# absolute links from relative links; or use `link.make_absolute(doc)`.
|
221
230
|
#
|
222
231
|
# Provide the `link:` parameter to get the correct base URL for that type
|
223
232
|
# of link. For example, a link of `#top` would always return @url because
|
@@ -236,12 +245,16 @@ module Wgit
|
|
236
245
|
# @return [Wgit::Url] The base URL of this Document e.g.
|
237
246
|
# 'http://example.com/public'.
|
238
247
|
def base_url(link: nil)
|
239
|
-
raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
|
240
248
|
if @url.relative? && @base.nil?
|
241
|
-
|
249
|
+
raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
|
250
|
+
end
|
251
|
+
|
242
252
|
if @url.relative? && @base&.relative?
|
253
|
+
raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
|
254
|
+
be relative"
|
255
|
+
end
|
243
256
|
|
244
|
-
get_base = -> { @base.relative? ? @url.
|
257
|
+
get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
|
245
258
|
|
246
259
|
if link
|
247
260
|
link = Wgit::Url.new(link)
|
@@ -253,7 +266,7 @@ module Wgit
|
|
253
266
|
end
|
254
267
|
end
|
255
268
|
|
256
|
-
base_url = @base ? get_base.call : @url.
|
269
|
+
base_url = @base ? get_base.call : @url.to_origin
|
257
270
|
base_url.omit_fragment.omit_query
|
258
271
|
end
|
259
272
|
|
@@ -267,7 +280,7 @@ module Wgit
|
|
267
280
|
def to_h(include_html: false, include_score: true)
|
268
281
|
ignore = include_html ? [] : ['@html']
|
269
282
|
ignore << '@score' unless include_score
|
270
|
-
ignore << '@
|
283
|
+
ignore << '@parser' # Always ignore the Nokogiri object.
|
271
284
|
|
272
285
|
Wgit::Utils.to_h(self, ignore: ignore)
|
273
286
|
end
|
@@ -284,7 +297,7 @@ module Wgit
|
|
284
297
|
|
285
298
|
# Returns a Hash containing this Document's instance variables and
|
286
299
|
# their #length (if they respond to it). Works dynamically so that any
|
287
|
-
# user defined
|
300
|
+
# user defined extractors (and their created instance vars) will appear in
|
288
301
|
# the returned Hash as well. The number of text snippets as well as total
|
289
302
|
# number of textual bytes are always included in the returned Hash.
|
290
303
|
#
|
@@ -324,21 +337,39 @@ module Wgit
|
|
324
337
|
end
|
325
338
|
|
326
339
|
# Uses Nokogiri's xpath method to search the doc's html and return the
|
327
|
-
# results.
|
340
|
+
# results. Use `#at_xpath` for returning the first result only.
|
328
341
|
#
|
329
342
|
# @param xpath [String] The xpath to search the @html with.
|
330
343
|
# @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
|
331
344
|
def xpath(xpath)
|
332
|
-
@
|
345
|
+
@parser.xpath(xpath)
|
333
346
|
end
|
334
347
|
|
335
|
-
# Uses Nokogiri's
|
336
|
-
# results.
|
348
|
+
# Uses Nokogiri's `at_xpath` method to search the doc's html and return the
|
349
|
+
# result. Use `#xpath` for returning several results.
|
350
|
+
#
|
351
|
+
# @param xpath [String] The xpath to search the @html with.
|
352
|
+
# @return [Nokogiri::XML::Element] The result of the xpath search.
|
353
|
+
def at_xpath(xpath)
|
354
|
+
@parser.at_xpath(xpath)
|
355
|
+
end
|
356
|
+
|
357
|
+
# Uses Nokogiri's `css` method to search the doc's html and return the
|
358
|
+
# results. Use `#at_css` for returning the first result only.
|
337
359
|
#
|
338
360
|
# @param selector [String] The CSS selector to search the @html with.
|
339
361
|
# @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
|
340
362
|
def css(selector)
|
341
|
-
@
|
363
|
+
@parser.css(selector)
|
364
|
+
end
|
365
|
+
|
366
|
+
# Uses Nokogiri's `at_css` method to search the doc's html and return the
|
367
|
+
# result. Use `#css` for returning several results.
|
368
|
+
#
|
369
|
+
# @param selector [String] The CSS selector to search the @html with.
|
370
|
+
# @return [Nokogiri::XML::Element] The result of the CSS search.
|
371
|
+
def at_css(selector)
|
372
|
+
@parser.at_css(selector)
|
342
373
|
end
|
343
374
|
|
344
375
|
# Returns all unique internal links from this Document in relative form.
|
@@ -356,13 +387,13 @@ module Wgit
|
|
356
387
|
return [] if @links.empty?
|
357
388
|
|
358
389
|
links = @links
|
359
|
-
.select { |link| link.relative?(host: @url.
|
390
|
+
.select { |link| link.relative?(host: @url.to_origin) }
|
360
391
|
.map(&:omit_base)
|
361
392
|
.map do |link| # Map @url.to_host into / as it's a duplicate.
|
362
393
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
363
394
|
end
|
364
395
|
|
365
|
-
Wgit::Utils.
|
396
|
+
Wgit::Utils.sanitize(links)
|
366
397
|
end
|
367
398
|
|
368
399
|
# Returns all unique internal links from this Document in absolute form by
|
@@ -371,7 +402,7 @@ module Wgit
|
|
371
402
|
#
|
372
403
|
# @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
|
373
404
|
def internal_absolute_links
|
374
|
-
internal_links.map { |link| link.
|
405
|
+
internal_links.map { |link| link.make_absolute(self) }
|
375
406
|
end
|
376
407
|
|
377
408
|
# Returns all unique external links from this Document in absolute form.
|
@@ -382,10 +413,17 @@ module Wgit
|
|
382
413
|
return [] if @links.empty?
|
383
414
|
|
384
415
|
links = @links
|
385
|
-
.
|
416
|
+
.map do |link|
|
417
|
+
if link.scheme_relative?
|
418
|
+
link.prefix_scheme(@url.to_scheme.to_sym)
|
419
|
+
else
|
420
|
+
link
|
421
|
+
end
|
422
|
+
end
|
423
|
+
.reject { |link| link.relative?(host: @url.to_origin) }
|
386
424
|
.map(&:omit_trailing_slash)
|
387
425
|
|
388
|
-
Wgit::Utils.
|
426
|
+
Wgit::Utils.sanitize(links)
|
389
427
|
end
|
390
428
|
|
391
429
|
# Searches the @text for the given query and returns the results.
|
@@ -400,8 +438,8 @@ module Wgit
|
|
400
438
|
# original sentence, whichever is less. The algorithm obviously ensures
|
401
439
|
# that the search query is visible somewhere in the sentence.
|
402
440
|
#
|
403
|
-
# @param query [
|
404
|
-
# @text for.
|
441
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
442
|
+
# document's @text for.
|
405
443
|
# @param case_sensitive [Boolean] Whether character case must match.
|
406
444
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
407
445
|
# for separately.
|
@@ -411,12 +449,16 @@ module Wgit
|
|
411
449
|
def search(
|
412
450
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
413
451
|
)
|
414
|
-
query = query.to_s
|
415
|
-
raise 'A search query must be provided' if query.empty?
|
416
452
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
417
453
|
|
418
|
-
|
419
|
-
|
454
|
+
if query.is_a?(Regexp)
|
455
|
+
regex = query
|
456
|
+
else # respond_to? #to_s == true
|
457
|
+
query = query.to_s
|
458
|
+
query = query.gsub(' ', '|') unless whole_sentence
|
459
|
+
regex = Regexp.new(query, !case_sensitive)
|
460
|
+
end
|
461
|
+
|
420
462
|
results = {}
|
421
463
|
|
422
464
|
@text.each do |sentence|
|
@@ -443,8 +485,8 @@ module Wgit
|
|
443
485
|
# functionality. The original text is returned; no other reference to it
|
444
486
|
# is kept thereafter.
|
445
487
|
#
|
446
|
-
# @param query [
|
447
|
-
# @text for.
|
488
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
489
|
+
# document's @text for.
|
448
490
|
# @param case_sensitive [Boolean] Whether character case must match.
|
449
491
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
450
492
|
# for separately.
|
@@ -463,13 +505,31 @@ module Wgit
|
|
463
505
|
orig_text
|
464
506
|
end
|
465
507
|
|
508
|
+
# Extracts a value/object from this Document's @html using the given xpath
|
509
|
+
# parameter.
|
510
|
+
#
|
511
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
512
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
513
|
+
# Object) : results (Array).
|
514
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
515
|
+
# (String) : result (Nokogiri Object).
|
516
|
+
# @return [String, Object] The value found in the html or the default value
|
517
|
+
# (singleton ? nil : []).
|
518
|
+
def extract(xpath, singleton: true, text_content_only: true)
|
519
|
+
send(
|
520
|
+
:extract_from_html, xpath,
|
521
|
+
singleton: singleton, text_content_only: text_content_only
|
522
|
+
)
|
523
|
+
end
|
524
|
+
|
466
525
|
protected
|
467
526
|
|
468
527
|
# Initializes the nokogiri object using @html, which cannot be nil.
|
469
528
|
# Override this method to custom configure the Nokogiri object returned.
|
470
529
|
# Gets called from Wgit::Document.new upon initialization.
|
471
530
|
#
|
472
|
-
# @yield [config] The given block is passed to Nokogiri::HTML for
|
531
|
+
# @yield [config] The given block is passed to Nokogiri::HTML for
|
532
|
+
# initialisation.
|
473
533
|
# @raise [StandardError] If @html isn't set.
|
474
534
|
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
475
535
|
def init_nokogiri(&block)
|
@@ -481,7 +541,7 @@ module Wgit
|
|
481
541
|
# Extracts a value/object from this Document's @html using the given xpath
|
482
542
|
# parameter.
|
483
543
|
#
|
484
|
-
# @param xpath [String] Used to find the value/object in @html.
|
544
|
+
# @param xpath [String, #call] Used to find the value/object in @html.
|
485
545
|
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
486
546
|
# Object) : results (Array).
|
487
547
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
@@ -497,23 +557,16 @@ module Wgit
|
|
497
557
|
# the block's `value` param unchanged if you simply want to inspect it.
|
498
558
|
# @return [String, Object] The value found in the html or the default value
|
499
559
|
# (singleton ? nil : []).
|
500
|
-
def
|
501
|
-
|
502
|
-
|
503
|
-
results = @doc.xpath(xpath)
|
504
|
-
|
505
|
-
return default if results.nil? || results.empty?
|
560
|
+
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
561
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
562
|
+
result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
|
506
563
|
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
text_content_only ? results.map(&:content) : results
|
511
|
-
end
|
512
|
-
|
513
|
-
singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
|
564
|
+
if text_content_only
|
565
|
+
result = singleton ? result&.content : result.map(&:content)
|
566
|
+
end
|
514
567
|
|
568
|
+
Wgit::Utils.sanitize(result)
|
515
569
|
result = yield(result, self, :document) if block_given?
|
516
|
-
|
517
570
|
result
|
518
571
|
end
|
519
572
|
|
@@ -533,16 +586,14 @@ module Wgit
|
|
533
586
|
# the block's `value` param unchanged if you simply want to inspect it.
|
534
587
|
# @return [String, Object] The value found in the obj or the default value
|
535
588
|
# (singleton ? nil : []).
|
536
|
-
def
|
589
|
+
def extract_from_object(obj, key, singleton: true)
|
537
590
|
assert_respond_to(obj, :fetch)
|
538
591
|
|
539
592
|
default = singleton ? nil : []
|
540
593
|
result = obj.fetch(key.to_s, default)
|
541
594
|
|
542
|
-
|
543
|
-
|
595
|
+
Wgit::Utils.sanitize(result)
|
544
596
|
result = yield(result, obj, :object) if block_given?
|
545
|
-
|
546
597
|
result
|
547
598
|
end
|
548
599
|
|
@@ -556,12 +607,12 @@ module Wgit
|
|
556
607
|
url = Wgit::Url.parse(url)
|
557
608
|
url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
|
558
609
|
|
559
|
-
@url
|
560
|
-
@html
|
561
|
-
@
|
562
|
-
@score
|
610
|
+
@url = url
|
611
|
+
@html = html || ''
|
612
|
+
@parser = init_nokogiri
|
613
|
+
@score = 0.0
|
563
614
|
|
564
|
-
Wgit::Utils.
|
615
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
565
616
|
|
566
617
|
# Dynamically run the init_*_from_html methods.
|
567
618
|
Document.private_instance_methods(false).each do |method|
|
@@ -577,12 +628,12 @@ module Wgit
|
|
577
628
|
def init_from_object(obj, encode: true)
|
578
629
|
assert_respond_to(obj, :fetch)
|
579
630
|
|
580
|
-
@url
|
581
|
-
@html
|
582
|
-
@
|
583
|
-
@score
|
631
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
632
|
+
@html = obj.fetch('html', '')
|
633
|
+
@parser = init_nokogiri
|
634
|
+
@score = obj.fetch('score', 0.0)
|
584
635
|
|
585
|
-
Wgit::Utils.
|
636
|
+
Wgit::Utils.sanitize(@html, encode: encode)
|
586
637
|
|
587
638
|
# Dynamically run the init_*_from_object methods.
|
588
639
|
Document.private_instance_methods(false).each do |method|
|
@@ -593,11 +644,11 @@ module Wgit
|
|
593
644
|
end
|
594
645
|
end
|
595
646
|
|
596
|
-
# Initialises an instance variable and defines
|
647
|
+
# Initialises an instance variable and defines an accessor method for it.
|
597
648
|
#
|
598
649
|
# @param var [Symbol] The name of the variable to be initialized.
|
599
650
|
# @param value [Object] The newly initialized variable's value.
|
600
|
-
# @return [Symbol] The name of the
|
651
|
+
# @return [Symbol] The name of the defined getter method.
|
601
652
|
def init_var(var, value)
|
602
653
|
# instance_var_name starts with @, var_name doesn't.
|
603
654
|
var = var.to_s
|
@@ -605,10 +656,9 @@ module Wgit
|
|
605
656
|
instance_var_name = "@#{var_name}".to_sym
|
606
657
|
|
607
658
|
instance_variable_set(instance_var_name, value)
|
659
|
+
Wgit::Document.attr_accessor(var_name)
|
608
660
|
|
609
|
-
|
610
|
-
instance_variable_get(instance_var_name)
|
611
|
-
end
|
661
|
+
var_name
|
612
662
|
end
|
613
663
|
|
614
664
|
alias content html
|
@@ -1,19 +1,19 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
### Default Document
|
3
|
+
### Default Document Extractors ###
|
4
4
|
|
5
5
|
# Base.
|
6
|
-
Wgit::Document.
|
6
|
+
Wgit::Document.define_extractor(
|
7
7
|
:base,
|
8
8
|
'//base/@href',
|
9
9
|
singleton: true,
|
10
10
|
text_content_only: true
|
11
11
|
) do |base|
|
12
|
-
Wgit::Url.
|
12
|
+
Wgit::Url.parse?(base) if base
|
13
13
|
end
|
14
14
|
|
15
15
|
# Title.
|
16
|
-
Wgit::Document.
|
16
|
+
Wgit::Document.define_extractor(
|
17
17
|
:title,
|
18
18
|
'//title',
|
19
19
|
singleton: true,
|
@@ -21,7 +21,7 @@ Wgit::Document.define_extension(
|
|
21
21
|
)
|
22
22
|
|
23
23
|
# Description.
|
24
|
-
Wgit::Document.
|
24
|
+
Wgit::Document.define_extractor(
|
25
25
|
:description,
|
26
26
|
'//meta[@name="description"]/@content',
|
27
27
|
singleton: true,
|
@@ -29,7 +29,7 @@ Wgit::Document.define_extension(
|
|
29
29
|
)
|
30
30
|
|
31
31
|
# Author.
|
32
|
-
Wgit::Document.
|
32
|
+
Wgit::Document.define_extractor(
|
33
33
|
:author,
|
34
34
|
'//meta[@name="author"]/@content',
|
35
35
|
singleton: true,
|
@@ -37,7 +37,7 @@ Wgit::Document.define_extension(
|
|
37
37
|
)
|
38
38
|
|
39
39
|
# Keywords.
|
40
|
-
Wgit::Document.
|
40
|
+
Wgit::Document.define_extractor(
|
41
41
|
:keywords,
|
42
42
|
'//meta[@name="keywords"]/@content',
|
43
43
|
singleton: true,
|
@@ -45,25 +45,25 @@ Wgit::Document.define_extension(
|
|
45
45
|
) do |keywords, _source, type|
|
46
46
|
if keywords && (type == :document)
|
47
47
|
keywords = keywords.split(',')
|
48
|
-
Wgit::Utils.
|
48
|
+
Wgit::Utils.sanitize(keywords)
|
49
49
|
end
|
50
50
|
keywords
|
51
51
|
end
|
52
52
|
|
53
53
|
# Links.
|
54
|
-
Wgit::Document.
|
54
|
+
Wgit::Document.define_extractor(
|
55
55
|
:links,
|
56
56
|
'//a/@href',
|
57
57
|
singleton: false,
|
58
58
|
text_content_only: true
|
59
59
|
) do |links|
|
60
60
|
links
|
61
|
-
.map { |link| Wgit::Url.
|
61
|
+
.map { |link| Wgit::Url.parse?(link) }
|
62
62
|
.compact # Remove unparsable links.
|
63
63
|
end
|
64
64
|
|
65
65
|
# Text.
|
66
|
-
Wgit::Document.
|
66
|
+
Wgit::Document.define_extractor(
|
67
67
|
:text,
|
68
68
|
proc { Wgit::Document.text_elements_xpath },
|
69
69
|
singleton: false,
|