wgit 0.8.0 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/document.rb CHANGED
@@ -6,19 +6,19 @@ require 'json'
6
6
  require 'set'
7
7
 
8
8
  module Wgit
9
- # Class primarily modeling a HTML web document, although other MIME types
9
+ # Class modeling/serialising a HTML web document, although other MIME types
10
10
  # will work e.g. images etc. Also doubles as a search result when
11
11
  # loading Documents from the database via `Wgit::Database#search`.
12
12
  #
13
13
  # The initialize method dynamically initializes instance variables from the
14
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
15
- # Document class can be easily extended allowing you to pull out the bits of
16
- # a webpage that are important to you. See `Wgit::Document.define_extension`.
15
+ # Document class can be easily extended allowing you to extract the bits of
16
+ # a webpage that are important to you. See `Wgit::Document.define_extractor`.
17
17
  class Document
18
18
  include Assertable
19
19
 
20
- # Regex for the allowed var names when defining an extension.
21
- REGEX_EXTENSION_NAME = /[a-z0-9_]+/.freeze
20
+ # Regex for the allowed var names when defining an extractor.
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
22
22
 
23
23
  # Set of text elements used to build Document#text.
24
24
  @text_elements = Set.new(%i[
@@ -29,8 +29,8 @@ module Wgit
29
29
  summary sup td textarea th time u ul var wbr
30
30
  ])
31
31
 
32
- # Set of Symbols representing the defined Document extensions.
33
- @extensions = Set.new
32
+ # Set of Symbols representing the defined Document extractors.
33
+ @extractors = Set.new
34
34
 
35
35
  class << self
36
36
  # Set of HTML elements that make up the visible text on a page. These
@@ -38,9 +38,9 @@ module Wgit
38
38
  # README.md for how to add to this Set dynamically.
39
39
  attr_reader :text_elements
40
40
 
41
- # Set of Symbols representing the defined Document extensions. Is
42
- # read-only. Use Wgit::Document.define_extension for a new extension.
43
- attr_reader :extensions
41
+ # Set of Symbols representing the defined Document extractors. Is
42
+ # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
+ attr_reader :extractors
44
44
  end
45
45
 
46
46
  # The URL of the webpage, an instance of Wgit::Url.
@@ -50,7 +50,7 @@ module Wgit
50
50
  attr_reader :html
51
51
 
52
52
  # The Nokogiri::HTML document object initialized from @html.
53
- attr_reader :doc
53
+ attr_reader :parser
54
54
 
55
55
  # The score is only used following a `Database#search` and records matches.
56
56
  attr_reader :score
@@ -62,7 +62,7 @@ module Wgit
62
62
  #
63
63
  # During initialisation, the Document will call any private
64
64
  # `init_*_from_html` and `init_*_from_object` methods it can find. See the
65
- # README.md and Wgit::Document.define_extension method for more details.
65
+ # Wgit::Document.define_extractor method for more details.
66
66
  #
67
67
  # @param url_or_obj [String, Wgit::Url, #fetch] Either a String
68
68
  # representing a URL or a Hash-like object responding to :fetch. e.g. a
@@ -101,13 +101,16 @@ module Wgit
101
101
  xpath
102
102
  end
103
103
 
104
- # Defines an extension, which is a way to serialise HTML elements into
105
- # instance variables upon Document initialization. See the default
106
- # extensions defined in 'document_extensions.rb' as examples.
104
+ # Defines a content extractor, which extracts HTML elements/content
105
+ # into instance variables upon Document initialization. See the default
106
+ # extractors defined in 'document_extractors.rb' as examples. Defining an
107
+ # extractor means that every subsequently crawled/initialized document
108
+ # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
+ # content extraction.
107
110
  #
108
- # Note that defined extensions work for both Documents initialized from
111
+ # Note that defined extractors work for both Documents initialized from
109
112
  # HTML (via Wgit::Crawler methods) and from database objects.
110
- # An extension once defined, initializes a private instance variable with
113
+ # An extractor once defined, initializes a private instance variable with
111
114
  # the xpath or database object result(s).
112
115
  #
113
116
  # When initialising from HTML, a singleton value of true will only
@@ -118,15 +121,17 @@ module Wgit
118
121
  # object), then a default will be used. The default value is:
119
122
  # `singleton ? nil : []`.
120
123
  #
121
- # @param var [Symbol] The name of the variable to be initialised.
124
+ # @param var [Symbol] The name of the variable to be initialised, that will
125
+ # contain the extracted content. A getter and setter method is defined
126
+ # for the initialised variable.
122
127
  # @param xpath [String, #call] The xpath used to find the element(s)
123
128
  # of the webpage. Only used when initializing from HTML.
124
129
  #
125
130
  # Pass a callable object (proc etc.) if you want the
126
131
  # xpath value to be derived on Document initialisation (instead of when
127
- # the extension is defined). The call method must return a valid xpath
132
+ # the extractor is defined). The call method must return a valid xpath
128
133
  # String.
129
- # @param opts [Hash] The options to define an extension with. The
134
+ # @param opts [Hash] The options to define an extractor with. The
130
135
  # options are only used when initializing from HTML, not the database.
131
136
  # @option opts [Boolean] :singleton The singleton option determines
132
137
  # whether or not the result(s) should be in an Array. If multiple
@@ -147,46 +152,50 @@ module Wgit
147
152
  # value. Return the block's value param unchanged if you want to inspect.
148
153
  # @raise [StandardError] If the var param isn't valid.
149
154
  # @return [Symbol] The given var Symbol if successful.
150
- def self.define_extension(var, xpath, opts = {}, &block)
155
+ def self.define_extractor(var, xpath, opts = {}, &block)
151
156
  var = var.to_sym
152
157
  defaults = { singleton: true, text_content_only: true }
153
158
  opts = defaults.merge(opts)
154
159
 
155
- raise "var must match #{REGEX_EXTENSION_NAME}" unless \
156
- var =~ REGEX_EXTENSION_NAME
160
+ raise "var must match #{REGEX_EXTRACTOR_NAME}" unless \
161
+ var =~ REGEX_EXTRACTOR_NAME
157
162
 
158
163
  # Define the private init_*_from_html method for HTML.
159
164
  # Gets the HTML's xpath value and creates a var for it.
160
165
  func_name = Document.send(:define_method, "init_#{var}_from_html") do
161
- result = find_in_html(xpath, opts, &block)
166
+ result = extract_from_html(xpath, **opts, &block)
162
167
  init_var(var, result)
163
168
  end
164
169
  Document.send(:private, func_name)
165
170
 
166
171
  # Define the private init_*_from_object method for a Database object.
167
172
  # Gets the Object's 'key' value and creates a var for it.
168
- func_name = Document.send(:define_method, "init_#{var}_from_object") do |obj|
169
- result = find_in_object(obj, var.to_s, singleton: opts[:singleton], &block)
173
+ func_name = Document.send(
174
+ :define_method, "init_#{var}_from_object"
175
+ ) do |obj|
176
+ result = extract_from_object(
177
+ obj, var.to_s, singleton: opts[:singleton], &block
178
+ )
170
179
  init_var(var, result)
171
180
  end
172
181
  Document.send(:private, func_name)
173
182
 
174
- @extensions << var
183
+ @extractors << var
175
184
  var
176
185
  end
177
186
 
178
- # Removes the `init_*` methods created when an extension is defined.
179
- # Therefore, this is the opposing method to `Document.define_extension`.
187
+ # Removes the `init_*` methods created when an extractor is defined.
188
+ # Therefore, this is the opposing method to `Document.define_extractor`.
180
189
  # Returns true if successful or false if the method(s) cannot be found.
181
190
  #
182
- # @param var [Symbol] The extension variable already defined.
183
- # @return [Boolean] True if the extension `var` was found and removed;
191
+ # @param var [Symbol] The extractor variable to remove.
192
+ # @return [Boolean] True if the extractor `var` was found and removed;
184
193
  # otherwise false.
185
- def self.remove_extension(var)
194
+ def self.remove_extractor(var)
186
195
  Document.send(:remove_method, "init_#{var}_from_html")
187
196
  Document.send(:remove_method, "init_#{var}_from_object")
188
197
 
189
- @extensions.delete(var.to_sym)
198
+ @extractors.delete(var.to_sym)
190
199
  true
191
200
  rescue NameError
192
201
  false
@@ -215,9 +224,9 @@ module Wgit
215
224
 
216
225
  # Returns the base URL of this Wgit::Document. The base URL is either the
217
226
  # <base> element's href value or @url (if @base is nil). If @base is
218
- # present and relative, then @url.to_base + @base is returned. This method
219
- # should be used instead of `doc.url.to_base` etc. when manually building
220
- # absolute links from relative links; or use `link.prefix_base(doc)`.
227
+ # present and relative, then @url.to_origin + @base is returned. This method
228
+ # should be used instead of `doc.url.to_origin` etc. when manually building
229
+ # absolute links from relative links; or use `link.make_absolute(doc)`.
221
230
  #
222
231
  # Provide the `link:` parameter to get the correct base URL for that type
223
232
  # of link. For example, a link of `#top` would always return @url because
@@ -236,12 +245,16 @@ module Wgit
236
245
  # @return [Wgit::Url] The base URL of this Document e.g.
237
246
  # 'http://example.com/public'.
238
247
  def base_url(link: nil)
239
- raise "Document @url ('#{@url}') cannot be relative if <base> is nil" \
240
248
  if @url.relative? && @base.nil?
241
- raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't be relative" \
249
+ raise "Document @url ('#{@url}') cannot be relative if <base> is nil"
250
+ end
251
+
242
252
  if @url.relative? && @base&.relative?
253
+ raise "Document @url ('#{@url}') and <base> ('#{@base}') both can't \
254
+ be relative"
255
+ end
243
256
 
244
- get_base = -> { @base.relative? ? @url.to_base.concat(@base) : @base }
257
+ get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
245
258
 
246
259
  if link
247
260
  link = Wgit::Url.new(link)
@@ -253,7 +266,7 @@ module Wgit
253
266
  end
254
267
  end
255
268
 
256
- base_url = @base ? get_base.call : @url.to_base
269
+ base_url = @base ? get_base.call : @url.to_origin
257
270
  base_url.omit_fragment.omit_query
258
271
  end
259
272
 
@@ -267,7 +280,7 @@ module Wgit
267
280
  def to_h(include_html: false, include_score: true)
268
281
  ignore = include_html ? [] : ['@html']
269
282
  ignore << '@score' unless include_score
270
- ignore << '@doc' # Always ignore Nokogiri @doc.
283
+ ignore << '@parser' # Always ignore the Nokogiri object.
271
284
 
272
285
  Wgit::Utils.to_h(self, ignore: ignore)
273
286
  end
@@ -284,7 +297,7 @@ module Wgit
284
297
 
285
298
  # Returns a Hash containing this Document's instance variables and
286
299
  # their #length (if they respond to it). Works dynamically so that any
287
- # user defined extensions (and their created instance vars) will appear in
300
+ # user defined extractors (and their created instance vars) will appear in
288
301
  # the returned Hash as well. The number of text snippets as well as total
289
302
  # number of textual bytes are always included in the returned Hash.
290
303
  #
@@ -324,21 +337,39 @@ module Wgit
324
337
  end
325
338
 
326
339
  # Uses Nokogiri's xpath method to search the doc's html and return the
327
- # results.
340
+ # results. Use `#at_xpath` for returning the first result only.
328
341
  #
329
342
  # @param xpath [String] The xpath to search the @html with.
330
343
  # @return [Nokogiri::XML::NodeSet] The result set of the xpath search.
331
344
  def xpath(xpath)
332
- @doc.xpath(xpath)
345
+ @parser.xpath(xpath)
333
346
  end
334
347
 
335
- # Uses Nokogiri's css method to search the doc's html and return the
336
- # results.
348
+ # Uses Nokogiri's `at_xpath` method to search the doc's html and return the
349
+ # result. Use `#xpath` for returning several results.
350
+ #
351
+ # @param xpath [String] The xpath to search the @html with.
352
+ # @return [Nokogiri::XML::Element] The result of the xpath search.
353
+ def at_xpath(xpath)
354
+ @parser.at_xpath(xpath)
355
+ end
356
+
357
+ # Uses Nokogiri's `css` method to search the doc's html and return the
358
+ # results. Use `#at_css` for returning the first result only.
337
359
  #
338
360
  # @param selector [String] The CSS selector to search the @html with.
339
361
  # @return [Nokogiri::XML::NodeSet] The result set of the CSS search.
340
362
  def css(selector)
341
- @doc.css(selector)
363
+ @parser.css(selector)
364
+ end
365
+
366
+ # Uses Nokogiri's `at_css` method to search the doc's html and return the
367
+ # result. Use `#css` for returning several results.
368
+ #
369
+ # @param selector [String] The CSS selector to search the @html with.
370
+ # @return [Nokogiri::XML::Element] The result of the CSS search.
371
+ def at_css(selector)
372
+ @parser.at_css(selector)
342
373
  end
343
374
 
344
375
  # Returns all unique internal links from this Document in relative form.
@@ -356,13 +387,13 @@ module Wgit
356
387
  return [] if @links.empty?
357
388
 
358
389
  links = @links
359
- .select { |link| link.relative?(host: @url.to_base) }
390
+ .select { |link| link.relative?(host: @url.to_origin) }
360
391
  .map(&:omit_base)
361
392
  .map do |link| # Map @url.to_host into / as it's a duplicate.
362
393
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
363
394
  end
364
395
 
365
- Wgit::Utils.process_arr(links)
396
+ Wgit::Utils.sanitize(links)
366
397
  end
367
398
 
368
399
  # Returns all unique internal links from this Document in absolute form by
@@ -371,7 +402,7 @@ module Wgit
371
402
  #
372
403
  # @return [Array<Wgit::Url>] Self's unique internal Url's in absolute form.
373
404
  def internal_absolute_links
374
- internal_links.map { |link| link.prefix_base(self) }
405
+ internal_links.map { |link| link.make_absolute(self) }
375
406
  end
376
407
 
377
408
  # Returns all unique external links from this Document in absolute form.
@@ -382,10 +413,17 @@ module Wgit
382
413
  return [] if @links.empty?
383
414
 
384
415
  links = @links
385
- .reject { |link| link.relative?(host: @url.to_base) }
416
+ .map do |link|
417
+ if link.scheme_relative?
418
+ link.prefix_scheme(@url.to_scheme.to_sym)
419
+ else
420
+ link
421
+ end
422
+ end
423
+ .reject { |link| link.relative?(host: @url.to_origin) }
386
424
  .map(&:omit_trailing_slash)
387
425
 
388
- Wgit::Utils.process_arr(links)
426
+ Wgit::Utils.sanitize(links)
389
427
  end
390
428
 
391
429
  # Searches the @text for the given query and returns the results.
@@ -400,8 +438,8 @@ module Wgit
400
438
  # original sentence, whichever is less. The algorithm obviously ensures
401
439
  # that the search query is visible somewhere in the sentence.
402
440
  #
403
- # @param query [String, #to_s] The value to search the document's
404
- # @text for.
441
+ # @param query [Regexp, #to_s] The regex or text value to search the
442
+ # document's @text for.
405
443
  # @param case_sensitive [Boolean] Whether character case must match.
406
444
  # @param whole_sentence [Boolean] Whether multiple words should be searched
407
445
  # for separately.
@@ -411,12 +449,16 @@ module Wgit
411
449
  def search(
412
450
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
413
451
  )
414
- query = query.to_s
415
- raise 'A search query must be provided' if query.empty?
416
452
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
417
453
 
418
- query = query.gsub(' ', '|') unless whole_sentence
419
- regex = Regexp.new(query, !case_sensitive)
454
+ if query.is_a?(Regexp)
455
+ regex = query
456
+ else # respond_to? #to_s == true
457
+ query = query.to_s
458
+ query = query.gsub(' ', '|') unless whole_sentence
459
+ regex = Regexp.new(query, !case_sensitive)
460
+ end
461
+
420
462
  results = {}
421
463
 
422
464
  @text.each do |sentence|
@@ -443,8 +485,8 @@ module Wgit
443
485
  # functionality. The original text is returned; no other reference to it
444
486
  # is kept thereafter.
445
487
  #
446
- # @param query [String, #to_s] The value to search the document's
447
- # @text for.
488
+ # @param query [Regexp, #to_s] The regex or text value to search the
489
+ # document's @text for.
448
490
  # @param case_sensitive [Boolean] Whether character case must match.
449
491
  # @param whole_sentence [Boolean] Whether multiple words should be searched
450
492
  # for separately.
@@ -463,13 +505,31 @@ module Wgit
463
505
  orig_text
464
506
  end
465
507
 
508
+ # Extracts a value/object from this Document's @html using the given xpath
509
+ # parameter.
510
+ #
511
+ # @param xpath [String, #call] Used to find the value/object in @html.
512
+ # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
+ # Object) : results (Array).
514
+ # @param text_content_only [Boolean] text_content_only ? result.content
515
+ # (String) : result (Nokogiri Object).
516
+ # @return [String, Object] The value found in the html or the default value
517
+ # (singleton ? nil : []).
518
+ def extract(xpath, singleton: true, text_content_only: true)
519
+ send(
520
+ :extract_from_html, xpath,
521
+ singleton: singleton, text_content_only: text_content_only
522
+ )
523
+ end
524
+
466
525
  protected
467
526
 
468
527
  # Initializes the nokogiri object using @html, which cannot be nil.
469
528
  # Override this method to custom configure the Nokogiri object returned.
470
529
  # Gets called from Wgit::Document.new upon initialization.
471
530
  #
472
- # @yield [config] The given block is passed to Nokogiri::HTML for initialisation.
531
+ # @yield [config] The given block is passed to Nokogiri::HTML for
532
+ # initialisation.
473
533
  # @raise [StandardError] If @html isn't set.
474
534
  # @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
475
535
  def init_nokogiri(&block)
@@ -481,7 +541,7 @@ module Wgit
481
541
  # Extracts a value/object from this Document's @html using the given xpath
482
542
  # parameter.
483
543
  #
484
- # @param xpath [String] Used to find the value/object in @html.
544
+ # @param xpath [String, #call] Used to find the value/object in @html.
485
545
  # @param singleton [Boolean] singleton ? results.first (single Nokogiri
486
546
  # Object) : results (Array).
487
547
  # @param text_content_only [Boolean] text_content_only ? result.content
@@ -497,23 +557,16 @@ module Wgit
497
557
  # the block's `value` param unchanged if you simply want to inspect it.
498
558
  # @return [String, Object] The value found in the html or the default value
499
559
  # (singleton ? nil : []).
500
- def find_in_html(xpath, singleton: true, text_content_only: true)
501
- default = singleton ? nil : []
502
- xpath = xpath.call if xpath.respond_to?(:call)
503
- results = @doc.xpath(xpath)
504
-
505
- return default if results.nil? || results.empty?
560
+ def extract_from_html(xpath, singleton: true, text_content_only: true)
561
+ xpath = xpath.call if xpath.respond_to?(:call)
562
+ result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
506
563
 
507
- result = if singleton
508
- text_content_only ? results.first.content : results.first
509
- else
510
- text_content_only ? results.map(&:content) : results
511
- end
512
-
513
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
564
+ if text_content_only
565
+ result = singleton ? result&.content : result.map(&:content)
566
+ end
514
567
 
568
+ Wgit::Utils.sanitize(result)
515
569
  result = yield(result, self, :document) if block_given?
516
-
517
570
  result
518
571
  end
519
572
 
@@ -533,16 +586,14 @@ module Wgit
533
586
  # the block's `value` param unchanged if you simply want to inspect it.
534
587
  # @return [String, Object] The value found in the obj or the default value
535
588
  # (singleton ? nil : []).
536
- def find_in_object(obj, key, singleton: true)
589
+ def extract_from_object(obj, key, singleton: true)
537
590
  assert_respond_to(obj, :fetch)
538
591
 
539
592
  default = singleton ? nil : []
540
593
  result = obj.fetch(key.to_s, default)
541
594
 
542
- singleton ? Wgit::Utils.process_str(result) : Wgit::Utils.process_arr(result)
543
-
595
+ Wgit::Utils.sanitize(result)
544
596
  result = yield(result, obj, :object) if block_given?
545
-
546
597
  result
547
598
  end
548
599
 
@@ -556,12 +607,12 @@ module Wgit
556
607
  url = Wgit::Url.parse(url)
557
608
  url.crawled = true unless url.crawled? # Avoid overriding date_crawled.
558
609
 
559
- @url = url
560
- @html = html || ''
561
- @doc = init_nokogiri
562
- @score = 0.0
610
+ @url = url
611
+ @html = html || ''
612
+ @parser = init_nokogiri
613
+ @score = 0.0
563
614
 
564
- Wgit::Utils.process_str(@html, encode: encode)
615
+ Wgit::Utils.sanitize(@html, encode: encode)
565
616
 
566
617
  # Dynamically run the init_*_from_html methods.
567
618
  Document.private_instance_methods(false).each do |method|
@@ -577,12 +628,12 @@ module Wgit
577
628
  def init_from_object(obj, encode: true)
578
629
  assert_respond_to(obj, :fetch)
579
630
 
580
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
581
- @html = obj.fetch('html', '')
582
- @doc = init_nokogiri
583
- @score = obj.fetch('score', 0.0)
631
+ @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
632
+ @html = obj.fetch('html', '')
633
+ @parser = init_nokogiri
634
+ @score = obj.fetch('score', 0.0)
584
635
 
585
- Wgit::Utils.process_str(@html, encode: encode)
636
+ Wgit::Utils.sanitize(@html, encode: encode)
586
637
 
587
638
  # Dynamically run the init_*_from_object methods.
588
639
  Document.private_instance_methods(false).each do |method|
@@ -593,11 +644,11 @@ module Wgit
593
644
  end
594
645
  end
595
646
 
596
- # Initialises an instance variable and defines a getter method for it.
647
+ # Initialises an instance variable and defines an accessor method for it.
597
648
  #
598
649
  # @param var [Symbol] The name of the variable to be initialized.
599
650
  # @param value [Object] The newly initialized variable's value.
600
- # @return [Symbol] The name of the newly created getter method.
651
+ # @return [Symbol] The name of the defined getter method.
601
652
  def init_var(var, value)
602
653
  # instance_var_name starts with @, var_name doesn't.
603
654
  var = var.to_s
@@ -605,10 +656,9 @@ module Wgit
605
656
  instance_var_name = "@#{var_name}".to_sym
606
657
 
607
658
  instance_variable_set(instance_var_name, value)
659
+ Wgit::Document.attr_accessor(var_name)
608
660
 
609
- Document.send(:define_method, var_name) do
610
- instance_variable_get(instance_var_name)
611
- end
661
+ var_name
612
662
  end
613
663
 
614
664
  alias content html
@@ -1,19 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- ### Default Document Extensions ###
3
+ ### Default Document Extractors ###
4
4
 
5
5
  # Base.
6
- Wgit::Document.define_extension(
6
+ Wgit::Document.define_extractor(
7
7
  :base,
8
8
  '//base/@href',
9
9
  singleton: true,
10
10
  text_content_only: true
11
11
  ) do |base|
12
- Wgit::Url.parse_or_nil(base) if base
12
+ Wgit::Url.parse?(base) if base
13
13
  end
14
14
 
15
15
  # Title.
16
- Wgit::Document.define_extension(
16
+ Wgit::Document.define_extractor(
17
17
  :title,
18
18
  '//title',
19
19
  singleton: true,
@@ -21,7 +21,7 @@ Wgit::Document.define_extension(
21
21
  )
22
22
 
23
23
  # Description.
24
- Wgit::Document.define_extension(
24
+ Wgit::Document.define_extractor(
25
25
  :description,
26
26
  '//meta[@name="description"]/@content',
27
27
  singleton: true,
@@ -29,7 +29,7 @@ Wgit::Document.define_extension(
29
29
  )
30
30
 
31
31
  # Author.
32
- Wgit::Document.define_extension(
32
+ Wgit::Document.define_extractor(
33
33
  :author,
34
34
  '//meta[@name="author"]/@content',
35
35
  singleton: true,
@@ -37,7 +37,7 @@ Wgit::Document.define_extension(
37
37
  )
38
38
 
39
39
  # Keywords.
40
- Wgit::Document.define_extension(
40
+ Wgit::Document.define_extractor(
41
41
  :keywords,
42
42
  '//meta[@name="keywords"]/@content',
43
43
  singleton: true,
@@ -45,25 +45,25 @@ Wgit::Document.define_extension(
45
45
  ) do |keywords, _source, type|
46
46
  if keywords && (type == :document)
47
47
  keywords = keywords.split(',')
48
- Wgit::Utils.process_arr(keywords)
48
+ Wgit::Utils.sanitize(keywords)
49
49
  end
50
50
  keywords
51
51
  end
52
52
 
53
53
  # Links.
54
- Wgit::Document.define_extension(
54
+ Wgit::Document.define_extractor(
55
55
  :links,
56
56
  '//a/@href',
57
57
  singleton: false,
58
58
  text_content_only: true
59
59
  ) do |links|
60
60
  links
61
- .map { |link| Wgit::Url.parse_or_nil(link) }
61
+ .map { |link| Wgit::Url.parse?(link) }
62
62
  .compact # Remove unparsable links.
63
63
  end
64
64
 
65
65
  # Text.
66
- Wgit::Document.define_extension(
66
+ Wgit::Document.define_extractor(
67
67
  :text,
68
68
  proc { Wgit::Document.text_elements_xpath },
69
69
  singleton: false,