wgit 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,147 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../assertable"
4
+ require_relative "../url"
5
+ require_relative "../document"
6
+ require_relative "../model"
7
+
8
+ module Wgit::Database
9
+ # The parent DatabaseAdapter class that should be inherited from when
10
+ # creating an underlying Database adapter implementation class e.g.
11
+ # Wgit::Database::MongoDB.
12
+ #
13
+ # Listed in this class are the methods that an implementer class must
14
+ # implement to work with Wgit. Failure to do so will result in a
15
+ # NotImplementedError being raised.
16
+ #
17
+ # While not required, implementing the method `#search_fields=(fields)` in an
18
+ # adapter class will allow `Wgit::Model.set_search_fields` to call
19
+ # it. This allows the search fields to be set in one method call, from within
20
+ # the Wgit::Model class. See this method's docs for more info.
21
+ #
22
+ # Also listed in this class are common helper methods available to all
23
+ # Database implementer subclasses.
24
+ class DatabaseAdapter
25
+ include Wgit::Assertable
26
+
27
+ # The NotImplementedError message that gets raised if an implementor class
28
+ # doesn't implement a method required by Wgit.
29
+ NOT_IMPL_ERR = "The DatabaseAdapter class you're using hasn't \
30
+ implemented this method"
31
+
32
+ ###################### START OF INTERFACE METHODS ######################
33
+
34
+ # Initializes a DatabaseAdapter instance.
35
+ #
36
+ # The implementor class should establish a DB connection here using the
37
+ # given connection_string, falling back to `ENV['WGIT_CONNECTION_STRING']`.
38
+ # Don't forget to call `super`.
39
+ #
40
+ # @param connection_string [String] The connection string needed to connect
41
+ # to the database.
42
+ # @raise [StandardError] If a connection string isn't provided, either as a
43
+ # parameter or via the environment.
44
+ def initialize(connection_string = nil); end
45
+
46
+ # Returns the current size of the database.
47
+ #
48
+ # @return [Integer] The current size of the DB.
49
+ def size
50
+ raise NotImplementedError, NOT_IMPL_ERR
51
+ end
52
+
53
+ # Searches the database's Documents for the given query. The
54
+ # `Wgit::Model.search_fields` should be searched for matches
55
+ # against the given query. Documents should be sorted starting with the
56
+ # most relevant. Each returned Document should have its `score` field set
57
+ # for relevance.
58
+ #
59
+ # @param query [String] The text query to search with.
60
+ # @param case_sensitive [Boolean] Whether character case must match.
61
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
62
+ # for separately.
63
+ # @param limit [Integer] The max number of results to return.
64
+ # @param skip [Integer] The number of results to skip.
65
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
66
+ # DB.
67
+ # @return [Array<Wgit::Document>] The search results obtained from the DB.
68
+ def search(
69
+ query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
70
+ )
71
+ raise NotImplementedError, NOT_IMPL_ERR
72
+ end
73
+
74
+ # Deletes everything in the urls and documents collections.
75
+ #
76
+ # @return [Integer] The number of deleted records.
77
+ def empty
78
+ raise NotImplementedError, NOT_IMPL_ERR
79
+ end
80
+
81
+ # Returns Url records that haven't yet been crawled.
82
+ #
83
+ # @param limit [Integer] The max number of Url's to return. 0 returns all.
84
+ # @param skip [Integer] Skip n amount of Url's.
85
+ # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
86
+ # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
87
+ def uncrawled_urls(limit: 0, skip: 0)
88
+ raise NotImplementedError, NOT_IMPL_ERR
89
+ end
90
+
91
+ # Inserts or updates the object in the database.
92
+ #
93
+ # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
94
+ # @return [Boolean] True if inserted, false if updated.
95
+ def upsert(obj)
96
+ raise NotImplementedError, NOT_IMPL_ERR
97
+ end
98
+
99
+ # Bulk upserts the objects in the database collection.
100
+ # You cannot mix collection objs types, all must be Urls or Documents.
101
+ #
102
+ # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
103
+ # inserted/updated.
104
+ # @return [Integer] The total number of newly inserted objects.
105
+ def bulk_upsert(objs)
106
+ raise NotImplementedError, NOT_IMPL_ERR
107
+ end
108
+
109
+ ###################### END OF INTERFACE METHODS ######################
110
+
111
+ private
112
+
113
+ # Returns the correct Wgit::Model for the given obj type.
114
+ #
115
+ # @param obj [Wgit::Url, Wgit::Document] The obj to obtain a model for.
116
+ # @return [Hash] The obj model.
117
+ def build_model(obj)
118
+ assert_type(obj, [Wgit::Url, Wgit::Document])
119
+
120
+ if obj.is_a?(Wgit::Url)
121
+ Wgit::Model.url(obj)
122
+ else
123
+ Wgit::Model.document(obj)
124
+ end
125
+ end
126
+
127
+ # Map each DB hash object into a Wgit::Document. Each Document is yielded
128
+ # if a block is given before returning the mapped Array of Documents.
129
+ def map_documents(doc_hashes)
130
+ doc_hashes.map do |doc|
131
+ doc = Wgit::Document.new(doc)
132
+ yield(doc) if block_given?
133
+ doc
134
+ end
135
+ end
136
+
137
+ # Map each DB hash object into a Wgit::Url. Each Url is yielded
138
+ # if a block is given before returning the mapped Array of Urls.
139
+ def map_urls(url_hashes)
140
+ url_hashes.map do |url|
141
+ url = Wgit::Url.new(url)
142
+ yield(url) if block_given?
143
+ url
144
+ end
145
+ end
146
+ end
147
+ end
data/lib/wgit/document.rb CHANGED
@@ -7,7 +7,8 @@ require 'json'
7
7
  module Wgit
8
8
  # Class modeling/serialising a HTML web document, although other MIME types
9
9
  # will work e.g. images etc. Also doubles as a search result when
10
- # loading Documents from the database via `Wgit::Database#search`.
10
+ # loading Documents from the database via
11
+ # `Wgit::Database::DatabaseAdapter#search`.
11
12
  #
12
13
  # The initialize method dynamically initializes instance variables from the
13
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
@@ -19,33 +20,18 @@ module Wgit
19
20
  # Regex for the allowed var names when defining an extractor.
20
21
  REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
21
22
 
22
- # Set of text elements used to build the xpath for Document#text.
23
- @text_elements = Set.new(%i[
24
- a abbr address article aside b bdi bdo blockquote button caption cite
25
- code data dd del details dfn div dl dt em figcaption figure footer h1 h2
26
- h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
27
- option output p pre q rb rt ruby s samp section small span strong sub
28
- summary sup td textarea th time u ul var wbr
29
- ])
30
-
31
- # Instance vars to be ignored by Document#to_h and in turn Model.document.
23
+ # Instance vars to be ignored by Document#to_h and in turn
24
+ # Wgit::Model.document.
32
25
  @to_h_ignore_vars = [
33
- '@parser', # Always ignore the Nokogiri object.
34
- '@meta_robots', # Used by #no_index?, ignore.
35
- '@meta_wgit' # Used by #no_index?, ignore.
26
+ '@parser' # Always ignore the Nokogiri object.
36
27
  ]
37
28
 
38
29
  # Set of Symbols representing the defined Document extractors.
39
30
  @extractors = Set.new
40
31
 
41
32
  class << self
42
- # Set of HTML elements that make up the visible text on a page. These
43
- # elements are used to initialize the Wgit::Document#text. See the
44
- # README.md for how to add to this Set dynamically.
45
- attr_reader :text_elements
46
-
47
- # Array of instance vars to ignore when Document#to_h and in turn
48
- # Model.document methods are called. Append your own defined extractor
33
+ # Array of instance vars to ignore when Document#to_h and (in turn)
34
+ # Wgit::Model.document methods are called. Append your own defined extractor
49
35
  # vars to omit them from the model (database object) when indexing.
50
36
  # Each var should be a String starting with an '@' char e.g. "@data" etc.
51
37
  attr_reader :to_h_ignore_vars
@@ -64,7 +50,7 @@ module Wgit
64
50
  # The Nokogiri::HTML document object initialized from @html.
65
51
  attr_reader :parser
66
52
 
67
- # The score is only used following a `Database#search` and records matches.
53
+ # The score is set/used following a `Database#search` and records matches.
68
54
  attr_reader :score
69
55
 
70
56
  # Initialize takes either two strings (representing the URL and HTML) or an
@@ -96,17 +82,6 @@ module Wgit
96
82
 
97
83
  ### Document Class Methods ###
98
84
 
99
- # Uses Document.text_elements to build an xpath String, used to obtain
100
- # all of the combined visual text on a webpage.
101
- #
102
- # @return [String] An xpath String to obtain a webpage's text elements.
103
- def self.text_elements_xpath
104
- @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
105
- xpath += ' | ' unless i.zero?
106
- xpath + format('//%s/text()', el)
107
- end
108
- end
109
-
110
85
  # Defines a content extractor, which extracts HTML elements/content
111
86
  # into instance variables upon Document initialization. See the default
112
87
  # extractors defined in 'document_extractors.rb' as examples. Defining an
@@ -130,8 +105,9 @@ module Wgit
130
105
  # @param var [Symbol] The name of the variable to be initialised, that will
131
106
  # contain the extracted content. A getter and setter method is defined
132
107
  # for the initialised variable.
133
- # @param xpath [String, #call] The xpath used to find the element(s)
134
- # of the webpage. Only used when initializing from HTML.
108
+ # @param xpath [String, #call, nil] The xpath used to find the element(s)
109
+ # of the webpage. Only used when initializing from HTML. Passing nil will
110
+ # skip the HTML extraction, which sometimes isn't required.
135
111
  #
136
112
  # Pass a callable object (proc etc.) if you want the
137
113
  # xpath value to be derived on Document initialisation (instead of when
@@ -447,12 +423,14 @@ be relative"
447
423
  Wgit::Utils.sanitize(links)
448
424
  end
449
425
 
450
- # Searches the @text for the given query and returns the results.
426
+ # Searches the Document's instance vars for the given query and returns
427
+ # the results. The `Wgit::Model.search_fields` denote the vars to be
428
+ # searched, unless overridden using the search_fields: param.
451
429
  #
452
- # The number of search hits for each sentenence are recorded internally
430
+ # The number of matches for each search field is recorded internally
453
431
  # and used to rank/sort the search results before being returned. Where
454
- # the Wgit::Database#search method search all documents for the most hits,
455
- # this method searches each document's @text for the most hits.
432
+ # the Wgit::Database::DatabaseAdapter#search method searches all documents
433
+ # for matches, this method searches each individual Document for matches.
456
434
  #
457
435
  # Each search result comprises of a sentence of a given length. The length
458
436
  # will be based on the sentence_limit parameter or the full length of the
@@ -460,51 +438,86 @@ be relative"
460
438
  # that the search query is visible somewhere in the sentence.
461
439
  #
462
440
  # @param query [Regexp, #to_s] The regex or text value to search the
463
- # document's @text for.
441
+ # document's instance vars (Wgit::Model.search_fields) for.
464
442
  # @param case_sensitive [Boolean] Whether character case must match.
465
443
  # @param whole_sentence [Boolean] Whether multiple words should be searched
466
444
  # for separately.
467
445
  # @param sentence_limit [Integer] The max length of each search result
468
446
  # sentence.
469
- # @return [Array<String>] A subset of @text, matching the query.
447
+ # @param search_fields [Hash<Symbol, Integer>] The Document instance vars
448
+ # to search and the weight for a match (used to determine relevance).
449
+ # This should only be set for custom one-off Document searches. For
450
+ # permanent changing of search fields, see Wgit::Model.set_search_fields.
451
+ # @yield [results_hash] Given the results_hash containing each search
452
+ # result (String) and its score (num_matches * weight).
453
+ # @return [Array<String>] A subset of this document's instance vars,
454
+ # matching the query for the search_fields: param.
470
455
  def search(
471
- query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
456
+ query, case_sensitive: false, whole_sentence: true,
457
+ sentence_limit: 80, search_fields: Wgit::Model.search_fields
472
458
  )
473
459
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
460
+ assert_type(search_fields, Hash)
474
461
 
475
- if query.is_a?(Regexp)
476
- regex = query
477
- else # query.respond_to? :to_s == true
478
- query = query.to_s
479
- query = query.gsub(' ', '|') unless whole_sentence
480
- regex = Regexp.new(query, !case_sensitive)
481
- end
482
-
462
+ regex = Wgit::Utils.build_search_regex(
463
+ query, case_sensitive:, whole_sentence:)
483
464
  results = {}
484
465
 
485
- @text.each do |sentence|
486
- sentence = sentence.strip
487
- next if results[sentence]
466
+ search_fields.each do |field, weight|
467
+ doc_field = instance_variable_get("@#{field}".to_sym)
468
+ next unless doc_field
488
469
 
489
- hits = sentence.scan(regex).count
490
- next unless hits.positive?
470
+ Wgit::Utils.each(doc_field) do |text|
471
+ assert_type(text, String)
491
472
 
492
- index = sentence.index(regex) # Index of first match.
493
- Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
473
+ text = text.strip
474
+ matches = text.scan(regex).count
475
+ next unless matches.positive?
494
476
 
495
- results[sentence] = hits
477
+ index = text.index(regex) # Index of first match.
478
+ Wgit::Utils.format_sentence_length(text, index, sentence_limit)
479
+
480
+ # For duplicate matching text, total the text score.
481
+ text_score = matches * weight
482
+ existing_score = results[text]
483
+ text_score += existing_score if existing_score
484
+
485
+ results[text] = text_score
486
+ end
496
487
  end
497
488
 
498
489
  return [] if results.empty?
499
490
 
500
- results = Hash[results.sort_by { |_k, v| v }]
501
- results.keys.reverse
491
+ yield results if block_given?
492
+
493
+ # Return only the matching text sentences, sorted by relevance.
494
+ Hash[results.sort_by { |_, score| -score }].keys
495
+ end
496
+
497
+ # Performs a text only search of the Document, instead of searching all
498
+ # search fields defined in Wgit::Model.search_fields.
499
+ #
500
+ # @param query [Regexp, #to_s] The regex or text value to search the
501
+ # document's text for.
502
+ # @param case_sensitive [Boolean] Whether character case must match.
503
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
504
+ # for separately.
505
+ # @param sentence_limit [Integer] The max length of each search result
506
+ # sentence.
507
+ # @return [Array<String>] A subset of this document's text fields that
508
+ # match the query.
509
+ def search_text(
510
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
511
+ )
512
+ search(
513
+ query, case_sensitive:, whole_sentence:,
514
+ sentence_limit:, search_fields: { text: 1 })
502
515
  end
503
516
 
504
- # Performs a text search (see Document#search for details) but assigns the
505
- # results to the @text instance variable. This can be used for sub search
506
- # functionality. The original text is returned; no other reference to it
507
- # is kept thereafter.
517
+ # Performs a text only search (see Document#search_text for details) but
518
+ # assigns the results to the @text instance variable. This can be used
519
+ # for sub search functionality. The original text is returned; no other
520
+ # reference to it is kept thereafter.
508
521
  #
509
522
  # @param query [Regexp, #to_s] The regex or text value to search the
510
523
  # document's @text for.
@@ -514,11 +527,11 @@ be relative"
514
527
  # @param sentence_limit [Integer] The max length of each search result
515
528
  # sentence.
516
529
  # @return [String] This Document's original @text value.
517
- def search!(
530
+ def search_text!(
518
531
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
519
532
  )
520
533
  orig_text = @text
521
- @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
534
+ @text = search_text(query, case_sensitive:, whole_sentence:, sentence_limit:)
522
535
 
523
536
  orig_text
524
537
  end
@@ -544,14 +557,71 @@ be relative"
544
557
  send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
545
558
  end
546
559
 
547
- # Works with the default extractors to extract and check the HTML meta tags
548
- # instructing Wgit not to index this document (save it to a Database). If
549
- # the default extractors are removed, this method will always return false.
560
+ # Attempts to extract and check the HTML meta tags instructing Wgit not to
561
+ # index this document (save it to a Database).
550
562
  #
551
563
  # @return [Boolean] True if this document shouldn't be saved to a Database,
552
564
  # false otherwise.
553
565
  def no_index?
554
- [@meta_robots, @meta_wgit].include?('noindex')
566
+ meta_robots = extract_from_html(
567
+ '//meta[@name="robots"]/@content',
568
+ singleton: true,
569
+ text_content_only: true
570
+ )
571
+ meta_wgit = extract_from_html(
572
+ '//meta[@name="wgit"]/@content',
573
+ singleton: true,
574
+ text_content_only: true
575
+ )
576
+
577
+ [meta_robots, meta_wgit].include?('noindex')
578
+ end
579
+
580
+ # Firstly finds the target element whose text contains el_text.
581
+ # Then finds the preceding fragment element nearest to the target
582
+ # element and returns its href value (starting with #). The search is
583
+ # performed against the @html so Documents loaded from a DB will need to
584
+ # contain the 'html' field in the Wgit::Model. See the
585
+ # `Wgit::Model#include_doc_html` documentation for more info.
586
+ #
587
+ # @param el_text [String] The element text of the target element.
588
+ # @param el_type [String] The element type, defaulting to any type.
589
+ # @yield [results] Given the results of the xpath query. Return the target
590
+ # you want or nil to use the default (first) target in results.
591
+ # @return [String, nil] nil if no nearest fragment or the nearest
592
+ # fragment's href e.g. '#about'.
593
+ # @raise [StandardError] Raises if no matching target element containing
594
+ # el_text can be found or if @html is empty.
595
+ def nearest_fragment(el_text, el_type = "*")
596
+ raise "The @html is empty" if @html.empty?
597
+
598
+ xpath_query = "//#{el_type}[text()[contains(.,\"#{el_text}\")]]"
599
+ results = xpath(xpath_query)
600
+ return nil if results.empty?
601
+
602
+ target = results.first
603
+ if block_given?
604
+ result = yield(results)
605
+ target = result if result
606
+ end
607
+
608
+ target_index = html_index(target)
609
+ raise 'Failed to find target index' unless target_index
610
+
611
+ fragment_h = fragment_indices(fragments)
612
+
613
+ # Return the target href if the target is itself a fragment.
614
+ return fragment_h[target_index] if fragment_h.keys.include?(target_index)
615
+
616
+ # Find the target's nearest preceding fragment href.
617
+ closest_index = 0
618
+ fragment_h.each do |fragment_index, href|
619
+ if fragment_index.between?(closest_index, target_index)
620
+ closest_index = fragment_index
621
+ end
622
+ end
623
+
624
+ fragment_h[closest_index]
555
625
  end
556
626
 
557
627
  protected
@@ -573,7 +643,8 @@ be relative"
573
643
  # Extracts a value/object from this Document's @html using the given xpath
574
644
  # parameter.
575
645
  #
576
- # @param xpath [String, #call] Used to find the value/object in @html.
646
+ # @param xpath [String, #call, nil] Used to find the value/object in @html.
647
+ # Passing nil will skip the HTML extraction which isn't always needed.
577
648
  # @param singleton [Boolean] singleton ? results.first (single Object) :
578
649
  # results (Enumerable).
579
650
  # @param text_content_only [Boolean] text_content_only ? result.content
@@ -588,8 +659,12 @@ be relative"
588
659
  # @return [String, Object] The value found in the html or the default value
589
660
  # (singleton ? nil : []).
590
661
  def extract_from_html(xpath, singleton: true, text_content_only: true)
591
- xpath = xpath.call if xpath.respond_to?(:call)
592
- result = singleton ? at_xpath(xpath) : xpath(xpath)
662
+ result = nil
663
+
664
+ if xpath
665
+ xpath = xpath.call if xpath.respond_to?(:call)
666
+ result = singleton ? at_xpath(xpath) : xpath(xpath)
667
+ end
593
668
 
594
669
  if result && text_content_only
595
670
  result = singleton ? result.content : result.map(&:content)
@@ -647,7 +722,8 @@ be relative"
647
722
  # Dynamically run the init_*_from_html methods.
648
723
  Document.private_instance_methods(false).each do |method|
649
724
  if method.to_s.start_with?('init_') &&
650
- method.to_s.end_with?('_from_html') && method != __method__
725
+ method.to_s.end_with?('_from_html') &&
726
+ method != __method__
651
727
  send(method)
652
728
  end
653
729
  end
@@ -658,12 +734,14 @@ be relative"
658
734
  def init_from_object(obj, encode: true)
659
735
  assert_respond_to(obj, :fetch)
660
736
 
661
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
737
+ url = obj.fetch('url') # Should always be present.
738
+ raise "Missing 'url' field in doc object" unless url
739
+
740
+ @url = Wgit::Url.new(url)
662
741
  @html = obj.fetch('html', '')
663
742
  @parser = init_nokogiri
664
743
  @score = obj.fetch('score', 0.0)
665
-
666
- @html = Wgit::Utils.sanitize(@html, encode:)
744
+ @html = Wgit::Utils.sanitize(@html, encode:)
667
745
 
668
746
  # Dynamically run the init_*_from_object methods.
669
747
  Document.private_instance_methods(false).each do |method|
@@ -691,6 +769,38 @@ be relative"
691
769
  var_name
692
770
  end
693
771
 
772
+ # Returns all <a> fragment elements from within the HTML body e.g. #about.
773
+ def fragments
774
+ anchors = xpath("/html/body//a")
775
+
776
+ anchors.select do |anchor|
777
+ href = anchor.attributes['href']&.value
778
+ href&.start_with?('#')
779
+ end
780
+ end
781
+
782
+ # Returns a Hash{Int=>String} of <a> fragment positions and their href
783
+ # values. Only fragment anchors are returned e.g. <a> elements with a
784
+ # href starting with '#'.
785
+ def fragment_indices(fragments)
786
+ fragments.reduce({}) do |hash, fragment|
787
+ index = html_index(fragment)
788
+ next hash unless index
789
+
790
+ href = fragment.attributes['href']&.value
791
+ hash[index] = href
792
+
793
+ hash
794
+ end
795
+ end
796
+
797
+ # Takes a Nokogiri element or HTML substring and returns its index in
798
+ # the html. Returns the index/position Int or nil if not found. The search
799
+ # is case insensitive because Nokogiri lower cases camelCase attributes.
800
+ def html_index(el_or_str)
801
+ @html.downcase.index(el_or_str.to_s.strip.downcase)
802
+ end
803
+
694
804
  alias_method :content, :html
695
805
  alias_method :statistics, :stats
696
806
  alias_method :internal_urls, :internal_links
@@ -2,24 +2,10 @@
2
2
 
3
3
  ### Default Document Extractors ###
4
4
 
5
- # No index.
6
- Wgit::Document.define_extractor(
7
- :meta_robots,
8
- '//meta[@name="robots"]/@content',
9
- singleton: true,
10
- text_content_only: true
11
- )
12
- Wgit::Document.define_extractor(
13
- :meta_wgit,
14
- '//meta[@name="wgit"]/@content',
15
- singleton: true,
16
- text_content_only: true
17
- )
18
-
19
5
  # Base.
20
6
  Wgit::Document.define_extractor(
21
7
  :base,
22
- '//base/@href',
8
+ "//base/@href",
23
9
  singleton: true,
24
10
  text_content_only: true
25
11
  ) do |base|
@@ -29,7 +15,7 @@ end
29
15
  # Title.
30
16
  Wgit::Document.define_extractor(
31
17
  :title,
32
- '//title',
18
+ "//title",
33
19
  singleton: true,
34
20
  text_content_only: true
35
21
  )
@@ -57,17 +43,18 @@ Wgit::Document.define_extractor(
57
43
  singleton: true,
58
44
  text_content_only: true
59
45
  ) do |keywords, _source, type|
60
- if keywords && (type == :document)
61
- keywords = keywords.split(',')
46
+ if keywords && type == :document
47
+ keywords = keywords.split(",")
62
48
  keywords = Wgit::Utils.sanitize(keywords)
63
49
  end
50
+
64
51
  keywords
65
52
  end
66
53
 
67
54
  # Links.
68
55
  Wgit::Document.define_extractor(
69
56
  :links,
70
- '//a/@href',
57
+ "//a/@href",
71
58
  singleton: false,
72
59
  text_content_only: true
73
60
  ) do |links|
@@ -79,7 +66,12 @@ end
79
66
  # Text.
80
67
  Wgit::Document.define_extractor(
81
68
  :text,
82
- proc { Wgit::Document.text_elements_xpath },
83
- singleton: false,
84
- text_content_only: true
85
- )
69
+ nil # doc.parser contains all HTML so omit the xpath search.
70
+ ) do |text, doc, type|
71
+ if type == :document
72
+ html_to_text = Wgit::HTMLToText.new(doc.parser)
73
+ text = html_to_text.extract
74
+ end
75
+
76
+ text
77
+ end