wgit 0.10.8 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,147 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../assertable"
4
+ require_relative "../url"
5
+ require_relative "../document"
6
+ require_relative "../model"
7
+
8
+ module Wgit::Database
9
+ # The parent DatabaseAdapter class that should be inherited from when
10
+ # creating an underlying Database adapter implementation class e.g.
11
+ # Wgit::Database::MongoDB.
12
+ #
13
+ # Listed in this class are the methods that an implementer class must
14
+ # implement to work with Wgit. Failure to do so will result in a
15
+ # NotImplementedError being raised.
16
+ #
17
+ # While not required, implementing the method `#search_fields=(fields)` in an
18
+ # adapter class will allow `Wgit::Model.set_search_fields` to call
19
+ # it. This allows the search fields to be set in one method call, from within
20
+ # the Wgit::Model class. See this method's docs for more info.
21
+ #
22
+ # Also listed in this class are common helper methods available to all
23
+ # Database implementer subclasses.
24
+ class DatabaseAdapter
25
+ include Wgit::Assertable
26
+
27
+ # The NotImplementedError message that gets raised if an implementor class
28
+ # doesn't implement a method required by Wgit.
29
+ NOT_IMPL_ERR = "The DatabaseAdapter class you're using hasn't \
30
+ implemented this method"
31
+
32
+ ###################### START OF INTERFACE METHODS ######################
33
+
34
+ # Initializes a DatabaseAdapter instance.
35
+ #
36
+ # The implementor class should establish a DB connection here using the
37
+ # given connection_string, falling back to `ENV['WGIT_CONNECTION_STRING']`.
38
+ # Don't forget to call `super`.
39
+ #
40
+ # @param connection_string [String] The connection string needed to connect
41
+ # to the database.
42
+ # @raise [StandardError] If a connection string isn't provided, either as a
43
+ # parameter or via the environment.
44
+ def initialize(connection_string = nil); end
45
+
46
+ # Returns the current size of the database.
47
+ #
48
+ # @return [Integer] The current size of the DB.
49
+ def size
50
+ raise NotImplementedError, NOT_IMPL_ERR
51
+ end
52
+
53
+ # Searches the database's Documents for the given query. The
54
+ # `Wgit::Model.search_fields` should be searched for matches
55
+ # against the given query. Documents should be sorted starting with the
56
+ # most relevant. Each returned Document should have its `score` field set
57
+ # for relevance.
58
+ #
59
+ # @param query [String] The text query to search with.
60
+ # @param case_sensitive [Boolean] Whether character case must match.
61
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
62
+ # for separately.
63
+ # @param limit [Integer] The max number of results to return.
64
+ # @param skip [Integer] The number of results to skip.
65
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
66
+ # DB.
67
+ # @return [Array<Wgit::Document>] The search results obtained from the DB.
68
+ def search(
69
+ query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
70
+ )
71
+ raise NotImplementedError, NOT_IMPL_ERR
72
+ end
73
+
74
+ # Deletes everything in the urls and documents collections.
75
+ #
76
+ # @return [Integer] The number of deleted records.
77
+ def empty
78
+ raise NotImplementedError, NOT_IMPL_ERR
79
+ end
80
+
81
+ # Returns Url records that haven't yet been crawled.
82
+ #
83
+ # @param limit [Integer] The max number of Urls to return. 0 returns all.
84
+ # @param skip [Integer] Skip n amount of Urls.
85
+ # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
86
+ # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
87
+ def uncrawled_urls(limit: 0, skip: 0)
88
+ raise NotImplementedError, NOT_IMPL_ERR
89
+ end
90
+
91
+ # Inserts or updates the object in the database.
92
+ #
93
+ # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
94
+ # @return [Boolean] True if inserted, false if updated.
95
+ def upsert(obj)
96
+ raise NotImplementedError, NOT_IMPL_ERR
97
+ end
98
+
99
+ # Bulk upserts the objects in the database collection.
100
+ # You cannot mix collection objs types, all must be Urls or Documents.
101
+ #
102
+ # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
103
+ # inserted/updated.
104
+ # @return [Integer] The total number of newly inserted objects.
105
+ def bulk_upsert(objs)
106
+ raise NotImplementedError, NOT_IMPL_ERR
107
+ end
108
+
109
+ ###################### END OF INTERFACE METHODS ######################
110
+
111
+ private
112
+
113
+ # Returns the correct Wgit::Model for the given obj type.
114
+ #
115
+ # @param obj [Wgit::Url, Wgit::Document] The obj to obtain a model for.
116
+ # @return [Hash] The obj model.
117
+ def build_model(obj)
118
+ assert_type(obj, [Wgit::Url, Wgit::Document])
119
+
120
+ if obj.is_a?(Wgit::Url)
121
+ Wgit::Model.url(obj)
122
+ else
123
+ Wgit::Model.document(obj)
124
+ end
125
+ end
126
+
127
+ # Map each DB hash object into a Wgit::Document. Each Document is yielded
128
+ # if a block is given before returning the mapped Array of Documents.
129
+ def map_documents(doc_hashes)
130
+ doc_hashes.map do |doc|
131
+ doc = Wgit::Document.new(doc)
132
+ yield(doc) if block_given?
133
+ doc
134
+ end
135
+ end
136
+
137
+ # Map each DB hash object into a Wgit::Url. Each Url is yielded
138
+ # if a block is given before returning the mapped Array of Urls.
139
+ def map_urls(url_hashes)
140
+ url_hashes.map do |url|
141
+ url = Wgit::Url.new(url)
142
+ yield(url) if block_given?
143
+ url
144
+ end
145
+ end
146
+ end
147
+ end
data/lib/wgit/document.rb CHANGED
@@ -3,12 +3,12 @@ require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
5
  require 'json'
6
- require 'set'
7
6
 
8
7
  module Wgit
9
8
  # Class modeling/serialising a HTML web document, although other MIME types
10
9
  # will work e.g. images etc. Also doubles as a search result when
11
- # loading Documents from the database via `Wgit::Database#search`.
10
+ # loading Documents from the database via
11
+ # `Wgit::Database::DatabaseAdapter#search`.
12
12
  #
13
13
  # The initialize method dynamically initializes instance variables from the
14
14
  # Document HTML / Database object e.g. text. This bit is dynamic so that the
@@ -18,25 +18,23 @@ module Wgit
18
18
  include Assertable
19
19
 
20
20
  # Regex for the allowed var names when defining an extractor.
21
- REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
21
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
22
22
 
23
- # Set of text elements used to build Document#text.
24
- @text_elements = Set.new(%i[
25
- a abbr address article aside b bdi bdo blockquote button caption cite
26
- code data dd del details dfn div dl dt em figcaption figure footer h1 h2
27
- h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
28
- option output p pre q rb rt ruby s samp section small span strong sub
29
- summary sup td textarea th time u ul var wbr
30
- ])
23
+ # Instance vars to be ignored by Document#to_h and in turn
24
+ # Wgit::Model.document.
25
+ @to_h_ignore_vars = [
26
+ '@parser' # Always ignore the Nokogiri object.
27
+ ]
31
28
 
32
29
  # Set of Symbols representing the defined Document extractors.
33
30
  @extractors = Set.new
34
31
 
35
32
  class << self
36
- # Set of HTML elements that make up the visible text on a page. These
37
- # elements are used to initialize the Wgit::Document#text. See the
38
- # README.md for how to add to this Set dynamically.
39
- attr_reader :text_elements
33
+ # Array of instance vars to ignore when Document#to_h and (in turn)
34
+ # Wgit::Model.document methods are called. Append your own defined extractor
35
+ # vars to omit them from the model (database object) when indexing.
36
+ # Each var should be a String starting with an '@' char e.g. "@data" etc.
37
+ attr_reader :to_h_ignore_vars
40
38
 
41
39
  # Set of Symbols representing the defined Document extractors. Is
42
40
  # read-only. Use Wgit::Document.define_extractor for a new extractor.
@@ -52,7 +50,7 @@ module Wgit
52
50
  # The Nokogiri::HTML document object initialized from @html.
53
51
  attr_reader :parser
54
52
 
55
- # The score is only used following a `Database#search` and records matches.
53
+ # The score is set/used following a `Database#search` and records matches.
56
54
  attr_reader :score
57
55
 
58
56
  # Initialize takes either two strings (representing the URL and HTML) or an
@@ -76,25 +74,14 @@ module Wgit
76
74
  # false if the Document content is an image etc.
77
75
  def initialize(url_or_obj, html = '', encode: true)
78
76
  if url_or_obj.is_a?(String)
79
- init_from_strings(url_or_obj, html, encode: encode)
77
+ init_from_strings(url_or_obj, html, encode:)
80
78
  else
81
- init_from_object(url_or_obj, encode: encode)
79
+ init_from_object(url_or_obj, encode:)
82
80
  end
83
81
  end
84
82
 
85
83
  ### Document Class Methods ###
86
84
 
87
- # Uses Document.text_elements to build an xpath String, used to obtain
88
- # all of the combined visual text on a webpage.
89
- #
90
- # @return [String] An xpath String to obtain a webpage's text elements.
91
- def self.text_elements_xpath
92
- Wgit::Document.text_elements.each_with_index.reduce('') do |xpath, (el, i)|
93
- xpath += ' | ' unless i.zero?
94
- xpath += format('//%s/text()', el)
95
- end
96
- end
97
-
98
85
  # Defines a content extractor, which extracts HTML elements/content
99
86
  # into instance variables upon Document initialization. See the default
100
87
  # extractors defined in 'document_extractors.rb' as examples. Defining an
@@ -118,8 +105,9 @@ module Wgit
118
105
  # @param var [Symbol] The name of the variable to be initialised, that will
119
106
  # contain the extracted content. A getter and setter method is defined
120
107
  # for the initialised variable.
121
- # @param xpath [String, #call] The xpath used to find the element(s)
122
- # of the webpage. Only used when initializing from HTML.
108
+ # @param xpath [String, #call, nil] The xpath used to find the element(s)
109
+ # of the webpage. Only used when initializing from HTML. Passing nil will
110
+ # skip the HTML extraction, which sometimes isn't required.
123
111
  #
124
112
  # Pass a callable object (proc etc.) if you want the
125
113
  # xpath value to be derived on Document initialisation (instead of when
@@ -210,7 +198,7 @@ module Wgit
210
198
  #
211
199
  # @return [String] A short textual representation of this Document.
212
200
  def inspect
213
- "#<Wgit::Document url=\"#{@url}\" html=#{size} bytes>"
201
+ "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
214
202
  end
215
203
 
216
204
  # Determines if both the url and html match. Use
@@ -241,10 +229,10 @@ module Wgit
241
229
  # Provide the `link:` parameter to get the correct base URL for that type
242
230
  # of link. For example, a link of `#top` would always return @url because
243
231
  # it applies to that page, not a different one. Query strings work in the
244
- # same way. Use this parameter if manually concatting Url's e.g.
232
+ # same way. Use this parameter if manually joining Urls e.g.
245
233
  #
246
234
  # relative_link = Wgit::Url.new('?q=hello')
247
- # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
235
+ # absolute_link = doc.base_url(link: relative_link).join(relative_link)
248
236
  #
249
237
  # This is similar to how Wgit::Document#internal_absolute_links works.
250
238
  #
@@ -264,7 +252,7 @@ module Wgit
264
252
  be relative"
265
253
  end
266
254
 
267
- get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
255
+ get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
268
256
 
269
257
  if link
270
258
  link = Wgit::Url.new(link)
@@ -288,11 +276,11 @@ be relative"
288
276
  # returned Hash.
289
277
  # @return [Hash] Containing self's instance vars.
290
278
  def to_h(include_html: false, include_score: true)
291
- ignore = include_html ? [] : ['@html']
279
+ ignore = Wgit::Document.to_h_ignore_vars.dup
280
+ ignore << '@html' unless include_html
292
281
  ignore << '@score' unless include_score
293
- ignore << '@parser' # Always ignore the Nokogiri object.
294
282
 
295
- Wgit::Utils.to_h(self, ignore: ignore)
283
+ Wgit::Utils.to_h(self, ignore:)
296
284
  end
297
285
 
298
286
  # Converts this Document's #to_h return value to a JSON String.
@@ -301,7 +289,7 @@ be relative"
301
289
  # returned JSON String.
302
290
  # @return [String] This Document represented as a JSON String.
303
291
  def to_json(include_html: false)
304
- h = to_h(include_html: include_html)
292
+ h = to_h(include_html:)
305
293
  JSON.generate(h)
306
294
  end
307
295
 
@@ -323,7 +311,7 @@ be relative"
323
311
  else
324
312
  next unless instance_variable_get(var).respond_to?(:length)
325
313
 
326
- hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
314
+ hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
327
315
  end
328
316
  end
329
317
 
@@ -431,17 +419,18 @@ be relative"
431
419
  end
432
420
  end
433
421
  .reject { |link| link.relative?(host: @url.to_origin) }
434
- .map(&:omit_trailing_slash)
435
422
 
436
423
  Wgit::Utils.sanitize(links)
437
424
  end
438
425
 
439
- # Searches the @text for the given query and returns the results.
426
+ # Searches the Document's instance vars for the given query and returns
427
+ # the results. The `Wgit::Model.search_fields` denote the vars to be
428
+ # searched, unless overridden using the search_fields: param.
440
429
  #
441
- # The number of search hits for each sentenence are recorded internally
430
+ # The number of matches for each search field is recorded internally
442
431
  # and used to rank/sort the search results before being returned. Where
443
- # the Wgit::Database#search method search all documents for the most hits,
444
- # this method searches each document's @text for the most hits.
432
+ # the Wgit::Database::DatabaseAdapter#search method searches all documents
433
+ # for matches, this method searches each individual Document for matches.
445
434
  #
446
435
  # Each search result consists of a sentence of a given length. The length
447
436
  # will be based on the sentence_limit parameter or the full length of the
@@ -449,51 +438,86 @@ be relative"
449
438
  # that the search query is visible somewhere in the sentence.
450
439
  #
451
440
  # @param query [Regexp, #to_s] The regex or text value to search the
452
- # document's @text for.
441
+ # document's instance vars (Wgit::Model.search_fields) for.
453
442
  # @param case_sensitive [Boolean] Whether character case must match.
454
443
  # @param whole_sentence [Boolean] Whether multiple words should be searched
455
444
  # for separately.
456
445
  # @param sentence_limit [Integer] The max length of each search result
457
446
  # sentence.
458
- # @return [Array<String>] A subset of @text, matching the query.
447
+ # @param search_fields [Hash<Symbol, Integer>] The Document instance vars
448
+ # to search and the weight for a match (used to determine relevance).
449
+ # This should only be set for custom one-off Document searches. For
450
+ # permanent changing of search fields, see Wgit::Model.set_search_fields.
451
+ # @yield [results_hash] Given the results_hash containing each search
452
+ # result (String) and its score (num_matches * weight).
453
+ # @return [Array<String>] A subset of this document's instance vars,
454
+ # matching the query for the search_fields: param.
459
455
  def search(
460
- query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
456
+ query, case_sensitive: false, whole_sentence: true,
457
+ sentence_limit: 80, search_fields: Wgit::Model.search_fields
461
458
  )
462
459
  raise 'The sentence_limit value must be even' if sentence_limit.odd?
460
+ assert_type(search_fields, Hash)
463
461
 
464
- if query.is_a?(Regexp)
465
- regex = query
466
- else # query.respond_to? :to_s == true
467
- query = query.to_s
468
- query = query.gsub(' ', '|') unless whole_sentence
469
- regex = Regexp.new(query, !case_sensitive)
470
- end
471
-
462
+ regex = Wgit::Utils.build_search_regex(
463
+ query, case_sensitive:, whole_sentence:)
472
464
  results = {}
473
465
 
474
- @text.each do |sentence|
475
- sentence = sentence.strip
476
- next if results[sentence]
466
+ search_fields.each do |field, weight|
467
+ doc_field = instance_variable_get("@#{field}".to_sym)
468
+ next unless doc_field
469
+
470
+ Wgit::Utils.each(doc_field) do |text|
471
+ assert_type(text, String)
477
472
 
478
- hits = sentence.scan(regex).count
479
- next unless hits.positive?
473
+ text = text.strip
474
+ matches = text.scan(regex).count
475
+ next unless matches.positive?
480
476
 
481
- index = sentence.index(regex) # Index of first match.
482
- Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
477
+ index = text.index(regex) # Index of first match.
478
+ Wgit::Utils.format_sentence_length(text, index, sentence_limit)
483
479
 
484
- results[sentence] = hits
480
+ # For duplicate matching text, total the text score.
481
+ text_score = matches * weight
482
+ existing_score = results[text]
483
+ text_score += existing_score if existing_score
484
+
485
+ results[text] = text_score
486
+ end
485
487
  end
486
488
 
487
489
  return [] if results.empty?
488
490
 
489
- results = Hash[results.sort_by { |_k, v| v }]
490
- results.keys.reverse
491
+ yield results if block_given?
492
+
493
+ # Return only the matching text sentences, sorted by relevance.
494
+ Hash[results.sort_by { |_, score| -score }].keys
495
+ end
496
+
497
+ # Performs a text only search of the Document, instead of searching all
498
+ # search fields defined in Wgit::Model.search_fields.
499
+ #
500
+ # @param query [Regexp, #to_s] The regex or text value to search the
501
+ # document's text for.
502
+ # @param case_sensitive [Boolean] Whether character case must match.
503
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
504
+ # for separately.
505
+ # @param sentence_limit [Integer] The max length of each search result
506
+ # sentence.
507
+ # @return [Array<String>] A subset of this document's text fields that
508
+ # match the query.
509
+ def search_text(
510
+ query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
511
+ )
512
+ search(
513
+ query, case_sensitive:, whole_sentence:,
514
+ sentence_limit:, search_fields: { text: 1 })
491
515
  end
492
516
 
493
- # Performs a text search (see Document#search for details) but assigns the
494
- # results to the @text instance variable. This can be used for sub search
495
- # functionality. The original text is returned; no other reference to it
496
- # is kept thereafter.
517
+ # Performs a text only search (see Document#search_text for details) but
518
+ # assigns the results to the @text instance variable. This can be used
519
+ # for sub search functionality. The original text is returned; no other
520
+ # reference to it is kept thereafter.
497
521
  #
498
522
  # @param query [Regexp, #to_s] The regex or text value to search the
499
523
  # document's @text for.
@@ -503,14 +527,11 @@ be relative"
503
527
  # @param sentence_limit [Integer] The max length of each search result
504
528
  # sentence.
505
529
  # @return [String] This Document's original @text value.
506
- def search!(
530
+ def search_text!(
507
531
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
508
532
  )
509
533
  orig_text = @text
510
- @text = search(
511
- query, case_sensitive: case_sensitive,
512
- whole_sentence: whole_sentence, sentence_limit: sentence_limit
513
- )
534
+ @text = search_text(query, case_sensitive:, whole_sentence:, sentence_limit:)
514
535
 
515
536
  orig_text
516
537
  end
@@ -533,11 +554,74 @@ be relative"
533
554
  # @return [String, Object] The value found in the html or the default value
534
555
  # (singleton ? nil : []).
535
556
  def extract(xpath, singleton: true, text_content_only: true, &block)
536
- send(
537
- :extract_from_html, xpath,
538
- singleton: singleton, text_content_only: text_content_only,
539
- &block
557
+ send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
558
+ end
559
+
560
+ # Attempts to extract and check the HTML meta tags instructing Wgit not to
561
+ # index this document (save it to a Database).
562
+ #
563
+ # @return [Boolean] True if this document shouldn't be saved to a Database,
564
+ # false otherwise.
565
+ def no_index?
566
+ meta_robots = extract_from_html(
567
+ '//meta[@name="robots"]/@content',
568
+ singleton: true,
569
+ text_content_only: true
570
+ )
571
+ meta_wgit = extract_from_html(
572
+ '//meta[@name="wgit"]/@content',
573
+ singleton: true,
574
+ text_content_only: true
540
575
  )
576
+
577
+ [meta_robots, meta_wgit].include?('noindex')
578
+ end
579
+
580
+ # Firstly finds the target element whose text contains el_text.
581
+ # Then finds the preceding fragment element nearest to the target
582
+ # element and returns its href value (starting with #). The search is
583
+ # performed against the @html so Documents loaded from a DB will need to
584
+ # contain the 'html' field in the Wgit::Model. See the
585
+ # `Wgit::Model#include_doc_html` documentation for more info.
586
+ #
587
+ # @param el_text [String] The element text of the target element.
588
+ # @param el_type [String] The element type, defaulting to any type.
589
+ # @yield [results] Given the results of the xpath query. Return the target
590
+ # you want or nil to use the default (first) target in results.
591
+ # @return [String, nil] nil if no nearest fragment or the nearest
592
+ # fragment's href e.g. '#about'.
593
+ # @raise [StandardError] Raises if no matching target element containing
594
+ # el_text can be found or if @html is empty.
595
+ def nearest_fragment(el_text, el_type = "*")
596
+ raise "The @html is empty" if @html.empty?
597
+
598
+ xpath_query = "//#{el_type}[text()[contains(.,\"#{el_text}\")]]"
599
+ results = xpath(xpath_query)
600
+ return nil if results.empty?
601
+
602
+ target = results.first
603
+ if block_given?
604
+ result = yield(results)
605
+ target = result if result
606
+ end
607
+
608
+ target_index = html_index(target)
609
+ raise 'Failed to find target index' unless target_index
610
+
611
+ fragment_h = fragment_indices(fragments)
612
+
613
+ # Return the target href if the target is itself a fragment.
614
+ return fragment_h[target_index] if fragment_h.keys.include?(target_index)
615
+
616
+ # Find the target's nearest preceding fragment href.
617
+ closest_index = 0
618
+ fragment_h.each do |fragment_index, href|
619
+ if fragment_index.between?(closest_index, target_index)
620
+ closest_index = fragment_index
621
+ end
622
+ end
623
+
624
+ fragment_h[closest_index]
541
625
  end
542
626
 
543
627
  protected
@@ -559,7 +643,8 @@ be relative"
559
643
  # Extracts a value/object from this Document's @html using the given xpath
560
644
  # parameter.
561
645
  #
562
- # @param xpath [String, #call] Used to find the value/object in @html.
646
+ # @param xpath [String, #call, nil] Used to find the value/object in @html.
647
+ # Passing nil will skip the HTML extraction which isn't always needed.
563
648
  # @param singleton [Boolean] singleton ? results.first (single Object) :
564
649
  # results (Enumerable).
565
650
  # @param text_content_only [Boolean] text_content_only ? result.content
@@ -574,14 +659,18 @@ be relative"
574
659
  # @return [String, Object] The value found in the html or the default value
575
660
  # (singleton ? nil : []).
576
661
  def extract_from_html(xpath, singleton: true, text_content_only: true)
577
- xpath = xpath.call if xpath.respond_to?(:call)
578
- result = singleton ? at_xpath(xpath) : xpath(xpath)
662
+ result = nil
663
+
664
+ if xpath
665
+ xpath = xpath.call if xpath.respond_to?(:call)
666
+ result = singleton ? at_xpath(xpath) : xpath(xpath)
667
+ end
579
668
 
580
669
  if result && text_content_only
581
670
  result = singleton ? result.content : result.map(&:content)
582
671
  end
583
672
 
584
- Wgit::Utils.sanitize(result)
673
+ result = Wgit::Utils.sanitize(result)
585
674
  result = yield(result, self, :document) if block_given?
586
675
  result
587
676
  end
@@ -608,7 +697,7 @@ be relative"
608
697
  default = singleton ? nil : []
609
698
  result = obj.fetch(key.to_s, default)
610
699
 
611
- Wgit::Utils.sanitize(result)
700
+ result = Wgit::Utils.sanitize(result)
612
701
  result = yield(result, obj, :object) if block_given?
613
702
  result
614
703
  end
@@ -628,13 +717,14 @@ be relative"
628
717
  @parser = init_nokogiri
629
718
  @score = 0.0
630
719
 
631
- Wgit::Utils.sanitize(@html, encode: encode)
720
+ @html = Wgit::Utils.sanitize(@html, encode:)
632
721
 
633
722
  # Dynamically run the init_*_from_html methods.
634
723
  Document.private_instance_methods(false).each do |method|
635
724
  if method.to_s.start_with?('init_') &&
636
- method.to_s.end_with?('_from_html')
637
- send(method) unless method == __method__
725
+ method.to_s.end_with?('_from_html') &&
726
+ method != __method__
727
+ send(method)
638
728
  end
639
729
  end
640
730
  end
@@ -644,18 +734,20 @@ be relative"
644
734
  def init_from_object(obj, encode: true)
645
735
  assert_respond_to(obj, :fetch)
646
736
 
647
- @url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
737
+ url = obj.fetch('url') # Should always be present.
738
+ raise "Missing 'url' field in doc object" unless url
739
+
740
+ @url = Wgit::Url.new(url)
648
741
  @html = obj.fetch('html', '')
649
742
  @parser = init_nokogiri
650
743
  @score = obj.fetch('score', 0.0)
651
-
652
- Wgit::Utils.sanitize(@html, encode: encode)
744
+ @html = Wgit::Utils.sanitize(@html, encode:)
653
745
 
654
746
  # Dynamically run the init_*_from_object methods.
655
747
  Document.private_instance_methods(false).each do |method|
656
748
  if method.to_s.start_with?('init_') &&
657
- method.to_s.end_with?('_from_object')
658
- send(method, obj) unless method == __method__
749
+ method.to_s.end_with?('_from_object') && method != __method__
750
+ send(method, obj)
659
751
  end
660
752
  end
661
753
  end
@@ -668,7 +760,7 @@ be relative"
668
760
  def init_var(var, value)
669
761
  # instance_var_name starts with @, var_name doesn't.
670
762
  var = var.to_s
671
- var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
763
+ var_name = (var.start_with?('@') ? var[1..] : var).to_sym
672
764
  instance_var_name = "@#{var_name}".to_sym
673
765
 
674
766
  instance_variable_set(instance_var_name, value)
@@ -677,10 +769,42 @@ be relative"
677
769
  var_name
678
770
  end
679
771
 
680
- alias content html
681
- alias statistics stats
682
- alias internal_urls internal_links
683
- alias internal_absolute_urls internal_absolute_links
684
- alias external_urls external_links
772
+ # Returns all <a> fragment elements from within the HTML body e.g. #about.
773
+ def fragments
774
+ anchors = xpath("/html/body//a")
775
+
776
+ anchors.select do |anchor|
777
+ href = anchor.attributes['href']&.value
778
+ href&.start_with?('#')
779
+ end
780
+ end
781
+
782
+ # Returns a Hash{Int=>String} of <a> fragment positions and their href
783
+ # values. Only fragment anchors are returned e.g. <a> elements with a
784
+ # href starting with '#'.
785
+ def fragment_indices(fragments)
786
+ fragments.reduce({}) do |hash, fragment|
787
+ index = html_index(fragment)
788
+ next hash unless index
789
+
790
+ href = fragment.attributes['href']&.value
791
+ hash[index] = href
792
+
793
+ hash
794
+ end
795
+ end
796
+
797
+ # Takes a Nokogiri element or HTML substring and returns its index in
798
+ # the html. Returns the index/position Int or nil if not found. The search
799
+ # is case insensitive because Nokogiri lower cases camelCase attributes.
800
+ def html_index(el_or_str)
801
+ @html.downcase.index(el_or_str.to_s.strip.downcase)
802
+ end
803
+
804
+ alias_method :content, :html
805
+ alias_method :statistics, :stats
806
+ alias_method :internal_urls, :internal_links
807
+ alias_method :internal_absolute_urls, :internal_absolute_links
808
+ alias_method :external_urls, :external_links
685
809
  end
686
810
  end