wgit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -0
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +27 -24
- data/bin/wgit +72 -18
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +91 -20
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -663
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +187 -77
- data/lib/wgit/document_extractors.rb +15 -23
- data/lib/wgit/dsl.rb +64 -67
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +29 -10
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +5 -8
- data/lib/wgit/robots_parser.rb +8 -8
- data/lib/wgit/url.rb +38 -38
- data/lib/wgit/utils.rb +124 -14
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -14
- metadata +74 -30
- data/lib/wgit/database/model.rb +0 -60
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../assertable"
|
|
4
|
+
require_relative "../url"
|
|
5
|
+
require_relative "../document"
|
|
6
|
+
require_relative "../model"
|
|
7
|
+
|
|
8
|
+
module Wgit::Database
|
|
9
|
+
# The parent DatabaseAdapter class that should be inherited from when
|
|
10
|
+
# creating an underlying Database adapter implementation class e.g.
|
|
11
|
+
# Wgit::Database::MongoDB.
|
|
12
|
+
#
|
|
13
|
+
# Listed in this class are the methods that an implementer class must
|
|
14
|
+
# implement to work with Wgit. Failure to do so will result in a
|
|
15
|
+
# NotImplementedError being raised.
|
|
16
|
+
#
|
|
17
|
+
# While not required, implementing the method `#search_fields=(fields)` in an
|
|
18
|
+
# adapter class will allow `Wgit::Model.set_search_fields` to call
|
|
19
|
+
# it. This allows the search fields to be set in one method call, from within
|
|
20
|
+
# the Wgit::Model class. See this method's docs for more info.
|
|
21
|
+
#
|
|
22
|
+
# Also listed in this class are common helper methods available to all
|
|
23
|
+
# Database implementer subclasses.
|
|
24
|
+
class DatabaseAdapter
|
|
25
|
+
include Wgit::Assertable
|
|
26
|
+
|
|
27
|
+
# The NotImplementedError message that gets raised if an implementor class
|
|
28
|
+
# doesn't implement a method required by Wgit.
|
|
29
|
+
NOT_IMPL_ERR = "The DatabaseAdapter class you're using hasn't \
|
|
30
|
+
implemented this method"
|
|
31
|
+
|
|
32
|
+
###################### START OF INTERFACE METHODS ######################
|
|
33
|
+
|
|
34
|
+
# Initializes a DatabaseAdapter instance.
|
|
35
|
+
#
|
|
36
|
+
# The implementor class should establish a DB connection here using the
|
|
37
|
+
# given connection_string, falling back to `ENV['WGIT_CONNECTION_STRING']`.
|
|
38
|
+
# Don't forget to call `super`.
|
|
39
|
+
#
|
|
40
|
+
# @param connection_string [String] The connection string needed to connect
|
|
41
|
+
# to the database.
|
|
42
|
+
# @raise [StandardError] If a connection string isn't provided, either as a
|
|
43
|
+
# parameter or via the environment.
|
|
44
|
+
def initialize(connection_string = nil); end
|
|
45
|
+
|
|
46
|
+
# Returns the current size of the database.
|
|
47
|
+
#
|
|
48
|
+
# @return [Integer] The current size of the DB.
|
|
49
|
+
def size
|
|
50
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Searches the database's Documents for the given query. The
|
|
54
|
+
# `Wgit::Model.search_fields` should be searched for matches
|
|
55
|
+
# against the given query. Documents should be sorted starting with the
|
|
56
|
+
# most relevant. Each returned Document should have its `score` field set
|
|
57
|
+
# for relevance.
|
|
58
|
+
#
|
|
59
|
+
# @param query [String] The text query to search with.
|
|
60
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
|
61
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
|
62
|
+
# for separately.
|
|
63
|
+
# @param limit [Integer] The max number of results to return.
|
|
64
|
+
# @param skip [Integer] The number of results to skip.
|
|
65
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
|
66
|
+
# DB.
|
|
67
|
+
# @return [Array<Wgit::Document>] The search results obtained from the DB.
|
|
68
|
+
def search(
|
|
69
|
+
query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
|
|
70
|
+
)
|
|
71
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Deletes everything in the urls and documents collections.
|
|
75
|
+
#
|
|
76
|
+
# @return [Integer] The number of deleted records.
|
|
77
|
+
def empty
|
|
78
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Returns Url records that haven't yet been crawled.
|
|
82
|
+
#
|
|
83
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
|
84
|
+
# @param skip [Integer] Skip n amount of Url's.
|
|
85
|
+
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
|
|
86
|
+
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
|
|
87
|
+
def uncrawled_urls(limit: 0, skip: 0)
|
|
88
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Inserts or updates the object in the database.
|
|
92
|
+
#
|
|
93
|
+
# @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
|
|
94
|
+
# @return [Boolean] True if inserted, false if updated.
|
|
95
|
+
def upsert(obj)
|
|
96
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# Bulk upserts the objects in the database collection.
|
|
100
|
+
# You cannot mix collection objs types, all must be Urls or Documents.
|
|
101
|
+
#
|
|
102
|
+
# @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
|
|
103
|
+
# inserted/updated.
|
|
104
|
+
# @return [Integer] The total number of newly inserted objects.
|
|
105
|
+
def bulk_upsert(objs)
|
|
106
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
###################### END OF INTERFACE METHODS ######################
|
|
110
|
+
|
|
111
|
+
private
|
|
112
|
+
|
|
113
|
+
# Returns the correct Wgit::Model for the given obj type.
|
|
114
|
+
#
|
|
115
|
+
# @param obj [Wgit::Url, Wgit::Document] The obj to obtain a model for.
|
|
116
|
+
# @return [Hash] The obj model.
|
|
117
|
+
def build_model(obj)
|
|
118
|
+
assert_type(obj, [Wgit::Url, Wgit::Document])
|
|
119
|
+
|
|
120
|
+
if obj.is_a?(Wgit::Url)
|
|
121
|
+
Wgit::Model.url(obj)
|
|
122
|
+
else
|
|
123
|
+
Wgit::Model.document(obj)
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
# Map each DB hash object into a Wgit::Document. Each Document is yielded
|
|
128
|
+
# if a block is given before returning the mapped Array of Documents.
|
|
129
|
+
def map_documents(doc_hashes)
|
|
130
|
+
doc_hashes.map do |doc|
|
|
131
|
+
doc = Wgit::Document.new(doc)
|
|
132
|
+
yield(doc) if block_given?
|
|
133
|
+
doc
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
# Map each DB hash object into a Wgit::Url. Each Url is yielded
|
|
138
|
+
# if a block is given before returning the mapped Array of Urls.
|
|
139
|
+
def map_urls(url_hashes)
|
|
140
|
+
url_hashes.map do |url|
|
|
141
|
+
url = Wgit::Url.new(url)
|
|
142
|
+
yield(url) if block_given?
|
|
143
|
+
url
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
end
|
data/lib/wgit/document.rb
CHANGED
|
@@ -7,7 +7,8 @@ require 'json'
|
|
|
7
7
|
module Wgit
|
|
8
8
|
# Class modeling/serialising a HTML web document, although other MIME types
|
|
9
9
|
# will work e.g. images etc. Also doubles as a search result when
|
|
10
|
-
# loading Documents from the database via
|
|
10
|
+
# loading Documents from the database via
|
|
11
|
+
# `Wgit::Database::DatabaseAdapter#search`.
|
|
11
12
|
#
|
|
12
13
|
# The initialize method dynamically initializes instance variables from the
|
|
13
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
|
@@ -19,33 +20,18 @@ module Wgit
|
|
|
19
20
|
# Regex for the allowed var names when defining an extractor.
|
|
20
21
|
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
|
|
21
22
|
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
a abbr address article aside b bdi bdo blockquote button caption cite
|
|
25
|
-
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
|
26
|
-
h3 h4 h5 h6 header hr i input ins kbd label legend li main mark meter ol
|
|
27
|
-
option output p pre q rb rt ruby s samp section small span strong sub
|
|
28
|
-
summary sup td textarea th time u ul var wbr
|
|
29
|
-
])
|
|
30
|
-
|
|
31
|
-
# Instance vars to be ignored by Document#to_h and in turn Model.document.
|
|
23
|
+
# Instance vars to be ignored by Document#to_h and in turn
|
|
24
|
+
# Wgit::Model.document.
|
|
32
25
|
@to_h_ignore_vars = [
|
|
33
|
-
'@parser'
|
|
34
|
-
'@meta_robots', # Used by #no_index?, ignore.
|
|
35
|
-
'@meta_wgit' # Used by #no_index?, ignore.
|
|
26
|
+
'@parser' # Always ignore the Nokogiri object.
|
|
36
27
|
]
|
|
37
28
|
|
|
38
29
|
# Set of Symbols representing the defined Document extractors.
|
|
39
30
|
@extractors = Set.new
|
|
40
31
|
|
|
41
32
|
class << self
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
# README.md for how to add to this Set dynamically.
|
|
45
|
-
attr_reader :text_elements
|
|
46
|
-
|
|
47
|
-
# Array of instance vars to ignore when Document#to_h and in turn
|
|
48
|
-
# Model.document methods are called. Append your own defined extractor
|
|
33
|
+
# Array of instance vars to ignore when Document#to_h and (in turn)
|
|
34
|
+
# Wgit::Model.document methods are called. Append your own defined extractor
|
|
49
35
|
# vars to omit them from the model (database object) when indexing.
|
|
50
36
|
# Each var should be a String starting with an '@' char e.g. "@data" etc.
|
|
51
37
|
attr_reader :to_h_ignore_vars
|
|
@@ -64,7 +50,7 @@ module Wgit
|
|
|
64
50
|
# The Nokogiri::HTML document object initialized from @html.
|
|
65
51
|
attr_reader :parser
|
|
66
52
|
|
|
67
|
-
# The score is
|
|
53
|
+
# The score is set/used following a `Database#search` and records matches.
|
|
68
54
|
attr_reader :score
|
|
69
55
|
|
|
70
56
|
# Initialize takes either two strings (representing the URL and HTML) or an
|
|
@@ -96,17 +82,6 @@ module Wgit
|
|
|
96
82
|
|
|
97
83
|
### Document Class Methods ###
|
|
98
84
|
|
|
99
|
-
# Uses Document.text_elements to build an xpath String, used to obtain
|
|
100
|
-
# all of the combined visual text on a webpage.
|
|
101
|
-
#
|
|
102
|
-
# @return [String] An xpath String to obtain a webpage's text elements.
|
|
103
|
-
def self.text_elements_xpath
|
|
104
|
-
@text_elements.each_with_index.reduce('') do |xpath, (el, i)|
|
|
105
|
-
xpath += ' | ' unless i.zero?
|
|
106
|
-
xpath + format('//%s/text()', el)
|
|
107
|
-
end
|
|
108
|
-
end
|
|
109
|
-
|
|
110
85
|
# Defines a content extractor, which extracts HTML elements/content
|
|
111
86
|
# into instance variables upon Document initialization. See the default
|
|
112
87
|
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
|
@@ -130,8 +105,9 @@ module Wgit
|
|
|
130
105
|
# @param var [Symbol] The name of the variable to be initialised, that will
|
|
131
106
|
# contain the extracted content. A getter and setter method is defined
|
|
132
107
|
# for the initialised variable.
|
|
133
|
-
# @param xpath [String, #call] The xpath used to find the element(s)
|
|
134
|
-
# of the webpage. Only used when initializing from HTML.
|
|
108
|
+
# @param xpath [String, #call, nil] The xpath used to find the element(s)
|
|
109
|
+
# of the webpage. Only used when initializing from HTML. Passing nil will
|
|
110
|
+
# skip the HTML extraction, which sometimes isn't required.
|
|
135
111
|
#
|
|
136
112
|
# Pass a callable object (proc etc.) if you want the
|
|
137
113
|
# xpath value to be derived on Document initialisation (instead of when
|
|
@@ -447,12 +423,14 @@ be relative"
|
|
|
447
423
|
Wgit::Utils.sanitize(links)
|
|
448
424
|
end
|
|
449
425
|
|
|
450
|
-
# Searches the
|
|
426
|
+
# Searches the Document's instance vars for the given query and returns
|
|
427
|
+
# the results. The `Wgit::Model.search_fields` denote the vars to be
|
|
428
|
+
# searched, unless overridden using the search_fields: param.
|
|
451
429
|
#
|
|
452
|
-
# The number of
|
|
430
|
+
# The number of matches for each search field is recorded internally
|
|
453
431
|
# and used to rank/sort the search results before being returned. Where
|
|
454
|
-
# the Wgit::Database#search method
|
|
455
|
-
# this method searches each
|
|
432
|
+
# the Wgit::Database::DatabaseAdapter#search method searches all documents
|
|
433
|
+
# for matches, this method searches each individual Document for matches.
|
|
456
434
|
#
|
|
457
435
|
# Each search result comprises of a sentence of a given length. The length
|
|
458
436
|
# will be based on the sentence_limit parameter or the full length of the
|
|
@@ -460,51 +438,86 @@ be relative"
|
|
|
460
438
|
# that the search query is visible somewhere in the sentence.
|
|
461
439
|
#
|
|
462
440
|
# @param query [Regexp, #to_s] The regex or text value to search the
|
|
463
|
-
# document's
|
|
441
|
+
# document's instance vars (Wgit::Model.search_fields) for.
|
|
464
442
|
# @param case_sensitive [Boolean] Whether character case must match.
|
|
465
443
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
|
466
444
|
# for separately.
|
|
467
445
|
# @param sentence_limit [Integer] The max length of each search result
|
|
468
446
|
# sentence.
|
|
469
|
-
# @
|
|
447
|
+
# @param search_fields [Hash<Symbol, Integer>] The Document instance vars
|
|
448
|
+
# to search and the weight for a match (used to determine relevance).
|
|
449
|
+
# This should only be set for custom one-off Document searches. For
|
|
450
|
+
# permanent changing of search fields, see Wgit::Model.set_search_fields.
|
|
451
|
+
# @yield [results_hash] Given the results_hash containing each search
|
|
452
|
+
# result (String) and its score (num_matches * weight).
|
|
453
|
+
# @return [Array<String>] A subset of this document's instance vars,
|
|
454
|
+
# matching the query for the search_fields: param.
|
|
470
455
|
def search(
|
|
471
|
-
query, case_sensitive: false, whole_sentence: true,
|
|
456
|
+
query, case_sensitive: false, whole_sentence: true,
|
|
457
|
+
sentence_limit: 80, search_fields: Wgit::Model.search_fields
|
|
472
458
|
)
|
|
473
459
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
|
460
|
+
assert_type(search_fields, Hash)
|
|
474
461
|
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
else # query.respond_to? :to_s == true
|
|
478
|
-
query = query.to_s
|
|
479
|
-
query = query.gsub(' ', '|') unless whole_sentence
|
|
480
|
-
regex = Regexp.new(query, !case_sensitive)
|
|
481
|
-
end
|
|
482
|
-
|
|
462
|
+
regex = Wgit::Utils.build_search_regex(
|
|
463
|
+
query, case_sensitive:, whole_sentence:)
|
|
483
464
|
results = {}
|
|
484
465
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
next
|
|
466
|
+
search_fields.each do |field, weight|
|
|
467
|
+
doc_field = instance_variable_get("@#{field}".to_sym)
|
|
468
|
+
next unless doc_field
|
|
488
469
|
|
|
489
|
-
|
|
490
|
-
|
|
470
|
+
Wgit::Utils.each(doc_field) do |text|
|
|
471
|
+
assert_type(text, String)
|
|
491
472
|
|
|
492
|
-
|
|
493
|
-
|
|
473
|
+
text = text.strip
|
|
474
|
+
matches = text.scan(regex).count
|
|
475
|
+
next unless matches.positive?
|
|
494
476
|
|
|
495
|
-
|
|
477
|
+
index = text.index(regex) # Index of first match.
|
|
478
|
+
Wgit::Utils.format_sentence_length(text, index, sentence_limit)
|
|
479
|
+
|
|
480
|
+
# For duplicate matching text, total the text score.
|
|
481
|
+
text_score = matches * weight
|
|
482
|
+
existing_score = results[text]
|
|
483
|
+
text_score += existing_score if existing_score
|
|
484
|
+
|
|
485
|
+
results[text] = text_score
|
|
486
|
+
end
|
|
496
487
|
end
|
|
497
488
|
|
|
498
489
|
return [] if results.empty?
|
|
499
490
|
|
|
500
|
-
|
|
501
|
-
|
|
491
|
+
yield results if block_given?
|
|
492
|
+
|
|
493
|
+
# Return only the matching text sentences, sorted by relevance.
|
|
494
|
+
Hash[results.sort_by { |_, score| -score }].keys
|
|
495
|
+
end
|
|
496
|
+
|
|
497
|
+
# Performs a text only search of the Document, instead of searching all
|
|
498
|
+
# search fields defined in Wgit::Model.search_fields.
|
|
499
|
+
#
|
|
500
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
|
501
|
+
# document's text for.
|
|
502
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
|
503
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
|
504
|
+
# for separately.
|
|
505
|
+
# @param sentence_limit [Integer] The max length of each search result
|
|
506
|
+
# sentence.
|
|
507
|
+
# @return [Array<String>] A subset of this document's text fields that
|
|
508
|
+
# match the query.
|
|
509
|
+
def search_text(
|
|
510
|
+
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
|
511
|
+
)
|
|
512
|
+
search(
|
|
513
|
+
query, case_sensitive:, whole_sentence:,
|
|
514
|
+
sentence_limit:, search_fields: { text: 1 })
|
|
502
515
|
end
|
|
503
516
|
|
|
504
|
-
# Performs a text search (see Document#
|
|
505
|
-
# results to the @text instance variable. This can be used
|
|
506
|
-
# functionality. The original text is returned; no other
|
|
507
|
-
# is kept thereafter.
|
|
517
|
+
# Performs a text only search (see Document#search_text for details) but
|
|
518
|
+
# assigns the results to the @text instance variable. This can be used
|
|
519
|
+
# for sub search functionality. The original text is returned; no other
|
|
520
|
+
# reference to it is kept thereafter.
|
|
508
521
|
#
|
|
509
522
|
# @param query [Regexp, #to_s] The regex or text value to search the
|
|
510
523
|
# document's @text for.
|
|
@@ -514,11 +527,11 @@ be relative"
|
|
|
514
527
|
# @param sentence_limit [Integer] The max length of each search result
|
|
515
528
|
# sentence.
|
|
516
529
|
# @return [String] This Document's original @text value.
|
|
517
|
-
def
|
|
530
|
+
def search_text!(
|
|
518
531
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
|
519
532
|
)
|
|
520
533
|
orig_text = @text
|
|
521
|
-
@text =
|
|
534
|
+
@text = search_text(query, case_sensitive:, whole_sentence:, sentence_limit:)
|
|
522
535
|
|
|
523
536
|
orig_text
|
|
524
537
|
end
|
|
@@ -544,14 +557,71 @@ be relative"
|
|
|
544
557
|
send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
|
|
545
558
|
end
|
|
546
559
|
|
|
547
|
-
#
|
|
548
|
-
#
|
|
549
|
-
# the default extractors are removed, this method will always return false.
|
|
560
|
+
# Attempts to extract and check the HTML meta tags instructing Wgit not to
|
|
561
|
+
# index this document (save it to a Database).
|
|
550
562
|
#
|
|
551
563
|
# @return [Boolean] True if this document shouldn't be saved to a Database,
|
|
552
564
|
# false otherwise.
|
|
553
565
|
def no_index?
|
|
554
|
-
|
|
566
|
+
meta_robots = extract_from_html(
|
|
567
|
+
'//meta[@name="robots"]/@content',
|
|
568
|
+
singleton: true,
|
|
569
|
+
text_content_only: true
|
|
570
|
+
)
|
|
571
|
+
meta_wgit = extract_from_html(
|
|
572
|
+
'//meta[@name="wgit"]/@content',
|
|
573
|
+
singleton: true,
|
|
574
|
+
text_content_only: true
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
[meta_robots, meta_wgit].include?('noindex')
|
|
578
|
+
end
|
|
579
|
+
|
|
580
|
+
# Firstly finds the target element whose text contains el_text.
|
|
581
|
+
# Then finds the preceding fragment element nearest to the target
|
|
582
|
+
# element and returns its href value (starting with #). The search is
|
|
583
|
+
# performed against the @html so Documents loaded from a DB will need to
|
|
584
|
+
# contain the 'html' field in the Wgit::Model. See the
|
|
585
|
+
# `Wgit::Model#include_doc_html` documentation for more info.
|
|
586
|
+
#
|
|
587
|
+
# @param el_text [String] The element text of the target element.
|
|
588
|
+
# @param el_type [String] The element type, defaulting to any type.
|
|
589
|
+
# @yield [results] Given the results of the xpath query. Return the target
|
|
590
|
+
# you want or nil to use the default (first) target in results.
|
|
591
|
+
# @return [String, nil] nil if no nearest fragment or the nearest
|
|
592
|
+
# fragment's href e.g. '#about'.
|
|
593
|
+
# @raise [StandardError] Raises if no matching target element containing
|
|
594
|
+
# el_text can be found or if @html is empty.
|
|
595
|
+
def nearest_fragment(el_text, el_type = "*")
|
|
596
|
+
raise "The @html is empty" if @html.empty?
|
|
597
|
+
|
|
598
|
+
xpath_query = "//#{el_type}[text()[contains(.,\"#{el_text}\")]]"
|
|
599
|
+
results = xpath(xpath_query)
|
|
600
|
+
return nil if results.empty?
|
|
601
|
+
|
|
602
|
+
target = results.first
|
|
603
|
+
if block_given?
|
|
604
|
+
result = yield(results)
|
|
605
|
+
target = result if result
|
|
606
|
+
end
|
|
607
|
+
|
|
608
|
+
target_index = html_index(target)
|
|
609
|
+
raise 'Failed to find target index' unless target_index
|
|
610
|
+
|
|
611
|
+
fragment_h = fragment_indices(fragments)
|
|
612
|
+
|
|
613
|
+
# Return the target href if the target is itself a fragment.
|
|
614
|
+
return fragment_h[target_index] if fragment_h.keys.include?(target_index)
|
|
615
|
+
|
|
616
|
+
# Find the target's nearest preceding fragment href.
|
|
617
|
+
closest_index = 0
|
|
618
|
+
fragment_h.each do |fragment_index, href|
|
|
619
|
+
if fragment_index.between?(closest_index, target_index)
|
|
620
|
+
closest_index = fragment_index
|
|
621
|
+
end
|
|
622
|
+
end
|
|
623
|
+
|
|
624
|
+
fragment_h[closest_index]
|
|
555
625
|
end
|
|
556
626
|
|
|
557
627
|
protected
|
|
@@ -573,7 +643,8 @@ be relative"
|
|
|
573
643
|
# Extracts a value/object from this Document's @html using the given xpath
|
|
574
644
|
# parameter.
|
|
575
645
|
#
|
|
576
|
-
# @param xpath [String, #call] Used to find the value/object in @html.
|
|
646
|
+
# @param xpath [String, #call, nil] Used to find the value/object in @html.
|
|
647
|
+
# Passing nil will skip the HTML extraction which isn't always needed.
|
|
577
648
|
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
|
578
649
|
# results (Enumerable).
|
|
579
650
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
|
@@ -588,8 +659,12 @@ be relative"
|
|
|
588
659
|
# @return [String, Object] The value found in the html or the default value
|
|
589
660
|
# (singleton ? nil : []).
|
|
590
661
|
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
|
591
|
-
|
|
592
|
-
|
|
662
|
+
result = nil
|
|
663
|
+
|
|
664
|
+
if xpath
|
|
665
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
|
666
|
+
result = singleton ? at_xpath(xpath) : xpath(xpath)
|
|
667
|
+
end
|
|
593
668
|
|
|
594
669
|
if result && text_content_only
|
|
595
670
|
result = singleton ? result.content : result.map(&:content)
|
|
@@ -647,7 +722,8 @@ be relative"
|
|
|
647
722
|
# Dynamically run the init_*_from_html methods.
|
|
648
723
|
Document.private_instance_methods(false).each do |method|
|
|
649
724
|
if method.to_s.start_with?('init_') &&
|
|
650
|
-
method.to_s.end_with?('_from_html') &&
|
|
725
|
+
method.to_s.end_with?('_from_html') &&
|
|
726
|
+
method != __method__
|
|
651
727
|
send(method)
|
|
652
728
|
end
|
|
653
729
|
end
|
|
@@ -658,12 +734,14 @@ be relative"
|
|
|
658
734
|
def init_from_object(obj, encode: true)
|
|
659
735
|
assert_respond_to(obj, :fetch)
|
|
660
736
|
|
|
661
|
-
|
|
737
|
+
url = obj.fetch('url') # Should always be present.
|
|
738
|
+
raise "Missing 'url' field in doc object" unless url
|
|
739
|
+
|
|
740
|
+
@url = Wgit::Url.new(url)
|
|
662
741
|
@html = obj.fetch('html', '')
|
|
663
742
|
@parser = init_nokogiri
|
|
664
743
|
@score = obj.fetch('score', 0.0)
|
|
665
|
-
|
|
666
|
-
@html = Wgit::Utils.sanitize(@html, encode:)
|
|
744
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
|
667
745
|
|
|
668
746
|
# Dynamically run the init_*_from_object methods.
|
|
669
747
|
Document.private_instance_methods(false).each do |method|
|
|
@@ -691,6 +769,38 @@ be relative"
|
|
|
691
769
|
var_name
|
|
692
770
|
end
|
|
693
771
|
|
|
772
|
+
# Returns all <a> fragment elements from within the HTML body e.g. #about.
|
|
773
|
+
def fragments
|
|
774
|
+
anchors = xpath("/html/body//a")
|
|
775
|
+
|
|
776
|
+
anchors.select do |anchor|
|
|
777
|
+
href = anchor.attributes['href']&.value
|
|
778
|
+
href&.start_with?('#')
|
|
779
|
+
end
|
|
780
|
+
end
|
|
781
|
+
|
|
782
|
+
# Returns a Hash{Int=>String} of <a> fragment positions and their href
|
|
783
|
+
# values. Only fragment anchors are returned e.g. <a> elements with a
|
|
784
|
+
# href starting with '#'.
|
|
785
|
+
def fragment_indices(fragments)
|
|
786
|
+
fragments.reduce({}) do |hash, fragment|
|
|
787
|
+
index = html_index(fragment)
|
|
788
|
+
next hash unless index
|
|
789
|
+
|
|
790
|
+
href = fragment.attributes['href']&.value
|
|
791
|
+
hash[index] = href
|
|
792
|
+
|
|
793
|
+
hash
|
|
794
|
+
end
|
|
795
|
+
end
|
|
796
|
+
|
|
797
|
+
# Takes a Nokogiri element or HTML substring and returns its index in
|
|
798
|
+
# the html. Returns the index/position Int or nil if not found. The search
|
|
799
|
+
# is case insensitive because Nokogiri lower cases camelCase attributes.
|
|
800
|
+
def html_index(el_or_str)
|
|
801
|
+
@html.downcase.index(el_or_str.to_s.strip.downcase)
|
|
802
|
+
end
|
|
803
|
+
|
|
694
804
|
alias_method :content, :html
|
|
695
805
|
alias_method :statistics, :stats
|
|
696
806
|
alias_method :internal_urls, :internal_links
|
|
@@ -2,24 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
### Default Document Extractors ###
|
|
4
4
|
|
|
5
|
-
# No index.
|
|
6
|
-
Wgit::Document.define_extractor(
|
|
7
|
-
:meta_robots,
|
|
8
|
-
'//meta[@name="robots"]/@content',
|
|
9
|
-
singleton: true,
|
|
10
|
-
text_content_only: true
|
|
11
|
-
)
|
|
12
|
-
Wgit::Document.define_extractor(
|
|
13
|
-
:meta_wgit,
|
|
14
|
-
'//meta[@name="wgit"]/@content',
|
|
15
|
-
singleton: true,
|
|
16
|
-
text_content_only: true
|
|
17
|
-
)
|
|
18
|
-
|
|
19
5
|
# Base.
|
|
20
6
|
Wgit::Document.define_extractor(
|
|
21
7
|
:base,
|
|
22
|
-
|
|
8
|
+
"//base/@href",
|
|
23
9
|
singleton: true,
|
|
24
10
|
text_content_only: true
|
|
25
11
|
) do |base|
|
|
@@ -29,7 +15,7 @@ end
|
|
|
29
15
|
# Title.
|
|
30
16
|
Wgit::Document.define_extractor(
|
|
31
17
|
:title,
|
|
32
|
-
|
|
18
|
+
"//title",
|
|
33
19
|
singleton: true,
|
|
34
20
|
text_content_only: true
|
|
35
21
|
)
|
|
@@ -57,17 +43,18 @@ Wgit::Document.define_extractor(
|
|
|
57
43
|
singleton: true,
|
|
58
44
|
text_content_only: true
|
|
59
45
|
) do |keywords, _source, type|
|
|
60
|
-
if keywords &&
|
|
61
|
-
keywords = keywords.split(
|
|
46
|
+
if keywords && type == :document
|
|
47
|
+
keywords = keywords.split(",")
|
|
62
48
|
keywords = Wgit::Utils.sanitize(keywords)
|
|
63
49
|
end
|
|
50
|
+
|
|
64
51
|
keywords
|
|
65
52
|
end
|
|
66
53
|
|
|
67
54
|
# Links.
|
|
68
55
|
Wgit::Document.define_extractor(
|
|
69
56
|
:links,
|
|
70
|
-
|
|
57
|
+
"//a/@href",
|
|
71
58
|
singleton: false,
|
|
72
59
|
text_content_only: true
|
|
73
60
|
) do |links|
|
|
@@ -79,7 +66,12 @@ end
|
|
|
79
66
|
# Text.
|
|
80
67
|
Wgit::Document.define_extractor(
|
|
81
68
|
:text,
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
)
|
|
69
|
+
nil # doc.parser contains all HTML so omit the xpath search.
|
|
70
|
+
) do |text, doc, type|
|
|
71
|
+
if type == :document
|
|
72
|
+
html_to_text = Wgit::HTMLToText.new(doc.parser)
|
|
73
|
+
text = html_to_text.extract
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
text
|
|
77
|
+
end
|