wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
@@ -0,0 +1,147 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "../assertable"
|
4
|
+
require_relative "../url"
|
5
|
+
require_relative "../document"
|
6
|
+
require_relative "../model"
|
7
|
+
|
8
|
+
module Wgit::Database
|
9
|
+
# The parent DatabaseAdapter class that should be inherited from when
|
10
|
+
# creating an underlying Database adapter implementation class e.g.
|
11
|
+
# Wgit::Database::MongoDB.
|
12
|
+
#
|
13
|
+
# Listed in this class are the methods that an implementer class must
|
14
|
+
# implement to work with Wgit. Failure to do so will result in a
|
15
|
+
# NotImplementedError being raised.
|
16
|
+
#
|
17
|
+
# While not required, implementing the method `#search_fields=(fields)` in an
|
18
|
+
# adapter class will allow `Wgit::Model.set_search_fields` to call
|
19
|
+
# it. This allows the search fields to be set in one method call, from within
|
20
|
+
# the Wgit::Model class. See this method's docs for more info.
|
21
|
+
#
|
22
|
+
# Also listed in this class are common helper methods available to all
|
23
|
+
# Database implementer subclasses.
|
24
|
+
class DatabaseAdapter
|
25
|
+
include Wgit::Assertable
|
26
|
+
|
27
|
+
# The NotImplementedError message that gets raised if an implementor class
|
28
|
+
# doesn't implement a method required by Wgit.
|
29
|
+
NOT_IMPL_ERR = "The DatabaseAdapter class you're using hasn't \
|
30
|
+
implemented this method"
|
31
|
+
|
32
|
+
###################### START OF INTERFACE METHODS ######################
|
33
|
+
|
34
|
+
# Initializes a DatabaseAdapter instance.
|
35
|
+
#
|
36
|
+
# The implementor class should establish a DB connection here using the
|
37
|
+
# given connection_string, falling back to `ENV['WGIT_CONNECTION_STRING']`.
|
38
|
+
# Don't forget to call `super`.
|
39
|
+
#
|
40
|
+
# @param connection_string [String] The connection string needed to connect
|
41
|
+
# to the database.
|
42
|
+
# @raise [StandardError] If a connection string isn't provided, either as a
|
43
|
+
# parameter or via the environment.
|
44
|
+
def initialize(connection_string = nil); end
|
45
|
+
|
46
|
+
# Returns the current size of the database.
|
47
|
+
#
|
48
|
+
# @return [Integer] The current size of the DB.
|
49
|
+
def size
|
50
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
51
|
+
end
|
52
|
+
|
53
|
+
# Searches the database's Documents for the given query. The
|
54
|
+
# `Wgit::Model.search_fields` should be searched for matches
|
55
|
+
# against the given query. Documents should be sorted starting with the
|
56
|
+
# most relevant. Each returned Document should have its `score` field set
|
57
|
+
# for relevance.
|
58
|
+
#
|
59
|
+
# @param query [String] The text query to search with.
|
60
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
61
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
62
|
+
# for separately.
|
63
|
+
# @param limit [Integer] The max number of results to return.
|
64
|
+
# @param skip [Integer] The number of results to skip.
|
65
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
66
|
+
# DB.
|
67
|
+
# @return [Array<Wgit::Document>] The search results obtained from the DB.
|
68
|
+
def search(
|
69
|
+
query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
|
70
|
+
)
|
71
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
72
|
+
end
|
73
|
+
|
74
|
+
# Deletes everything in the urls and documents collections.
|
75
|
+
#
|
76
|
+
# @return [Integer] The number of deleted records.
|
77
|
+
def empty
|
78
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
79
|
+
end
|
80
|
+
|
81
|
+
# Returns Url records that haven't yet been crawled.
|
82
|
+
#
|
83
|
+
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
84
|
+
# @param skip [Integer] Skip n amount of Url's.
|
85
|
+
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
|
86
|
+
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
|
87
|
+
def uncrawled_urls(limit: 0, skip: 0)
|
88
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
89
|
+
end
|
90
|
+
|
91
|
+
# Inserts or updates the object in the database.
|
92
|
+
#
|
93
|
+
# @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
|
94
|
+
# @return [Boolean] True if inserted, false if updated.
|
95
|
+
def upsert(obj)
|
96
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
97
|
+
end
|
98
|
+
|
99
|
+
# Bulk upserts the objects in the database collection.
|
100
|
+
# You cannot mix collection objs types, all must be Urls or Documents.
|
101
|
+
#
|
102
|
+
# @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
|
103
|
+
# inserted/updated.
|
104
|
+
# @return [Integer] The total number of newly inserted objects.
|
105
|
+
def bulk_upsert(objs)
|
106
|
+
raise NotImplementedError, NOT_IMPL_ERR
|
107
|
+
end
|
108
|
+
|
109
|
+
###################### END OF INTERFACE METHODS ######################
|
110
|
+
|
111
|
+
private
|
112
|
+
|
113
|
+
# Returns the correct Wgit::Model for the given obj type.
|
114
|
+
#
|
115
|
+
# @param obj [Wgit::Url, Wgit::Document] The obj to obtain a model for.
|
116
|
+
# @return [Hash] The obj model.
|
117
|
+
def build_model(obj)
|
118
|
+
assert_type(obj, [Wgit::Url, Wgit::Document])
|
119
|
+
|
120
|
+
if obj.is_a?(Wgit::Url)
|
121
|
+
Wgit::Model.url(obj)
|
122
|
+
else
|
123
|
+
Wgit::Model.document(obj)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
# Map each DB hash object into a Wgit::Document. Each Document is yielded
|
128
|
+
# if a block is given before returning the mapped Array of Documents.
|
129
|
+
def map_documents(doc_hashes)
|
130
|
+
doc_hashes.map do |doc|
|
131
|
+
doc = Wgit::Document.new(doc)
|
132
|
+
yield(doc) if block_given?
|
133
|
+
doc
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
# Map each DB hash object into a Wgit::Url. Each Url is yielded
|
138
|
+
# if a block is given before returning the mapped Array of Urls.
|
139
|
+
def map_urls(url_hashes)
|
140
|
+
url_hashes.map do |url|
|
141
|
+
url = Wgit::Url.new(url)
|
142
|
+
yield(url) if block_given?
|
143
|
+
url
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
data/lib/wgit/document.rb
CHANGED
@@ -3,12 +3,12 @@ require_relative 'utils'
|
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'json'
|
6
|
-
require 'set'
|
7
6
|
|
8
7
|
module Wgit
|
9
8
|
# Class modeling/serialising a HTML web document, although other MIME types
|
10
9
|
# will work e.g. images etc. Also doubles as a search result when
|
11
|
-
# loading Documents from the database via
|
10
|
+
# loading Documents from the database via
|
11
|
+
# `Wgit::Database::DatabaseAdapter#search`.
|
12
12
|
#
|
13
13
|
# The initialize method dynamically initializes instance variables from the
|
14
14
|
# Document HTML / Database object e.g. text. This bit is dynamic so that the
|
@@ -18,25 +18,23 @@ module Wgit
|
|
18
18
|
include Assertable
|
19
19
|
|
20
20
|
# Regex for the allowed var names when defining an extractor.
|
21
|
-
REGEX_EXTRACTOR_NAME = /[a-z0-9_]
|
21
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
|
22
22
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
option output p pre q rb rt ruby s samp section small span strong sub
|
29
|
-
summary sup td textarea th time u ul var wbr
|
30
|
-
])
|
23
|
+
# Instance vars to be ignored by Document#to_h and in turn
|
24
|
+
# Wgit::Model.document.
|
25
|
+
@to_h_ignore_vars = [
|
26
|
+
'@parser' # Always ignore the Nokogiri object.
|
27
|
+
]
|
31
28
|
|
32
29
|
# Set of Symbols representing the defined Document extractors.
|
33
30
|
@extractors = Set.new
|
34
31
|
|
35
32
|
class << self
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
33
|
+
# Array of instance vars to ignore when Document#to_h and (in turn)
|
34
|
+
# Wgit::Model.document methods are called. Append your own defined extractor
|
35
|
+
# vars to omit them from the model (database object) when indexing.
|
36
|
+
# Each var should be a String starting with an '@' char e.g. "@data" etc.
|
37
|
+
attr_reader :to_h_ignore_vars
|
40
38
|
|
41
39
|
# Set of Symbols representing the defined Document extractors. Is
|
42
40
|
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
@@ -52,7 +50,7 @@ module Wgit
|
|
52
50
|
# The Nokogiri::HTML document object initialized from @html.
|
53
51
|
attr_reader :parser
|
54
52
|
|
55
|
-
# The score is
|
53
|
+
# The score is set/used following a `Database#search` and records matches.
|
56
54
|
attr_reader :score
|
57
55
|
|
58
56
|
# Initialize takes either two strings (representing the URL and HTML) or an
|
@@ -76,25 +74,14 @@ module Wgit
|
|
76
74
|
# false if the Document content is an image etc.
|
77
75
|
def initialize(url_or_obj, html = '', encode: true)
|
78
76
|
if url_or_obj.is_a?(String)
|
79
|
-
init_from_strings(url_or_obj, html, encode:
|
77
|
+
init_from_strings(url_or_obj, html, encode:)
|
80
78
|
else
|
81
|
-
init_from_object(url_or_obj, encode:
|
79
|
+
init_from_object(url_or_obj, encode:)
|
82
80
|
end
|
83
81
|
end
|
84
82
|
|
85
83
|
### Document Class Methods ###
|
86
84
|
|
87
|
-
# Uses Document.text_elements to build an xpath String, used to obtain
|
88
|
-
# all of the combined visual text on a webpage.
|
89
|
-
#
|
90
|
-
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
|
-
def self.text_elements_xpath
|
92
|
-
Wgit::Document.text_elements.each_with_index.reduce('') do |xpath, (el, i)|
|
93
|
-
xpath += ' | ' unless i.zero?
|
94
|
-
xpath += format('//%s/text()', el)
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
85
|
# Defines a content extractor, which extracts HTML elements/content
|
99
86
|
# into instance variables upon Document initialization. See the default
|
100
87
|
# extractors defined in 'document_extractors.rb' as examples. Defining an
|
@@ -118,8 +105,9 @@ module Wgit
|
|
118
105
|
# @param var [Symbol] The name of the variable to be initialised, that will
|
119
106
|
# contain the extracted content. A getter and setter method is defined
|
120
107
|
# for the initialised variable.
|
121
|
-
# @param xpath [String, #call] The xpath used to find the element(s)
|
122
|
-
# of the webpage. Only used when initializing from HTML.
|
108
|
+
# @param xpath [String, #call, nil] The xpath used to find the element(s)
|
109
|
+
# of the webpage. Only used when initializing from HTML. Passing nil will
|
110
|
+
# skip the HTML extraction, which sometimes isn't required.
|
123
111
|
#
|
124
112
|
# Pass a callable object (proc etc.) if you want the
|
125
113
|
# xpath value to be derived on Document initialisation (instead of when
|
@@ -210,7 +198,7 @@ module Wgit
|
|
210
198
|
#
|
211
199
|
# @return [String] A short textual representation of this Document.
|
212
200
|
def inspect
|
213
|
-
"#<Wgit::Document url=\"#{@url}\"
|
201
|
+
"#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
|
214
202
|
end
|
215
203
|
|
216
204
|
# Determines if both the url and html match. Use
|
@@ -241,10 +229,10 @@ module Wgit
|
|
241
229
|
# Provide the `link:` parameter to get the correct base URL for that type
|
242
230
|
# of link. For example, a link of `#top` would always return @url because
|
243
231
|
# it applies to that page, not a different one. Query strings work in the
|
244
|
-
# same way. Use this parameter if manually
|
232
|
+
# same way. Use this parameter if manually joining Url's e.g.
|
245
233
|
#
|
246
234
|
# relative_link = Wgit::Url.new('?q=hello')
|
247
|
-
# absolute_link = doc.base_url(link: relative_link).
|
235
|
+
# absolute_link = doc.base_url(link: relative_link).join(relative_link)
|
248
236
|
#
|
249
237
|
# This is similar to how Wgit::Document#internal_absolute_links works.
|
250
238
|
#
|
@@ -264,7 +252,7 @@ module Wgit
|
|
264
252
|
be relative"
|
265
253
|
end
|
266
254
|
|
267
|
-
get_base = -> { @base.relative? ? @url.to_origin.
|
255
|
+
get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
|
268
256
|
|
269
257
|
if link
|
270
258
|
link = Wgit::Url.new(link)
|
@@ -288,11 +276,11 @@ be relative"
|
|
288
276
|
# returned Hash.
|
289
277
|
# @return [Hash] Containing self's instance vars.
|
290
278
|
def to_h(include_html: false, include_score: true)
|
291
|
-
ignore =
|
279
|
+
ignore = Wgit::Document.to_h_ignore_vars.dup
|
280
|
+
ignore << '@html' unless include_html
|
292
281
|
ignore << '@score' unless include_score
|
293
|
-
ignore << '@parser' # Always ignore the Nokogiri object.
|
294
282
|
|
295
|
-
Wgit::Utils.to_h(self, ignore:
|
283
|
+
Wgit::Utils.to_h(self, ignore:)
|
296
284
|
end
|
297
285
|
|
298
286
|
# Converts this Document's #to_h return value to a JSON String.
|
@@ -301,7 +289,7 @@ be relative"
|
|
301
289
|
# returned JSON String.
|
302
290
|
# @return [String] This Document represented as a JSON String.
|
303
291
|
def to_json(include_html: false)
|
304
|
-
h = to_h(include_html:
|
292
|
+
h = to_h(include_html:)
|
305
293
|
JSON.generate(h)
|
306
294
|
end
|
307
295
|
|
@@ -323,7 +311,7 @@ be relative"
|
|
323
311
|
else
|
324
312
|
next unless instance_variable_get(var).respond_to?(:length)
|
325
313
|
|
326
|
-
hash[var[1
|
314
|
+
hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
|
327
315
|
end
|
328
316
|
end
|
329
317
|
|
@@ -431,17 +419,18 @@ be relative"
|
|
431
419
|
end
|
432
420
|
end
|
433
421
|
.reject { |link| link.relative?(host: @url.to_origin) }
|
434
|
-
.map(&:omit_trailing_slash)
|
435
422
|
|
436
423
|
Wgit::Utils.sanitize(links)
|
437
424
|
end
|
438
425
|
|
439
|
-
# Searches the
|
426
|
+
# Searches the Document's instance vars for the given query and returns
|
427
|
+
# the results. The `Wgit::Model.search_fields` denote the vars to be
|
428
|
+
# searched, unless overridden using the search_fields: param.
|
440
429
|
#
|
441
|
-
# The number of
|
430
|
+
# The number of matches for each search field is recorded internally
|
442
431
|
# and used to rank/sort the search results before being returned. Where
|
443
|
-
# the Wgit::Database#search method
|
444
|
-
# this method searches each
|
432
|
+
# the Wgit::Database::DatabaseAdapter#search method searches all documents
|
433
|
+
# for matches, this method searches each individual Document for matches.
|
445
434
|
#
|
446
435
|
# Each search result comprises of a sentence of a given length. The length
|
447
436
|
# will be based on the sentence_limit parameter or the full length of the
|
@@ -449,51 +438,86 @@ be relative"
|
|
449
438
|
# that the search query is visible somewhere in the sentence.
|
450
439
|
#
|
451
440
|
# @param query [Regexp, #to_s] The regex or text value to search the
|
452
|
-
# document's
|
441
|
+
# document's instance vars (Wgit::Model.search_fields) for.
|
453
442
|
# @param case_sensitive [Boolean] Whether character case must match.
|
454
443
|
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
455
444
|
# for separately.
|
456
445
|
# @param sentence_limit [Integer] The max length of each search result
|
457
446
|
# sentence.
|
458
|
-
# @
|
447
|
+
# @param search_fields [Hash<Symbol, Integer>] The Document instance vars
|
448
|
+
# to search and the weight for a match (used to determine relevance).
|
449
|
+
# This should only be set for custom one-off Document searches. For
|
450
|
+
# permanent changing of search fields, see Wgit::Model.set_search_fields.
|
451
|
+
# @yield [results_hash] Given the results_hash containing each search
|
452
|
+
# result (String) and its score (num_matches * weight).
|
453
|
+
# @return [Array<String>] A subset of this document's instance vars,
|
454
|
+
# matching the query for the search_fields: param.
|
459
455
|
def search(
|
460
|
-
query, case_sensitive: false, whole_sentence: true,
|
456
|
+
query, case_sensitive: false, whole_sentence: true,
|
457
|
+
sentence_limit: 80, search_fields: Wgit::Model.search_fields
|
461
458
|
)
|
462
459
|
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
460
|
+
assert_type(search_fields, Hash)
|
463
461
|
|
464
|
-
|
465
|
-
|
466
|
-
else # query.respond_to? :to_s == true
|
467
|
-
query = query.to_s
|
468
|
-
query = query.gsub(' ', '|') unless whole_sentence
|
469
|
-
regex = Regexp.new(query, !case_sensitive)
|
470
|
-
end
|
471
|
-
|
462
|
+
regex = Wgit::Utils.build_search_regex(
|
463
|
+
query, case_sensitive:, whole_sentence:)
|
472
464
|
results = {}
|
473
465
|
|
474
|
-
|
475
|
-
|
476
|
-
next
|
466
|
+
search_fields.each do |field, weight|
|
467
|
+
doc_field = instance_variable_get("@#{field}".to_sym)
|
468
|
+
next unless doc_field
|
469
|
+
|
470
|
+
Wgit::Utils.each(doc_field) do |text|
|
471
|
+
assert_type(text, String)
|
477
472
|
|
478
|
-
|
479
|
-
|
473
|
+
text = text.strip
|
474
|
+
matches = text.scan(regex).count
|
475
|
+
next unless matches.positive?
|
480
476
|
|
481
|
-
|
482
|
-
|
477
|
+
index = text.index(regex) # Index of first match.
|
478
|
+
Wgit::Utils.format_sentence_length(text, index, sentence_limit)
|
483
479
|
|
484
|
-
|
480
|
+
# For duplicate matching text, total the text score.
|
481
|
+
text_score = matches * weight
|
482
|
+
existing_score = results[text]
|
483
|
+
text_score += existing_score if existing_score
|
484
|
+
|
485
|
+
results[text] = text_score
|
486
|
+
end
|
485
487
|
end
|
486
488
|
|
487
489
|
return [] if results.empty?
|
488
490
|
|
489
|
-
|
490
|
-
|
491
|
+
yield results if block_given?
|
492
|
+
|
493
|
+
# Return only the matching text sentences, sorted by relevance.
|
494
|
+
Hash[results.sort_by { |_, score| -score }].keys
|
495
|
+
end
|
496
|
+
|
497
|
+
# Performs a text only search of the Document, instead of searching all
|
498
|
+
# search fields defined in Wgit::Model.search_fields.
|
499
|
+
#
|
500
|
+
# @param query [Regexp, #to_s] The regex or text value to search the
|
501
|
+
# document's text for.
|
502
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
503
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
504
|
+
# for separately.
|
505
|
+
# @param sentence_limit [Integer] The max length of each search result
|
506
|
+
# sentence.
|
507
|
+
# @return [Array<String>] A subset of this document's text fields that
|
508
|
+
# match the query.
|
509
|
+
def search_text(
|
510
|
+
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
511
|
+
)
|
512
|
+
search(
|
513
|
+
query, case_sensitive:, whole_sentence:,
|
514
|
+
sentence_limit:, search_fields: { text: 1 })
|
491
515
|
end
|
492
516
|
|
493
|
-
# Performs a text search (see Document#
|
494
|
-
# results to the @text instance variable. This can be used
|
495
|
-
# functionality. The original text is returned; no other
|
496
|
-
# is kept thereafter.
|
517
|
+
# Performs a text only search (see Document#search_text for details) but
|
518
|
+
# assigns the results to the @text instance variable. This can be used
|
519
|
+
# for sub search functionality. The original text is returned; no other
|
520
|
+
# reference to it is kept thereafter.
|
497
521
|
#
|
498
522
|
# @param query [Regexp, #to_s] The regex or text value to search the
|
499
523
|
# document's @text for.
|
@@ -503,14 +527,11 @@ be relative"
|
|
503
527
|
# @param sentence_limit [Integer] The max length of each search result
|
504
528
|
# sentence.
|
505
529
|
# @return [String] This Document's original @text value.
|
506
|
-
def
|
530
|
+
def search_text!(
|
507
531
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
508
532
|
)
|
509
533
|
orig_text = @text
|
510
|
-
@text =
|
511
|
-
query, case_sensitive: case_sensitive,
|
512
|
-
whole_sentence: whole_sentence, sentence_limit: sentence_limit
|
513
|
-
)
|
534
|
+
@text = search_text(query, case_sensitive:, whole_sentence:, sentence_limit:)
|
514
535
|
|
515
536
|
orig_text
|
516
537
|
end
|
@@ -533,11 +554,74 @@ be relative"
|
|
533
554
|
# @return [String, Object] The value found in the html or the default value
|
534
555
|
# (singleton ? nil : []).
|
535
556
|
def extract(xpath, singleton: true, text_content_only: true, &block)
|
536
|
-
send(
|
537
|
-
|
538
|
-
|
539
|
-
|
557
|
+
send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
|
558
|
+
end
|
559
|
+
|
560
|
+
# Attempts to extract and check the HTML meta tags instructing Wgit not to
|
561
|
+
# index this document (save it to a Database).
|
562
|
+
#
|
563
|
+
# @return [Boolean] True if this document shouldn't be saved to a Database,
|
564
|
+
# false otherwise.
|
565
|
+
def no_index?
|
566
|
+
meta_robots = extract_from_html(
|
567
|
+
'//meta[@name="robots"]/@content',
|
568
|
+
singleton: true,
|
569
|
+
text_content_only: true
|
570
|
+
)
|
571
|
+
meta_wgit = extract_from_html(
|
572
|
+
'//meta[@name="wgit"]/@content',
|
573
|
+
singleton: true,
|
574
|
+
text_content_only: true
|
540
575
|
)
|
576
|
+
|
577
|
+
[meta_robots, meta_wgit].include?('noindex')
|
578
|
+
end
|
579
|
+
|
580
|
+
# Firstly finds the target element whose text contains el_text.
|
581
|
+
# Then finds the preceding fragment element nearest to the target
|
582
|
+
# element and returns its href value (starting with #). The search is
|
583
|
+
# performed against the @html so Documents loaded from a DB will need to
|
584
|
+
# contain the 'html' field in the Wgit::Model. See the
|
585
|
+
# `Wgit::Model#include_doc_html` documentation for more info.
|
586
|
+
#
|
587
|
+
# @param el_text [String] The element text of the target element.
|
588
|
+
# @param el_type [String] The element type, defaulting to any type.
|
589
|
+
# @yield [results] Given the results of the xpath query. Return the target
|
590
|
+
# you want or nil to use the default (first) target in results.
|
591
|
+
# @return [String, nil] nil if no nearest fragment or the nearest
|
592
|
+
# fragment's href e.g. '#about'.
|
593
|
+
# @raise [StandardError] Raises if no matching target element containing
|
594
|
+
# el_text can be found or if @html is empty.
|
595
|
+
def nearest_fragment(el_text, el_type = "*")
|
596
|
+
raise "The @html is empty" if @html.empty?
|
597
|
+
|
598
|
+
xpath_query = "//#{el_type}[text()[contains(.,\"#{el_text}\")]]"
|
599
|
+
results = xpath(xpath_query)
|
600
|
+
return nil if results.empty?
|
601
|
+
|
602
|
+
target = results.first
|
603
|
+
if block_given?
|
604
|
+
result = yield(results)
|
605
|
+
target = result if result
|
606
|
+
end
|
607
|
+
|
608
|
+
target_index = html_index(target)
|
609
|
+
raise 'Failed to find target index' unless target_index
|
610
|
+
|
611
|
+
fragment_h = fragment_indices(fragments)
|
612
|
+
|
613
|
+
# Return the target href if the target is itself a fragment.
|
614
|
+
return fragment_h[target_index] if fragment_h.keys.include?(target_index)
|
615
|
+
|
616
|
+
# Find the target's nearest preceding fragment href.
|
617
|
+
closest_index = 0
|
618
|
+
fragment_h.each do |fragment_index, href|
|
619
|
+
if fragment_index.between?(closest_index, target_index)
|
620
|
+
closest_index = fragment_index
|
621
|
+
end
|
622
|
+
end
|
623
|
+
|
624
|
+
fragment_h[closest_index]
|
541
625
|
end
|
542
626
|
|
543
627
|
protected
|
@@ -559,7 +643,8 @@ be relative"
|
|
559
643
|
# Extracts a value/object from this Document's @html using the given xpath
|
560
644
|
# parameter.
|
561
645
|
#
|
562
|
-
# @param xpath [String, #call] Used to find the value/object in @html.
|
646
|
+
# @param xpath [String, #call, nil] Used to find the value/object in @html.
|
647
|
+
# Passing nil will skip the HTML extraction which isn't always needed.
|
563
648
|
# @param singleton [Boolean] singleton ? results.first (single Object) :
|
564
649
|
# results (Enumerable).
|
565
650
|
# @param text_content_only [Boolean] text_content_only ? result.content
|
@@ -574,14 +659,18 @@ be relative"
|
|
574
659
|
# @return [String, Object] The value found in the html or the default value
|
575
660
|
# (singleton ? nil : []).
|
576
661
|
def extract_from_html(xpath, singleton: true, text_content_only: true)
|
577
|
-
|
578
|
-
|
662
|
+
result = nil
|
663
|
+
|
664
|
+
if xpath
|
665
|
+
xpath = xpath.call if xpath.respond_to?(:call)
|
666
|
+
result = singleton ? at_xpath(xpath) : xpath(xpath)
|
667
|
+
end
|
579
668
|
|
580
669
|
if result && text_content_only
|
581
670
|
result = singleton ? result.content : result.map(&:content)
|
582
671
|
end
|
583
672
|
|
584
|
-
Wgit::Utils.sanitize(result)
|
673
|
+
result = Wgit::Utils.sanitize(result)
|
585
674
|
result = yield(result, self, :document) if block_given?
|
586
675
|
result
|
587
676
|
end
|
@@ -608,7 +697,7 @@ be relative"
|
|
608
697
|
default = singleton ? nil : []
|
609
698
|
result = obj.fetch(key.to_s, default)
|
610
699
|
|
611
|
-
Wgit::Utils.sanitize(result)
|
700
|
+
result = Wgit::Utils.sanitize(result)
|
612
701
|
result = yield(result, obj, :object) if block_given?
|
613
702
|
result
|
614
703
|
end
|
@@ -628,13 +717,14 @@ be relative"
|
|
628
717
|
@parser = init_nokogiri
|
629
718
|
@score = 0.0
|
630
719
|
|
631
|
-
Wgit::Utils.sanitize(@html, encode:
|
720
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
632
721
|
|
633
722
|
# Dynamically run the init_*_from_html methods.
|
634
723
|
Document.private_instance_methods(false).each do |method|
|
635
724
|
if method.to_s.start_with?('init_') &&
|
636
|
-
method.to_s.end_with?('_from_html')
|
637
|
-
|
725
|
+
method.to_s.end_with?('_from_html') &&
|
726
|
+
method != __method__
|
727
|
+
send(method)
|
638
728
|
end
|
639
729
|
end
|
640
730
|
end
|
@@ -644,18 +734,20 @@ be relative"
|
|
644
734
|
def init_from_object(obj, encode: true)
|
645
735
|
assert_respond_to(obj, :fetch)
|
646
736
|
|
647
|
-
|
737
|
+
url = obj.fetch('url') # Should always be present.
|
738
|
+
raise "Missing 'url' field in doc object" unless url
|
739
|
+
|
740
|
+
@url = Wgit::Url.new(url)
|
648
741
|
@html = obj.fetch('html', '')
|
649
742
|
@parser = init_nokogiri
|
650
743
|
@score = obj.fetch('score', 0.0)
|
651
|
-
|
652
|
-
Wgit::Utils.sanitize(@html, encode: encode)
|
744
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
653
745
|
|
654
746
|
# Dynamically run the init_*_from_object methods.
|
655
747
|
Document.private_instance_methods(false).each do |method|
|
656
748
|
if method.to_s.start_with?('init_') &&
|
657
|
-
method.to_s.end_with?('_from_object')
|
658
|
-
send(method, obj)
|
749
|
+
method.to_s.end_with?('_from_object') && method != __method__
|
750
|
+
send(method, obj)
|
659
751
|
end
|
660
752
|
end
|
661
753
|
end
|
@@ -668,7 +760,7 @@ be relative"
|
|
668
760
|
def init_var(var, value)
|
669
761
|
# instance_var_name starts with @, var_name doesn't.
|
670
762
|
var = var.to_s
|
671
|
-
var_name = (var.start_with?('@') ? var[1
|
763
|
+
var_name = (var.start_with?('@') ? var[1..] : var).to_sym
|
672
764
|
instance_var_name = "@#{var_name}".to_sym
|
673
765
|
|
674
766
|
instance_variable_set(instance_var_name, value)
|
@@ -677,10 +769,42 @@ be relative"
|
|
677
769
|
var_name
|
678
770
|
end
|
679
771
|
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
772
|
+
# Returns all <a> fragment elements from within the HTML body e.g. #about.
|
773
|
+
def fragments
|
774
|
+
anchors = xpath("/html/body//a")
|
775
|
+
|
776
|
+
anchors.select do |anchor|
|
777
|
+
href = anchor.attributes['href']&.value
|
778
|
+
href&.start_with?('#')
|
779
|
+
end
|
780
|
+
end
|
781
|
+
|
782
|
+
# Returns a Hash{Int=>String} of <a> fragment positions and their href
|
783
|
+
# values. Only fragment anchors are returned e.g. <a> elements with a
|
784
|
+
# href starting with '#'.
|
785
|
+
def fragment_indices(fragments)
|
786
|
+
fragments.reduce({}) do |hash, fragment|
|
787
|
+
index = html_index(fragment)
|
788
|
+
next hash unless index
|
789
|
+
|
790
|
+
href = fragment.attributes['href']&.value
|
791
|
+
hash[index] = href
|
792
|
+
|
793
|
+
hash
|
794
|
+
end
|
795
|
+
end
|
796
|
+
|
797
|
+
# Takes a Nokogiri element or HTML substring and returns its index in
|
798
|
+
# the html. Returns the index/position Int or nil if not found. The search
|
799
|
+
# is case insensitive because Nokogiri lower cases camelCase attributes.
|
800
|
+
def html_index(el_or_str)
|
801
|
+
@html.downcase.index(el_or_str.to_s.strip.downcase)
|
802
|
+
end
|
803
|
+
|
804
|
+
alias_method :content, :html
|
805
|
+
alias_method :statistics, :stats
|
806
|
+
alias_method :internal_urls, :internal_links
|
807
|
+
alias_method :internal_absolute_urls, :internal_absolute_links
|
808
|
+
alias_method :external_urls, :external_links
|
685
809
|
end
|
686
810
|
end
|