wgit 0.0.17 → 0.0.18
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +61 -0
- data/LICENSE.txt +21 -0
- data/README.md +16 -7
- data/TODO.txt +34 -0
- data/lib/wgit.rb +3 -1
- data/lib/wgit/assertable.rb +35 -29
- data/lib/wgit/core_ext.rb +5 -3
- data/lib/wgit/crawler.rb +96 -58
- data/lib/wgit/database/connection_details.rb +4 -2
- data/lib/wgit/database/database.rb +84 -46
- data/lib/wgit/database/model.rb +12 -10
- data/lib/wgit/document.rb +100 -72
- data/lib/wgit/document_extensions.rb +11 -9
- data/lib/wgit/indexer.rb +34 -24
- data/lib/wgit/logger.rb +4 -2
- data/lib/wgit/url.rb +94 -59
- data/lib/wgit/utils.rb +13 -11
- data/lib/wgit/version.rb +3 -1
- metadata +41 -38
data/lib/wgit/database/connection_details.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative '../assertable'
|
2
4
|
|
3
5
|
module Wgit
|
@@ -9,7 +11,7 @@ module Wgit
|
|
9
11
|
CONNECTION_DETAILS = {}
|
10
12
|
|
11
13
|
# The keys required for a successful database connection.
|
12
|
-
CONNECTION_KEYS_REQUIRED = ['DB_CONNECTION_STRING']
|
14
|
+
CONNECTION_KEYS_REQUIRED = ['DB_CONNECTION_STRING'].freeze
|
13
15
|
|
14
16
|
# Set the database's connection details from the given hash. It is your
|
15
17
|
# responsibility to ensure the correct hash vars are present and set.
|
@@ -34,6 +36,6 @@ module Wgit
|
|
34
36
|
# @raise [KeyError] If any of the required connection details are missing.
|
35
37
|
# @return [Hash] Containing the database connection details from the ENV.
|
36
38
|
def self.set_connection_details_from_env
|
37
|
-
|
39
|
+
set_connection_details(ENV)
|
38
40
|
end
|
39
41
|
end
|
data/lib/wgit/database/database.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative '../document'
|
2
4
|
require_relative '../url'
|
3
5
|
require_relative '../utils'
|
@@ -7,7 +9,6 @@ require 'logger'
|
|
7
9
|
require 'mongo'
|
8
10
|
|
9
11
|
module Wgit
|
10
|
-
|
11
12
|
# Class modeling a DB connection and CRUD operations for the Url and
|
12
13
|
# Document collections.
|
13
14
|
class Database
|
@@ -73,7 +74,7 @@ module Wgit
|
|
73
74
|
# @yield [url] Given each Url returned from the DB.
|
74
75
|
# @return [Array<Wgit::Url>] The Urls obtained from the DB.
|
75
76
|
def urls(crawled = nil, limit = 0, skip = 0)
|
76
|
-
crawled.nil? ?
|
77
|
+
query = crawled.nil? ? {} : { crawled: crawled }
|
77
78
|
|
78
79
|
sort = { date_added: 1 }
|
79
80
|
results = retrieve(:urls, query, sort, {}, limit, skip)
|
@@ -129,12 +130,12 @@ module Wgit
|
|
129
130
|
# @return [Array<Wgit::Document>] The search results obtained from the DB.
|
130
131
|
def search(query, whole_sentence = false, limit = 10, skip = 0)
|
131
132
|
query.strip!
|
132
|
-
query.replace("
|
133
|
+
query.replace('"' + query + '"') if whole_sentence
|
133
134
|
|
134
135
|
# The sort_proj sorts based on the most search hits.
|
135
136
|
# We use the sort_proj hash as both a sort and a projection below.
|
136
137
|
# :$caseSensitive => case_sensitive, 3.2+ only.
|
137
|
-
sort_proj = { score: { :$meta =>
|
138
|
+
sort_proj = { score: { :$meta => 'textScore' } }
|
138
139
|
query = { :$text => { :$search => query } }
|
139
140
|
|
140
141
|
results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)
|
@@ -188,8 +189,8 @@ module Wgit
|
|
188
189
|
# @param url [Wgit::Url] The Url to search the DB for.
|
189
190
|
# @return [Boolean] True if url exists, otherwise false.
|
190
191
|
def url?(url)
|
191
|
-
h = {
|
192
|
-
|
192
|
+
h = { 'url' => url }
|
193
|
+
@@client[:urls].find(h).any?
|
193
194
|
end
|
194
195
|
|
195
196
|
# Returns whether or not a record with the given doc.url (which is unique)
|
@@ -199,8 +200,8 @@ module Wgit
|
|
199
200
|
# @return [Boolean] True if doc exists, otherwise false.
|
200
201
|
def doc?(doc)
|
201
202
|
url = doc.respond_to?(:url) ? doc.url : doc
|
202
|
-
h = {
|
203
|
-
|
203
|
+
h = { 'url' => url }
|
204
|
+
@@client[:documents].find(h).any?
|
204
205
|
end
|
205
206
|
|
206
207
|
### Update Data ###
|
@@ -220,19 +221,25 @@ module Wgit
|
|
220
221
|
end
|
221
222
|
end
|
222
223
|
|
223
|
-
|
224
|
+
protected
|
224
225
|
|
225
226
|
# Return if the write to the DB succeeded or not.
|
227
|
+
#
|
228
|
+
# @param result [Mongo::Object] The operation result.
|
229
|
+
# @param count [Integer] The number of records written to.
|
230
|
+
# @param multi [Boolean] True if more than one record is being written to.
|
231
|
+
# @raise [RuntimeError] If result.class isn't supported.
|
232
|
+
# @return [Boolean] True if the write was successful.
|
226
233
|
def write_succeeded?(result, count = 1, multi = false)
|
227
234
|
case result.class.to_s
|
228
235
|
# Single create result.
|
229
|
-
when
|
236
|
+
when 'Mongo::Operation::Insert::Result'
|
230
237
|
result.documents.first[:err].nil?
|
231
238
|
# Multiple create result.
|
232
|
-
when
|
239
|
+
when 'Mongo::BulkWrite::Result'
|
233
240
|
result.inserted_count == count
|
234
241
|
# Single and multiple update result.
|
235
|
-
when
|
242
|
+
when 'Mongo::Operation::Update::Result'
|
236
243
|
if multi
|
237
244
|
result.n == count
|
238
245
|
else
|
@@ -240,50 +247,65 @@ module Wgit
|
|
240
247
|
end
|
241
248
|
# Class no longer used, have you upgraded the 'mongo' gem?
|
242
249
|
else
|
243
|
-
raise "Result class not currently supported: #{result.class
|
250
|
+
raise "Result class not currently supported: #{result.class}"
|
244
251
|
end
|
245
252
|
end
|
246
253
|
|
247
254
|
# Insert one or more Url objects into the DB.
|
255
|
+
#
|
256
|
+
# @param url_or_urls [Wgit::Url, Array<Wgit::Url>] The Url or Url's to
|
257
|
+
# insert.
|
258
|
+
# @raise [RuntimeError] If url_or_urls isn't of the correct type.
|
259
|
+
# @return [Integer] The number of inserted Url's.
|
248
260
|
def insert_urls(url_or_urls)
|
249
|
-
|
250
|
-
assert_type(url_or_urls, Url)
|
251
|
-
url_or_urls = Wgit::Model.url(url_or_urls)
|
252
|
-
else
|
261
|
+
if url_or_urls.respond_to?(:map)
|
253
262
|
assert_arr_types(url_or_urls, Url)
|
254
263
|
url_or_urls = url_or_urls.map do |url|
|
255
264
|
Wgit::Model.url(url)
|
256
265
|
end
|
266
|
+
else
|
267
|
+
assert_type(url_or_urls, Url)
|
268
|
+
url_or_urls = Wgit::Model.url(url_or_urls)
|
257
269
|
end
|
258
270
|
create(:urls, url_or_urls)
|
259
271
|
end
|
260
272
|
|
261
273
|
# Insert one or more Document objects into the DB.
|
274
|
+
#
|
275
|
+
# @param doc_or_docs [Wgit::Document, Array<Wgit::Document>] The Document
|
276
|
+
# or Document's to insert.
|
277
|
+
# @raise [RuntimeError] If doc_or_docs isn't of the correct type.
|
278
|
+
# @return [Integer] The number of inserted Document's.
|
262
279
|
def insert_docs(doc_or_docs)
|
263
|
-
|
264
|
-
assert_type(doc_or_docs, [Document, Hash])
|
265
|
-
unless doc_or_docs.is_a?(Hash)
|
266
|
-
doc_or_docs = Wgit::Model.document(doc_or_docs)
|
267
|
-
end
|
268
|
-
else
|
280
|
+
if doc_or_docs.respond_to?(:map)
|
269
281
|
assert_arr_types(doc_or_docs, [Document, Hash])
|
270
282
|
doc_or_docs = doc_or_docs.map do |doc|
|
271
283
|
Wgit::Model.document(doc) unless doc.is_a?(Hash)
|
272
284
|
end
|
285
|
+
else
|
286
|
+
assert_type(doc_or_docs, [Document, Hash])
|
287
|
+
unless doc_or_docs.is_a?(Hash)
|
288
|
+
doc_or_docs = Wgit::Model.document(doc_or_docs)
|
289
|
+
end
|
273
290
|
end
|
274
291
|
create(:documents, doc_or_docs)
|
275
292
|
end
|
276
293
|
|
277
294
|
# Create/insert one or more Url or Document records into the DB.
|
295
|
+
#
|
296
|
+
# @param collection [Symbol] Either :urls or :documents.
|
297
|
+
# @param data [Hash, Array<Wgit::Url, Wgit::Document>] The data to insert.
|
298
|
+
# @raise [RuntimeError] If the data type is incorrect or if the write
|
299
|
+
# fails.
|
300
|
+
# @return [Integer] The number of inserted Objects.
|
278
301
|
def create(collection, data)
|
279
302
|
assert_type(data, [Hash, Array])
|
280
303
|
# Single doc.
|
281
304
|
if data.is_a?(Hash)
|
282
305
|
data.merge!(Wgit::Model.common_insert_data)
|
283
306
|
result = @@client[collection.to_sym].insert_one(data)
|
284
|
-
unless write_succeeded?(result)
|
285
|
-
|
286
|
-
end
|
307
|
+
raise 'DB write (insert) failed' unless write_succeeded?(result)
|
308
|
+
|
287
309
|
result.n
|
288
310
|
# Multiple docs.
|
289
311
|
elsif data.is_a?(Array)
|
@@ -292,9 +314,8 @@ module Wgit
|
|
292
314
|
data_hash.merge(Wgit::Model.common_insert_data)
|
293
315
|
end
|
294
316
|
result = @@client[collection.to_sym].insert_many(data)
|
295
|
-
unless write_succeeded?(result, data.length)
|
296
|
-
|
297
|
-
end
|
317
|
+
raise 'DB write(s) failed' unless write_succeeded?(result, data.length)
|
318
|
+
|
298
319
|
result.inserted_count
|
299
320
|
else
|
300
321
|
raise "data must be a Hash or an Array of Hash's"
|
@@ -302,52 +323,69 @@ module Wgit
|
|
302
323
|
end
|
303
324
|
|
304
325
|
# Retrieve Url or Document records from the DB.
|
326
|
+
#
|
327
|
+
# @param collection [Symbol] Either :urls or :documents.
|
328
|
+
# @param query [Hash] The query used during the retrieval.
|
329
|
+
# @param sort [Hash] The sort to use.
|
330
|
+
# @param projection [Hash] The projection to use.
|
331
|
+
# @param limit [Integer] The limit to use.
|
332
|
+
# @param skip [Integer] The skip to use.
|
333
|
+
# @return [Mongo::Object] The Mongo client find result.
|
305
334
|
def retrieve(collection, query,
|
306
335
|
sort = {}, projection = {},
|
307
336
|
limit = 0, skip = 0)
|
308
337
|
assert_type(query, Hash)
|
309
338
|
@@client[collection.to_sym].find(query).projection(projection)
|
310
|
-
|
339
|
+
.skip(skip).limit(limit).sort(sort)
|
311
340
|
end
|
312
341
|
|
313
342
|
# Update a Url object in the DB.
|
343
|
+
#
|
344
|
+
# @param url [Wgit::Url] The Url to update.
|
345
|
+
# @return [Integer] The number of updated records.
|
314
346
|
def update_url(url)
|
315
347
|
assert_type(url, Url)
|
316
348
|
selection = { url: url }
|
317
349
|
url_hash = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)
|
318
|
-
update = {
|
350
|
+
update = { '$set' => url_hash }
|
319
351
|
_update(true, :urls, selection, update)
|
320
352
|
end
|
321
353
|
|
322
354
|
# Update a Document object in the DB.
|
355
|
+
#
|
356
|
+
# @param doc [Wgit::Document] The Document to update.
|
357
|
+
# @return [Integer] The number of updated records.
|
323
358
|
def update_doc(doc)
|
324
359
|
assert_type(doc, Document)
|
325
360
|
selection = { url: doc.url }
|
326
361
|
doc_hash = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)
|
327
|
-
update = {
|
362
|
+
update = { '$set' => doc_hash }
|
328
363
|
_update(true, :documents, selection, update)
|
329
364
|
end
|
330
365
|
|
366
|
+
private
|
367
|
+
|
331
368
|
# Update one or more Url or Document records in the DB.
|
332
369
|
# NOTE: The Model.common_update_data should be merged in the calling
|
333
|
-
# method as the update param can be bespoke due to its nature.
|
370
|
+
# method as the update param can be bespoke, due to its nature.
|
334
371
|
def _update(single, collection, selection, update)
|
335
372
|
assert_arr_types([selection, update], Hash)
|
336
|
-
if single
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
raise
|
373
|
+
result = if single
|
374
|
+
@@client[collection.to_sym].update_one(selection, update)
|
375
|
+
else
|
376
|
+
@@client[collection.to_sym].update_many(selection, update)
|
377
|
+
end
|
378
|
+
raise 'DB write (update) failed' unless write_succeeded?(result)
|
379
|
+
|
342
380
|
result.n
|
343
381
|
end
|
344
382
|
|
345
|
-
alias
|
346
|
-
alias
|
347
|
-
alias
|
348
|
-
alias
|
349
|
-
alias
|
350
|
-
alias
|
351
|
-
alias
|
383
|
+
alias count size
|
384
|
+
alias length size
|
385
|
+
alias num_documents num_docs
|
386
|
+
alias document? doc?
|
387
|
+
alias insert_url insert_urls
|
388
|
+
alias insert_doc insert_docs
|
389
|
+
alias num_objects num_records
|
352
390
|
end
|
353
391
|
end
|
data/lib/wgit/database/model.rb
CHANGED
@@ -1,46 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative '../utils'
|
2
4
|
|
3
5
|
module Wgit
|
4
|
-
|
5
6
|
# Module containing the database (DB) data model structure.
|
6
7
|
module Model
|
7
|
-
|
8
8
|
# The data model for a Wgit::Url.
|
9
9
|
#
|
10
10
|
# @param url [Wgit::Url] The URL DB record.
|
11
11
|
# @return [Hash] The URL model ready for DB insertion.
|
12
12
|
def self.url(url)
|
13
|
-
raise
|
13
|
+
raise 'url must respond_to? to_h' unless url.respond_to?(:to_h)
|
14
|
+
|
14
15
|
model = url.to_h
|
15
16
|
Wgit::Utils.remove_non_bson_types(model)
|
16
17
|
end
|
17
|
-
|
18
|
+
|
18
19
|
# The data model for a Wgit::Document.
|
19
20
|
#
|
20
21
|
# @param doc [Wgit::Document] The Document DB record.
|
21
22
|
# @return [Hash] The Document model ready for DB insertion.
|
22
23
|
def self.document(doc)
|
23
|
-
raise
|
24
|
+
raise 'doc must respond_to? to_h' unless doc.respond_to?(:to_h)
|
25
|
+
|
24
26
|
model = doc.to_h(false)
|
25
27
|
Wgit::Utils.remove_non_bson_types(model)
|
26
28
|
end
|
27
|
-
|
29
|
+
|
28
30
|
# Default fields when inserting a record into the DB.
|
29
31
|
#
|
30
32
|
# @return [Hash] Containing common insertion fields for all models.
|
31
33
|
def self.common_insert_data
|
32
34
|
{
|
33
|
-
date_added:
|
34
|
-
date_modified:
|
35
|
+
date_added: Wgit::Utils.time_stamp,
|
36
|
+
date_modified: Wgit::Utils.time_stamp
|
35
37
|
}
|
36
38
|
end
|
37
|
-
|
39
|
+
|
38
40
|
# Default fields when updating a record in the DB.
|
39
41
|
#
|
40
42
|
# @return [Hash] Containing common update fields for all models.
|
41
43
|
def self.common_update_data
|
42
44
|
{
|
43
|
-
date_modified: Wgit::Utils.time_stamp
|
45
|
+
date_modified: Wgit::Utils.time_stamp
|
44
46
|
}
|
45
47
|
end
|
46
48
|
end
|
data/lib/wgit/document.rb
CHANGED
@@ -5,7 +5,6 @@ require 'nokogiri'
|
|
5
5
|
require 'json'
|
6
6
|
|
7
7
|
module Wgit
|
8
|
-
|
9
8
|
# Class modeling a HTML web document. Also doubles as a search result when
|
10
9
|
# loading Documents from the database.
|
11
10
|
#
|
@@ -19,9 +18,9 @@ module Wgit
|
|
19
18
|
# The HTML elements that make up the visible text on a page.
|
20
19
|
# These elements are used to initialize the @text of the Document.
|
21
20
|
# See the README.md for how to add to this Array dynamically.
|
22
|
-
@text_elements = [
|
23
|
-
|
24
|
-
|
21
|
+
@text_elements = %i[
|
22
|
+
dd div dl dt figcaption figure hr li
|
23
|
+
main ol p pre span ul h1 h2 h3 h4 h5
|
25
24
|
]
|
26
25
|
|
27
26
|
class << self
|
@@ -58,14 +57,14 @@ module Wgit
|
|
58
57
|
# keys.
|
59
58
|
# @param html [String] The crawled web page's HTML. This param is only
|
60
59
|
# required if url_or_obj is a String representing the web page's URL.
|
61
|
-
def initialize(url_or_obj, html =
|
60
|
+
def initialize(url_or_obj, html = '')
|
62
61
|
# Init from URL String and HTML String.
|
63
62
|
if url_or_obj.is_a?(String)
|
64
63
|
url = url_or_obj
|
65
64
|
assert_type(url, Wgit::Url)
|
66
65
|
|
67
66
|
@url = url
|
68
|
-
@html = html
|
67
|
+
@html = html || ''
|
69
68
|
@doc = init_nokogiri
|
70
69
|
@score = 0.0
|
71
70
|
|
@@ -73,9 +72,9 @@ module Wgit
|
|
73
72
|
|
74
73
|
# Dynamically run the init_*_from_html methods.
|
75
74
|
Document.private_instance_methods(false).each do |method|
|
76
|
-
if method.to_s.start_with?(
|
77
|
-
|
78
|
-
|
75
|
+
if method.to_s.start_with?('init_') &&
|
76
|
+
method.to_s.end_with?('_from_html')
|
77
|
+
send(method)
|
79
78
|
end
|
80
79
|
end
|
81
80
|
# Init from a Hash like object containing Strings as keys e.g. Mongo
|
@@ -84,18 +83,18 @@ module Wgit
|
|
84
83
|
obj = url_or_obj
|
85
84
|
assert_respond_to(obj, :fetch)
|
86
85
|
|
87
|
-
@url = Wgit::Url.new(obj.fetch(
|
88
|
-
@html = obj.fetch(
|
86
|
+
@url = Wgit::Url.new(obj.fetch('url')) # Should always be present.
|
87
|
+
@html = obj.fetch('html', '')
|
89
88
|
@doc = init_nokogiri
|
90
|
-
@score = obj.fetch(
|
89
|
+
@score = obj.fetch('score', 0.0)
|
91
90
|
|
92
91
|
process_url_and_html
|
93
92
|
|
94
93
|
# Dynamically run the init_*_from_object methods.
|
95
94
|
Document.private_instance_methods(false).each do |method|
|
96
|
-
if method.to_s.start_with?(
|
97
|
-
|
98
|
-
|
95
|
+
if method.to_s.start_with?('init_') &&
|
96
|
+
method.to_s.end_with?('_from_object')
|
97
|
+
send(method, obj)
|
99
98
|
end
|
100
99
|
end
|
101
100
|
end
|
@@ -108,7 +107,8 @@ module Wgit
|
|
108
107
|
# @return [Boolean] True if @url and @html are equal, false if not.
|
109
108
|
def ==(other_doc)
|
110
109
|
return false unless other_doc.is_a? Wgit::Document
|
111
|
-
|
110
|
+
|
111
|
+
(@url == other_doc.url) && (@html == other_doc.html)
|
112
112
|
end
|
113
113
|
|
114
114
|
# Is a shortcut for calling Document#html[range].
|
@@ -148,7 +148,7 @@ module Wgit
|
|
148
148
|
assert_type(link, Wgit::Url)
|
149
149
|
raise "link must be relative: #{link}" unless link.is_relative?
|
150
150
|
|
151
|
-
if link.is_anchor?
|
151
|
+
if link.is_anchor? || link.is_query_string?
|
152
152
|
base_url = @base ? get_base.call : @url
|
153
153
|
return base_url.without_anchor.without_query_string
|
154
154
|
end
|
@@ -166,8 +166,8 @@ module Wgit
|
|
166
166
|
# returned Hash.
|
167
167
|
# @return [Hash] Containing self's instance vars.
|
168
168
|
def to_h(include_html = false)
|
169
|
-
ignore = include_html ? [] : [
|
170
|
-
ignore <<
|
169
|
+
ignore = include_html ? [] : ['@html']
|
170
|
+
ignore << '@doc' # Always ignore "@doc"
|
171
171
|
Wgit::Utils.to_h(self, ignore)
|
172
172
|
end
|
173
173
|
|
@@ -200,8 +200,9 @@ module Wgit
|
|
200
200
|
# Else take the var's #length method return value.
|
201
201
|
else
|
202
202
|
next unless instance_variable_get(var).respond_to?(:length)
|
203
|
+
|
203
204
|
hash[var[1..-1].to_sym] =
|
204
|
-
|
205
|
+
instance_variable_get(var).send(:length)
|
205
206
|
end
|
206
207
|
end
|
207
208
|
hash
|
@@ -219,6 +220,7 @@ module Wgit
|
|
219
220
|
# @return [Boolean] True if @html is nil/empty, false otherwise.
|
220
221
|
def empty?
|
221
222
|
return true if @html.nil?
|
223
|
+
|
222
224
|
@html.empty?
|
223
225
|
end
|
224
226
|
|
@@ -252,12 +254,12 @@ module Wgit
|
|
252
254
|
def internal_links
|
253
255
|
return [] if @links.empty?
|
254
256
|
|
255
|
-
links = @links
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
257
|
+
links = @links
|
258
|
+
.select { |link| link.is_relative?(host: @url.to_base) }
|
259
|
+
.map(&:without_base)
|
260
|
+
.map do |link| # We map @url.to_host into / because it's a duplicate.
|
261
|
+
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
262
|
+
end
|
261
263
|
|
262
264
|
Wgit::Utils.process_arr(links)
|
263
265
|
end
|
@@ -271,6 +273,7 @@ module Wgit
|
|
271
273
|
def internal_full_links
|
272
274
|
links = internal_links
|
273
275
|
return [] if links.empty?
|
276
|
+
|
274
277
|
links.map { |link| base_url(link: link).concat(link) }
|
275
278
|
end
|
276
279
|
|
@@ -281,9 +284,9 @@ module Wgit
|
|
281
284
|
def external_links
|
282
285
|
return [] if @links.empty?
|
283
286
|
|
284
|
-
links = @links
|
285
|
-
|
286
|
-
|
287
|
+
links = @links
|
288
|
+
.reject { |link| link.relative_link?(host: @url.to_base) }
|
289
|
+
.map(&:without_trailing_slash)
|
287
290
|
|
288
291
|
Wgit::Utils.process_arr(links)
|
289
292
|
end
|
@@ -304,24 +307,25 @@ module Wgit
|
|
304
307
|
# sentence.
|
305
308
|
# @return [Array<String>] Representing the search results.
|
306
309
|
def search(query, sentence_limit = 80)
|
307
|
-
raise
|
308
|
-
raise
|
310
|
+
raise 'A search query must be provided' if query.empty?
|
311
|
+
raise 'The sentence_limit value must be even' if sentence_limit.odd?
|
309
312
|
|
310
313
|
results = {}
|
311
314
|
regex = Regexp.new(query, Regexp::IGNORECASE)
|
312
315
|
|
313
316
|
@text.each do |sentence|
|
314
317
|
hits = sentence.scan(regex).count
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
318
|
+
next unless hits > 0
|
319
|
+
|
320
|
+
sentence.strip!
|
321
|
+
index = sentence.index(regex)
|
322
|
+
Wgit::Utils.format_sentence_length(sentence, index, sentence_limit)
|
323
|
+
results[sentence] = hits
|
321
324
|
end
|
322
325
|
|
323
326
|
return [] if results.empty?
|
324
|
-
|
327
|
+
|
328
|
+
results = Hash[results.sort_by { |_k, v| v }]
|
325
329
|
results.keys.reverse
|
326
330
|
end
|
327
331
|
|
@@ -347,12 +351,13 @@ module Wgit
|
|
347
351
|
#
|
348
352
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
349
353
|
def self.text_elements_xpath
|
350
|
-
xpath =
|
354
|
+
xpath = ''
|
351
355
|
return xpath if Wgit::Document.text_elements.empty?
|
352
|
-
|
356
|
+
|
357
|
+
el_xpath = '//%s/text()'
|
353
358
|
Wgit::Document.text_elements.each_with_index do |el, i|
|
354
|
-
xpath +=
|
355
|
-
xpath += el_xpath
|
359
|
+
xpath += ' | ' unless i == 0
|
360
|
+
xpath += format(el_xpath, el)
|
356
361
|
end
|
357
362
|
xpath
|
358
363
|
end
|
@@ -429,35 +434,42 @@ module Wgit
|
|
429
434
|
false
|
430
435
|
end
|
431
436
|
|
432
|
-
|
437
|
+
protected
|
433
438
|
|
434
|
-
# Initializes the nokogiri object using @html, which
|
439
|
+
# Initializes the nokogiri object using @html, which cannot be nil.
|
440
|
+
# Override this method to custom configure the Nokogiri object returned.
|
441
|
+
# Gets called from Wgit::Document.new.
|
442
|
+
#
|
443
|
+
# @return [Nokogiri::HTML] The initialised Nokogiri HTML object.
|
435
444
|
def init_nokogiri
|
436
|
-
raise
|
445
|
+
raise '@html must be set' unless @html
|
446
|
+
|
437
447
|
Nokogiri::HTML(@html) do |config|
|
438
448
|
# TODO: Remove #'s below when crawling in production.
|
439
|
-
#config.options = Nokogiri::XML::ParseOptions::STRICT |
|
449
|
+
# config.options = Nokogiri::XML::ParseOptions::STRICT |
|
440
450
|
# Nokogiri::XML::ParseOptions::NONET
|
441
451
|
end
|
442
452
|
end
|
443
453
|
|
444
|
-
#
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
#
|
451
|
-
#
|
452
|
-
#
|
453
|
-
#
|
454
|
-
#
|
455
|
-
#
|
454
|
+
# Returns a value/object from this Document's @html using the given xpath
|
455
|
+
# parameter.
|
456
|
+
#
|
457
|
+
# @param xpath [String] Used to find the value/object in @html.
|
458
|
+
# @param singleton [Boolean] singleton ? results.first (single Nokogiri
|
459
|
+
# Object) : results (Array).
|
460
|
+
# @param text_content_only [Boolean] text_content_only ? result.content
|
461
|
+
# (String) : result (Nokogiri Object).
|
462
|
+
# @yield [String/Object, Symbol] Given the value before it's set as an
|
463
|
+
# instance variable so that you can inspect/alter the value if desired.
|
464
|
+
# Return nil from the block if you don't want to override the value. Also
|
465
|
+
# given the source which is always :html.
|
466
|
+
# @return [String, Object] The value found in the html or the default value
|
467
|
+
# (singleton ? nil : []).
|
456
468
|
def find_in_html(xpath, singleton: true, text_content_only: true)
|
457
469
|
xpath = xpath.call if xpath.respond_to?(:call)
|
458
470
|
results = @doc.xpath(xpath)
|
459
471
|
|
460
|
-
if results
|
472
|
+
if results && !results.empty?
|
461
473
|
result = if singleton
|
462
474
|
text_content_only ? results.first.content : results.first
|
463
475
|
else
|
@@ -477,10 +489,17 @@ module Wgit
|
|
477
489
|
result
|
478
490
|
end
|
479
491
|
|
480
|
-
#
|
481
|
-
#
|
482
|
-
#
|
483
|
-
#
|
492
|
+
# Returns a value from the obj using the given key via obj#fetch.
|
493
|
+
#
|
494
|
+
# @param obj [Object#fetch] The object containing the key/value.
|
495
|
+
# @param key [String] Used to find the value in the obj.
|
496
|
+
# @param singleton [Boolean] True if a single value, false otherwise.
|
497
|
+
# @yield [String/Object, Symbol] Given the value before it's set as an
|
498
|
+
# instance variable so that you can inspect/alter the value if desired.
|
499
|
+
# Return nil from the block if you don't want to override the value. Also
|
500
|
+
# given the source which is always :object.
|
501
|
+
# @return [String, Object] The value found in the obj or the default value
|
502
|
+
# (singleton ? nil : []).
|
484
503
|
def find_in_object(obj, key, singleton: true)
|
485
504
|
assert_respond_to(obj, :fetch)
|
486
505
|
|
@@ -496,14 +515,17 @@ module Wgit
|
|
496
515
|
result
|
497
516
|
end
|
498
517
|
|
518
|
+
private
|
519
|
+
|
499
520
|
# Initialises an instance variable and defines a getter method for it.
|
521
|
+
#
|
500
522
|
# @param var [Symbol] The name of the variable to be initialized.
|
501
523
|
# @param value [Object] The newly initialized variable's value.
|
502
524
|
# @return [Symbol] The name of the newly created getter method.
|
503
525
|
def init_var(var, value)
|
504
526
|
# instance_var_name starts with @, var_name doesn't.
|
505
527
|
var = var.to_s
|
506
|
-
var_name = (var.start_with?(
|
528
|
+
var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
|
507
529
|
instance_var_name = "@#{var_name}".to_sym
|
508
530
|
|
509
531
|
instance_variable_set(instance_var_name, value)
|
@@ -513,13 +535,19 @@ module Wgit
|
|
513
535
|
end
|
514
536
|
end
|
515
537
|
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
alias
|
523
|
-
alias
|
538
|
+
# Ensure the @url and @html Strings are correctly encoded etc.
|
539
|
+
def process_url_and_html
|
540
|
+
@url = Wgit::Utils.process_str(@url)
|
541
|
+
@html = Wgit::Utils.process_str(@html)
|
542
|
+
end
|
543
|
+
|
544
|
+
alias relative_links internal_links
|
545
|
+
alias relative_urls internal_links
|
546
|
+
alias relative_full_links internal_full_links
|
547
|
+
alias relative_full_urls internal_full_links
|
548
|
+
alias internal_absolute_links internal_full_links
|
549
|
+
alias relative_absolute_links internal_full_links
|
550
|
+
alias relative_absolute_urls internal_full_links
|
551
|
+
alias external_urls external_links
|
524
552
|
end
|
525
553
|
end
|