wgit 0.10.7 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +44 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +22 -2
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +67 -39
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +118 -51
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/document.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
 require_relative 'assertable'
 require 'nokogiri'
 require 'json'
-require 'set'
 
 module Wgit
   # Class modeling/serialising a HTML web document, although other MIME types
@@ -18,9 +17,9 @@ module Wgit
     include Assertable
 
     # Regex for the allowed var names when defining an extractor.
-    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
+    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
 
-    # Set of text elements used to build Document#text.
+    # Set of text elements used to build the xpath for Document#text.
     @text_elements = Set.new(%i[
       a abbr address article aside b bdi bdo blockquote button caption cite
       code data dd del details dfn div dl dt em figcaption figure footer h1 h2
@@ -29,6 +28,13 @@ module Wgit
       summary sup td textarea th time u ul var wbr
     ])
 
+    # Instance vars to be ignored by Document#to_h and in turn Model.document.
+    @to_h_ignore_vars = [
+      '@parser', # Always ignore the Nokogiri object.
+      '@meta_robots', # Used by #no_index?, ignore.
+      '@meta_wgit' # Used by #no_index?, ignore.
+    ]
+
     # Set of Symbols representing the defined Document extractors.
     @extractors = Set.new
 
@@ -38,6 +44,12 @@ module Wgit
       # README.md for how to add to this Set dynamically.
       attr_reader :text_elements
 
+      # Array of instance vars to ignore when Document#to_h and in turn
+      # Model.document methods are called. Append your own defined extractor
+      # vars to omit them from the model (database object) when indexing.
+      # Each var should be a String starting with an '@' char e.g. "@data" etc.
+      attr_reader :to_h_ignore_vars
+
       # Set of Symbols representing the defined Document extractors. Is
       # read-only. Use Wgit::Document.define_extractor for a new extractor.
       attr_reader :extractors
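
As the comment above describes, user-defined extractor vars can be appended to this array so they never reach the database model. A minimal sketch (the :price extractor and its xpath are hypothetical):

  require 'wgit'

  # Hypothetical extractor whose value we don't want persisted when indexing.
  Wgit::Document.define_extractor(:price, '//meta[@name="price"]/@content')

  # Omit @price from Document#to_h (and therefore from Model.document).
  Wgit::Document.to_h_ignore_vars << '@price'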
@@ -76,9 +88,9 @@ module Wgit
     # false if the Document content is an image etc.
     def initialize(url_or_obj, html = '', encode: true)
       if url_or_obj.is_a?(String)
-        init_from_strings(url_or_obj, html, encode: encode)
+        init_from_strings(url_or_obj, html, encode:)
       else
-        init_from_object(url_or_obj, encode: encode)
+        init_from_object(url_or_obj, encode:)
       end
     end
 
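
Most of the one-line call-site changes in this release are the Ruby 3.1 keyword-argument shorthand: `encode:` with no value passes the local variable (or method) named `encode`. A standalone illustration, unrelated to the wgit API:

  def greet(name:, shout: false)
    msg = "Hello, #{name}"
    shout ? msg.upcase : msg
  end

  name  = 'wgit'
  shout = true

  greet(name: name, shout: shout) # explicit style, as removed above
  greet(name:, shout:)            # Ruby 3.1+ shorthand, as added above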
@@ -89,9 +101,9 @@ module Wgit
     #
     # @return [String] An xpath String to obtain a webpage's text elements.
     def self.text_elements_xpath
-      @text_elements.each_with_index.inject('') do |xpath, (el, i)|
+      @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
         xpath += ' | ' unless i.zero?
-        xpath += format('//%s/text()', el)
+        xpath + format('//%s/text()', el)
       end
     end
 
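
The reducer joins every entry of Document.text_elements into one union selector. A quick way to inspect the generated xpath:

  require 'wgit'

  # Prints something like "//a/text() | //abbr/text() | //address/text() | ..."
  # covering each element in Wgit::Document.text_elements.
  puts Wgit::Document.text_elements_xpath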
@@ -192,13 +204,27 @@ module Wgit
       Document.send(:remove_method, "init_#{var}_from_object")
 
       @extractors.delete(var.to_sym)
+
       true
     rescue NameError
       false
     end
 
+    # Removes all default and defined extractors by calling
+    # `Document.remove_extractor` underneath. See its documentation.
+    def self.remove_extractors
+      @extractors.each { |var| remove_extractor(var) }
+    end
+
     ### Document Instance Methods ###
 
+    # Overrides String#inspect to shorten the printed output of a Document.
+    #
+    # @return [String] A short textual representation of this Document.
+    def inspect
+      "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
+    end
+
     # Determines if both the url and html match. Use
     # doc.object_id == other.object_id for exact object comparison.
     #
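
A short sketch of the two new methods above, assuming the default extractors are still loaded:

  require 'wgit'

  doc = Wgit::Document.new('http://example.com', '<html><p>Hi</p></html>')
  puts doc.inspect # => #<Wgit::Document url="http://example.com" html_size=...>

  # Drop every default and user-defined extractor, e.g. in a test teardown.
  Wgit::Document.remove_extractors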
@@ -227,10 +253,10 @@ module Wgit
     # Provide the `link:` parameter to get the correct base URL for that type
     # of link. For example, a link of `#top` would always return @url because
     # it applies to that page, not a different one. Query strings work in the
-    # same way. Use this parameter if manually concatting Url's e.g.
+    # same way. Use this parameter if manually joining Url's e.g.
     #
     #   relative_link = Wgit::Url.new('?q=hello')
-    #   absolute_link = doc.base_url(link: relative_link).concat(relative_link)
+    #   absolute_link = doc.base_url(link: relative_link).join(relative_link)
     #
     # This is similar to how Wgit::Document#internal_absolute_links works.
     #
@@ -250,7 +276,7 @@ module Wgit
 be relative"
       end
 
-      get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
+      get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
 
       if link
         link = Wgit::Url.new(link)
@@ -274,11 +300,11 @@ be relative"
     # returned Hash.
     # @return [Hash] Containing self's instance vars.
     def to_h(include_html: false, include_score: true)
-      ignore = include_html ? [] : ['@html']
+      ignore = Wgit::Document.to_h_ignore_vars.dup
+      ignore << '@html' unless include_html
       ignore << '@score' unless include_score
-      ignore << '@parser' # Always ignore the Nokogiri object.
 
-      Wgit::Utils.to_h(self, ignore: ignore)
+      Wgit::Utils.to_h(self, ignore:)
     end
 
     # Converts this Document's #to_h return value to a JSON String.
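
With the new ignore list, the serialised Hash now leaves out @parser, @meta_robots and @meta_wgit as well as (by default) @html. A rough sketch, assuming the default extractors:

  doc = Wgit::Document.new('http://example.com', '<html><title>Hi</title></html>')

  doc.to_h.keys                     # no html key by default (and never @parser etc.)
  doc.to_h(include_html: true).keys # the html key is included this time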
@@ -287,7 +313,7 @@ be relative"
     # returned JSON String.
     # @return [String] This Document represented as a JSON String.
     def to_json(include_html: false)
-      h = to_h(include_html: include_html)
+      h = to_h(include_html:)
       JSON.generate(h)
     end
 
@@ -309,7 +335,7 @@ be relative"
         else
           next unless instance_variable_get(var).respond_to?(:length)
 
-          hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
+          hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
         end
       end
 
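
The `var[1..]` changes here and in #init_var are just the endless-range spelling of stripping the leading '@' character:

  var = '@title'
  var[1..-1] # => "title" (removed form)
  var[1..]   # => "title" (added form, Ruby 2.6+ endless range)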
@@ -417,7 +443,6 @@ be relative"
                 end
               end
               .reject { |link| link.relative?(host: @url.to_origin) }
-              .map(&:omit_trailing_slash)
 
       Wgit::Utils.sanitize(links)
     end
@@ -493,10 +518,7 @@ be relative"
         query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
       )
       orig_text = @text
-      @text = search(
-        query, case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence, sentence_limit: sentence_limit
-      )
+      @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
 
       orig_text
     end
@@ -519,11 +541,17 @@ be relative"
     # @return [String, Object] The value found in the html or the default value
     # (singleton ? nil : []).
     def extract(xpath, singleton: true, text_content_only: true, &block)
-      send(
-        :extract_from_html, xpath,
-        singleton: singleton, text_content_only: text_content_only,
-        &block
-      )
+      send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
+    end
+
+    # Works with the default extractors to extract and check the HTML meta tags
+    # instructing Wgit not to index this document (save it to a Database). If
+    # the default extractors are removed, this method will always return false.
+    #
+    # @return [Boolean] True if this document shouldn't be saved to a Database,
+    # false otherwise.
+    def no_index?
+      [@meta_robots, @meta_wgit].include?('noindex')
     end
 
     protected
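
A minimal sketch of the new #no_index? behaviour, assuming the default extractors are loaded so the robots meta tag gets picked up:

  require 'wgit'

  html = <<~HTML
    <html>
      <head><meta name="robots" content="noindex"></head>
      <body>Not for the database</body>
    </html>
  HTML

  doc = Wgit::Document.new('http://example.com/private', html)
  doc.no_index? # => true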
@@ -567,7 +595,7 @@ be relative"
         result = singleton ? result.content : result.map(&:content)
       end
 
-      Wgit::Utils.sanitize(result)
+      result = Wgit::Utils.sanitize(result)
       result = yield(result, self, :document) if block_given?
       result
     end
@@ -594,7 +622,7 @@ be relative"
       default = singleton ? nil : []
       result = obj.fetch(key.to_s, default)
 
-      Wgit::Utils.sanitize(result)
+      result = Wgit::Utils.sanitize(result)
       result = yield(result, obj, :object) if block_given?
       result
     end
@@ -614,13 +642,13 @@ be relative"
       @parser = init_nokogiri
       @score = 0.0
 
-      Wgit::Utils.sanitize(@html, encode: encode)
+      @html = Wgit::Utils.sanitize(@html, encode:)
 
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
         if method.to_s.start_with?('init_') &&
-           method.to_s.end_with?('_from_html')
-          send(method)
+           method.to_s.end_with?('_from_html') && method != __method__
+          send(method)
         end
       end
     end
@@ -635,13 +663,13 @@ be relative"
       @parser = init_nokogiri
       @score = obj.fetch('score', 0.0)
 
-      Wgit::Utils.sanitize(@html, encode: encode)
+      @html = Wgit::Utils.sanitize(@html, encode:)
 
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|
         if method.to_s.start_with?('init_') &&
-           method.to_s.end_with?('_from_object')
-          send(method, obj)
+           method.to_s.end_with?('_from_object') && method != __method__
+          send(method, obj)
         end
       end
     end
@@ -654,7 +682,7 @@ be relative"
     def init_var(var, value)
       # instance_var_name starts with @, var_name doesn't.
       var = var.to_s
-      var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
+      var_name = (var.start_with?('@') ? var[1..] : var).to_sym
       instance_var_name = "@#{var_name}".to_sym
 
       instance_variable_set(instance_var_name, value)
@@ -663,10 +691,10 @@ be relative"
       var_name
     end
 
-    alias content html
-    alias statistics stats
-    alias internal_urls internal_links
-    alias internal_absolute_urls internal_absolute_links
-    alias external_urls external_links
+    alias_method :content, :html
+    alias_method :statistics, :stats
+    alias_method :internal_urls, :internal_links
+    alias_method :internal_absolute_urls, :internal_absolute_links
+    alias_method :external_urls, :external_links
   end
 end
data/lib/wgit/document_extractors.rb
CHANGED
@@ -2,6 +2,20 @@
 
 ### Default Document Extractors ###
 
+# No index.
+Wgit::Document.define_extractor(
+  :meta_robots,
+  '//meta[@name="robots"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+Wgit::Document.define_extractor(
+  :meta_wgit,
+  '//meta[@name="wgit"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+
 # Base.
 Wgit::Document.define_extractor(
   :base,
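
The two new extractors also define reader methods on the document, which is what #no_index? checks. A sketch using the wgit-specific meta tag:

  doc = Wgit::Document.new(
    'http://example.com',
    '<html><head><meta name="wgit" content="noindex"></head></html>'
  )

  doc.meta_wgit   # => "noindex"
  doc.meta_robots # => nil (no robots meta tag in this page)
  doc.no_index?   # => true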
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.sanitize(keywords)
+    keywords = Wgit::Utils.sanitize(keywords)
   end
   keywords
 end
data/lib/wgit/dsl.rb
CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
      raise DSL_ERROR__NO_START_URL if urls.empty?
 
      urls.map! { |url| Wgit::Url.parse(url) }
-      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+      crawler.crawl_urls(*urls, follow_redirects:, &block)
    end
 
    # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,9 +135,7 @@ the 'start' function".freeze
      raise DSL_ERROR__NO_START_URL if urls.empty?
 
      xpath = follow || :default
-      opts = {
-        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
-      }
+      opts = { follow: xpath, allow_paths:, disallow_paths: }
 
      urls.reduce([]) do |externals, url|
        externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
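
For context, the DSL passes these keyword arguments straight through to Wgit::Crawler#crawl_site. A rough usage sketch (the URL and path pattern are made up):

  require 'wgit'

  include Wgit::DSL

  start 'http://example.com'

  # Only follow links whose path matches the allow pattern.
  crawl_site(allow_paths: 'blog/*') do |doc|
    puts doc.title
  end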
@@ -189,7 +187,7 @@ the 'start' function".freeze
      db = Wgit::Database.new(connection_string)
      indexer = Wgit::Indexer.new(db, crawler)
 
-      indexer.index_www(max_sites: max_sites, max_data: max_data)
+      indexer.index_www(max_sites:, max_data:)
    end
 
    # Indexes a single website using `Wgit::Indexer#index_site` underneath.
@@ -226,8 +224,7 @@ the 'start' function".freeze
      indexer = Wgit::Indexer.new(db, crawler)
      xpath = follow || :default
      crawl_opts = {
-        insert_externals: insert_externals, follow: xpath,
-        allow_paths: allow_paths, disallow_paths: disallow_paths
+        insert_externals:, follow: xpath, allow_paths:, disallow_paths:
      }
 
      urls.reduce(0) do |total, url|
@@ -261,9 +258,11 @@ the 'start' function".freeze
      indexer = Wgit::Indexer.new(db, crawler)
 
      urls.map! { |url| Wgit::Url.parse(url) }
-      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+      indexer.index_urls(*urls, insert_externals:, &block)
    end
 
+    ### DATABASE METHODS ###
+
    # Performs a search of the database's indexed documents and pretty prints
    # the results in a search engine-esque format. See `Wgit::Database#search!`
    # and `Wgit::Document#search!` for details of how the search works.
@@ -285,7 +284,7 @@ the 'start' function".freeze
    # database containing only its matching `#text`.
    # @return [Array<Wgit::Document>] The search results with matching text.
    def search(
-      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      query, connection_string: @dsl_conn_str, stream: $stdout,
      case_sensitive: false, whole_sentence: true,
      limit: 10, skip: 0, sentence_limit: 80, &block
    )
@@ -294,15 +293,12 @@ the 'start' function".freeze
 
      results = db.search!(
        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        limit: limit,
-        skip: skip,
-        sentence_limit: sentence_limit,
-        &block
+        case_sensitive:, whole_sentence:,
+        limit:, skip:,
+        sentence_limit:, &block
      )
 
-      Wgit::Utils.printf_search_results(results, stream: stream)
+      Wgit::Utils.pprint_search_results(results, stream:)
 
      results
    end
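
A rough sketch of the DSL search call after these changes; the connection string is a placeholder and the MongoDB instance must already contain indexed documents:

  require 'wgit'

  include Wgit::DSL

  results = search(
    'ruby web crawler',
    connection_string: 'mongodb://localhost:27017', # hypothetical
    limit: 5
  )

  results.each { |doc| puts doc.url }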
@@ -317,9 +313,9 @@ the 'start' function".freeze
      db.clear_db
    end
 
-    alias crawl_url crawl
-    alias crawl_r crawl_site
-    alias index_r index_site
-    alias start_urls start
+    alias_method :crawl_url, :crawl
+    alias_method :crawl_r, :crawl_site
+    alias_method :index_r, :index_site
+    alias_method :start_urls, :start
  end
end
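
The new aliases can be used interchangeably with the methods they point at; a short sketch:

  require 'wgit'

  include Wgit::DSL

  start 'http://example.com'

  # crawl_r is the alias for crawl_site (recursive crawl of the start URL).
  crawl_r { |doc| puts doc.url }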