wgit 0.10.8 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +3 -1
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +54 -40
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +115 -55
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/document.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
|
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'json'
|
6
|
-
require 'set'
|
7
6
|
|
8
7
|
module Wgit
|
9
8
|
# Class modeling/serialising a HTML web document, although other MIME types
|
@@ -18,9 +17,9 @@ module Wgit
|
|
18
17
|
include Assertable
|
19
18
|
|
20
19
|
# Regex for the allowed var names when defining an extractor.
|
21
|
-
REGEX_EXTRACTOR_NAME = /[a-z0-9_]
|
20
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
|
22
21
|
|
23
|
-
# Set of text elements used to build Document#text.
|
22
|
+
# Set of text elements used to build the xpath for Document#text.
|
24
23
|
@text_elements = Set.new(%i[
|
25
24
|
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
25
|
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
@@ -29,6 +28,13 @@ module Wgit
|
|
29
28
|
summary sup td textarea th time u ul var wbr
|
30
29
|
])
|
31
30
|
|
31
|
+
# Instance vars to be ignored by Document#to_h and in turn Model.document.
|
32
|
+
@to_h_ignore_vars = [
|
33
|
+
'@parser', # Always ignore the Nokogiri object.
|
34
|
+
'@meta_robots', # Used by #no_index?, ignore.
|
35
|
+
'@meta_wgit' # Used by #no_index?, ignore.
|
36
|
+
]
|
37
|
+
|
32
38
|
# Set of Symbols representing the defined Document extractors.
|
33
39
|
@extractors = Set.new
|
34
40
|
|
@@ -38,6 +44,12 @@ module Wgit
|
|
38
44
|
# README.md for how to add to this Set dynamically.
|
39
45
|
attr_reader :text_elements
|
40
46
|
|
47
|
+
# Array of instance vars to ignore when Document#to_h and in turn
|
48
|
+
# Model.document methods are called. Append your own defined extractor
|
49
|
+
# vars to omit them from the model (database object) when indexing.
|
50
|
+
# Each var should be a String starting with an '@' char e.g. "@data" etc.
|
51
|
+
attr_reader :to_h_ignore_vars
|
52
|
+
|
41
53
|
# Set of Symbols representing the defined Document extractors. Is
|
42
54
|
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
55
|
attr_reader :extractors
|
@@ -76,9 +88,9 @@ module Wgit
|
|
76
88
|
# false if the Document content is an image etc.
|
77
89
|
def initialize(url_or_obj, html = '', encode: true)
|
78
90
|
if url_or_obj.is_a?(String)
|
79
|
-
init_from_strings(url_or_obj, html, encode:
|
91
|
+
init_from_strings(url_or_obj, html, encode:)
|
80
92
|
else
|
81
|
-
init_from_object(url_or_obj, encode:
|
93
|
+
init_from_object(url_or_obj, encode:)
|
82
94
|
end
|
83
95
|
end
|
84
96
|
|
@@ -89,9 +101,9 @@ module Wgit
|
|
89
101
|
#
|
90
102
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
103
|
def self.text_elements_xpath
|
92
|
-
|
104
|
+
@text_elements.each_with_index.reduce('') do |xpath, (el, i)|
|
93
105
|
xpath += ' | ' unless i.zero?
|
94
|
-
xpath
|
106
|
+
xpath + format('//%s/text()', el)
|
95
107
|
end
|
96
108
|
end
|
97
109
|
|
@@ -210,7 +222,7 @@ module Wgit
|
|
210
222
|
#
|
211
223
|
# @return [String] A short textual representation of this Document.
|
212
224
|
def inspect
|
213
|
-
"#<Wgit::Document url=\"#{@url}\"
|
225
|
+
"#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
|
214
226
|
end
|
215
227
|
|
216
228
|
# Determines if both the url and html match. Use
|
@@ -241,10 +253,10 @@ module Wgit
|
|
241
253
|
# Provide the `link:` parameter to get the correct base URL for that type
|
242
254
|
# of link. For example, a link of `#top` would always return @url because
|
243
255
|
# it applies to that page, not a different one. Query strings work in the
|
244
|
-
# same way. Use this parameter if manually
|
256
|
+
# same way. Use this parameter if manually joining Url's e.g.
|
245
257
|
#
|
246
258
|
# relative_link = Wgit::Url.new('?q=hello')
|
247
|
-
# absolute_link = doc.base_url(link: relative_link).
|
259
|
+
# absolute_link = doc.base_url(link: relative_link).join(relative_link)
|
248
260
|
#
|
249
261
|
# This is similar to how Wgit::Document#internal_absolute_links works.
|
250
262
|
#
|
@@ -264,7 +276,7 @@ module Wgit
|
|
264
276
|
be relative"
|
265
277
|
end
|
266
278
|
|
267
|
-
get_base = -> { @base.relative? ? @url.to_origin.
|
279
|
+
get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
|
268
280
|
|
269
281
|
if link
|
270
282
|
link = Wgit::Url.new(link)
|
@@ -288,11 +300,11 @@ be relative"
|
|
288
300
|
# returned Hash.
|
289
301
|
# @return [Hash] Containing self's instance vars.
|
290
302
|
def to_h(include_html: false, include_score: true)
|
291
|
-
ignore =
|
303
|
+
ignore = Wgit::Document.to_h_ignore_vars.dup
|
304
|
+
ignore << '@html' unless include_html
|
292
305
|
ignore << '@score' unless include_score
|
293
|
-
ignore << '@parser' # Always ignore the Nokogiri object.
|
294
306
|
|
295
|
-
Wgit::Utils.to_h(self, ignore:
|
307
|
+
Wgit::Utils.to_h(self, ignore:)
|
296
308
|
end
|
297
309
|
|
298
310
|
# Converts this Document's #to_h return value to a JSON String.
|
@@ -301,7 +313,7 @@ be relative"
|
|
301
313
|
# returned JSON String.
|
302
314
|
# @return [String] This Document represented as a JSON String.
|
303
315
|
def to_json(include_html: false)
|
304
|
-
h = to_h(include_html:
|
316
|
+
h = to_h(include_html:)
|
305
317
|
JSON.generate(h)
|
306
318
|
end
|
307
319
|
|
@@ -323,7 +335,7 @@ be relative"
|
|
323
335
|
else
|
324
336
|
next unless instance_variable_get(var).respond_to?(:length)
|
325
337
|
|
326
|
-
hash[var[1
|
338
|
+
hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
|
327
339
|
end
|
328
340
|
end
|
329
341
|
|
@@ -431,7 +443,6 @@ be relative"
|
|
431
443
|
end
|
432
444
|
end
|
433
445
|
.reject { |link| link.relative?(host: @url.to_origin) }
|
434
|
-
.map(&:omit_trailing_slash)
|
435
446
|
|
436
447
|
Wgit::Utils.sanitize(links)
|
437
448
|
end
|
@@ -507,10 +518,7 @@ be relative"
|
|
507
518
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
508
519
|
)
|
509
520
|
orig_text = @text
|
510
|
-
@text = search(
|
511
|
-
query, case_sensitive: case_sensitive,
|
512
|
-
whole_sentence: whole_sentence, sentence_limit: sentence_limit
|
513
|
-
)
|
521
|
+
@text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
|
514
522
|
|
515
523
|
orig_text
|
516
524
|
end
|
@@ -533,11 +541,17 @@ be relative"
|
|
533
541
|
# @return [String, Object] The value found in the html or the default value
|
534
542
|
# (singleton ? nil : []).
|
535
543
|
def extract(xpath, singleton: true, text_content_only: true, &block)
|
536
|
-
send(
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
544
|
+
send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
|
545
|
+
end
|
546
|
+
|
547
|
+
# Works with the default extractors to extract and check the HTML meta tags
|
548
|
+
# instructing Wgit not to index this document (save it to a Database). If
|
549
|
+
# the default extractors are removed, this method will always return false.
|
550
|
+
#
|
551
|
+
# @return [Boolean] True if this document shouldn't be saved to a Database,
|
552
|
+
# false otherwise.
|
553
|
+
def no_index?
|
554
|
+
[@meta_robots, @meta_wgit].include?('noindex')
|
541
555
|
end
|
542
556
|
|
543
557
|
protected
|
@@ -581,7 +595,7 @@ be relative"
|
|
581
595
|
result = singleton ? result.content : result.map(&:content)
|
582
596
|
end
|
583
597
|
|
584
|
-
Wgit::Utils.sanitize(result)
|
598
|
+
result = Wgit::Utils.sanitize(result)
|
585
599
|
result = yield(result, self, :document) if block_given?
|
586
600
|
result
|
587
601
|
end
|
@@ -608,7 +622,7 @@ be relative"
|
|
608
622
|
default = singleton ? nil : []
|
609
623
|
result = obj.fetch(key.to_s, default)
|
610
624
|
|
611
|
-
Wgit::Utils.sanitize(result)
|
625
|
+
result = Wgit::Utils.sanitize(result)
|
612
626
|
result = yield(result, obj, :object) if block_given?
|
613
627
|
result
|
614
628
|
end
|
@@ -628,13 +642,13 @@ be relative"
|
|
628
642
|
@parser = init_nokogiri
|
629
643
|
@score = 0.0
|
630
644
|
|
631
|
-
Wgit::Utils.sanitize(@html, encode:
|
645
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
632
646
|
|
633
647
|
# Dynamically run the init_*_from_html methods.
|
634
648
|
Document.private_instance_methods(false).each do |method|
|
635
649
|
if method.to_s.start_with?('init_') &&
|
636
|
-
method.to_s.end_with?('_from_html')
|
637
|
-
send(method)
|
650
|
+
method.to_s.end_with?('_from_html') && method != __method__
|
651
|
+
send(method)
|
638
652
|
end
|
639
653
|
end
|
640
654
|
end
|
@@ -649,13 +663,13 @@ be relative"
|
|
649
663
|
@parser = init_nokogiri
|
650
664
|
@score = obj.fetch('score', 0.0)
|
651
665
|
|
652
|
-
Wgit::Utils.sanitize(@html, encode:
|
666
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
653
667
|
|
654
668
|
# Dynamically run the init_*_from_object methods.
|
655
669
|
Document.private_instance_methods(false).each do |method|
|
656
670
|
if method.to_s.start_with?('init_') &&
|
657
|
-
method.to_s.end_with?('_from_object')
|
658
|
-
send(method, obj)
|
671
|
+
method.to_s.end_with?('_from_object') && method != __method__
|
672
|
+
send(method, obj)
|
659
673
|
end
|
660
674
|
end
|
661
675
|
end
|
@@ -668,7 +682,7 @@ be relative"
|
|
668
682
|
def init_var(var, value)
|
669
683
|
# instance_var_name starts with @, var_name doesn't.
|
670
684
|
var = var.to_s
|
671
|
-
var_name = (var.start_with?('@') ? var[1
|
685
|
+
var_name = (var.start_with?('@') ? var[1..] : var).to_sym
|
672
686
|
instance_var_name = "@#{var_name}".to_sym
|
673
687
|
|
674
688
|
instance_variable_set(instance_var_name, value)
|
@@ -677,10 +691,10 @@ be relative"
|
|
677
691
|
var_name
|
678
692
|
end
|
679
693
|
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
694
|
+
alias_method :content, :html
|
695
|
+
alias_method :statistics, :stats
|
696
|
+
alias_method :internal_urls, :internal_links
|
697
|
+
alias_method :internal_absolute_urls, :internal_absolute_links
|
698
|
+
alias_method :external_urls, :external_links
|
685
699
|
end
|
686
700
|
end
|
data/lib/wgit/document_extractors.rb
CHANGED
@@ -2,6 +2,20 @@
|
|
2
2
|
|
3
3
|
### Default Document Extractors ###
|
4
4
|
|
5
|
+
# No index.
|
6
|
+
Wgit::Document.define_extractor(
|
7
|
+
:meta_robots,
|
8
|
+
'//meta[@name="robots"]/@content',
|
9
|
+
singleton: true,
|
10
|
+
text_content_only: true
|
11
|
+
)
|
12
|
+
Wgit::Document.define_extractor(
|
13
|
+
:meta_wgit,
|
14
|
+
'//meta[@name="wgit"]/@content',
|
15
|
+
singleton: true,
|
16
|
+
text_content_only: true
|
17
|
+
)
|
18
|
+
|
5
19
|
# Base.
|
6
20
|
Wgit::Document.define_extractor(
|
7
21
|
:base,
|
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
|
|
45
59
|
) do |keywords, _source, type|
|
46
60
|
if keywords && (type == :document)
|
47
61
|
keywords = keywords.split(',')
|
48
|
-
Wgit::Utils.sanitize(keywords)
|
62
|
+
keywords = Wgit::Utils.sanitize(keywords)
|
49
63
|
end
|
50
64
|
keywords
|
51
65
|
end
|
data/lib/wgit/dsl.rb
CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
|
|
101
101
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
102
102
|
|
103
103
|
urls.map! { |url| Wgit::Url.parse(url) }
|
104
|
-
crawler.crawl_urls(*urls, follow_redirects
|
104
|
+
crawler.crawl_urls(*urls, follow_redirects:, &block)
|
105
105
|
end
|
106
106
|
|
107
107
|
# Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
|
@@ -135,9 +135,7 @@ the 'start' function".freeze
|
|
135
135
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
136
136
|
|
137
137
|
xpath = follow || :default
|
138
|
-
opts = {
|
139
|
-
follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
|
140
|
-
}
|
138
|
+
opts = { follow: xpath, allow_paths:, disallow_paths: }
|
141
139
|
|
142
140
|
urls.reduce([]) do |externals, url|
|
143
141
|
externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
|
@@ -189,7 +187,7 @@ the 'start' function".freeze
|
|
189
187
|
db = Wgit::Database.new(connection_string)
|
190
188
|
indexer = Wgit::Indexer.new(db, crawler)
|
191
189
|
|
192
|
-
indexer.index_www(max_sites
|
190
|
+
indexer.index_www(max_sites:, max_data:)
|
193
191
|
end
|
194
192
|
|
195
193
|
# Indexes a single website using `Wgit::Indexer#index_site` underneath.
|
@@ -226,8 +224,7 @@ the 'start' function".freeze
|
|
226
224
|
indexer = Wgit::Indexer.new(db, crawler)
|
227
225
|
xpath = follow || :default
|
228
226
|
crawl_opts = {
|
229
|
-
insert_externals
|
230
|
-
allow_paths: allow_paths, disallow_paths: disallow_paths
|
227
|
+
insert_externals:, follow: xpath, allow_paths:, disallow_paths:
|
231
228
|
}
|
232
229
|
|
233
230
|
urls.reduce(0) do |total, url|
|
@@ -261,9 +258,11 @@ the 'start' function".freeze
|
|
261
258
|
indexer = Wgit::Indexer.new(db, crawler)
|
262
259
|
|
263
260
|
urls.map! { |url| Wgit::Url.parse(url) }
|
264
|
-
indexer.index_urls(*urls, insert_externals
|
261
|
+
indexer.index_urls(*urls, insert_externals:, &block)
|
265
262
|
end
|
266
263
|
|
264
|
+
### DATABASE METHODS ###
|
265
|
+
|
267
266
|
# Performs a search of the database's indexed documents and pretty prints
|
268
267
|
# the results in a search engine-esque format. See `Wgit::Database#search!`
|
269
268
|
# and `Wgit::Document#search!` for details of how the search works.
|
@@ -285,7 +284,7 @@ the 'start' function".freeze
|
|
285
284
|
# database containing only its matching `#text`.
|
286
285
|
# @return [Array<Wgit::Document>] The search results with matching text.
|
287
286
|
def search(
|
288
|
-
query, connection_string: @dsl_conn_str, stream:
|
287
|
+
query, connection_string: @dsl_conn_str, stream: $stdout,
|
289
288
|
case_sensitive: false, whole_sentence: true,
|
290
289
|
limit: 10, skip: 0, sentence_limit: 80, &block
|
291
290
|
)
|
@@ -294,15 +293,12 @@ the 'start' function".freeze
|
|
294
293
|
|
295
294
|
results = db.search!(
|
296
295
|
query,
|
297
|
-
case_sensitive
|
298
|
-
|
299
|
-
|
300
|
-
skip: skip,
|
301
|
-
sentence_limit: sentence_limit,
|
302
|
-
&block
|
296
|
+
case_sensitive:, whole_sentence:,
|
297
|
+
limit:, skip:,
|
298
|
+
sentence_limit:, &block
|
303
299
|
)
|
304
300
|
|
305
|
-
Wgit::Utils.
|
301
|
+
Wgit::Utils.pprint_search_results(results, stream:)
|
306
302
|
|
307
303
|
results
|
308
304
|
end
|
@@ -317,9 +313,9 @@ the 'start' function".freeze
|
|
317
313
|
db.clear_db
|
318
314
|
end
|
319
315
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
316
|
+
alias_method :crawl_url, :crawl
|
317
|
+
alias_method :crawl_r, :crawl_site
|
318
|
+
alias_method :index_r, :index_site
|
319
|
+
alias_method :start_urls, :start
|
324
320
|
end
|
325
321
|
end
|