wgit 0.10.8 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +33 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +3 -1
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +54 -40
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +115 -55
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
data/lib/wgit/document.rb
CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
|
|
3
3
|
require_relative 'assertable'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'json'
|
6
|
-
require 'set'
|
7
6
|
|
8
7
|
module Wgit
|
9
8
|
# Class modeling/serialising a HTML web document, although other MIME types
|
@@ -18,9 +17,9 @@ module Wgit
|
|
18
17
|
include Assertable
|
19
18
|
|
20
19
|
# Regex for the allowed var names when defining an extractor.
|
21
|
-
REGEX_EXTRACTOR_NAME = /[a-z0-9_]
|
20
|
+
REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
|
22
21
|
|
23
|
-
# Set of text elements used to build Document#text.
|
22
|
+
# Set of text elements used to build the xpath for Document#text.
|
24
23
|
@text_elements = Set.new(%i[
|
25
24
|
a abbr address article aside b bdi bdo blockquote button caption cite
|
26
25
|
code data dd del details dfn div dl dt em figcaption figure footer h1 h2
|
@@ -29,6 +28,13 @@ module Wgit
|
|
29
28
|
summary sup td textarea th time u ul var wbr
|
30
29
|
])
|
31
30
|
|
31
|
+
# Instance vars to be ignored by Document#to_h and in turn Model.document.
|
32
|
+
@to_h_ignore_vars = [
|
33
|
+
'@parser', # Always ignore the Nokogiri object.
|
34
|
+
'@meta_robots', # Used by #no_index?, ignore.
|
35
|
+
'@meta_wgit' # Used by #no_index?, ignore.
|
36
|
+
]
|
37
|
+
|
32
38
|
# Set of Symbols representing the defined Document extractors.
|
33
39
|
@extractors = Set.new
|
34
40
|
|
@@ -38,6 +44,12 @@ module Wgit
|
|
38
44
|
# README.md for how to add to this Set dynamically.
|
39
45
|
attr_reader :text_elements
|
40
46
|
|
47
|
+
# Array of instance vars to ignore when Document#to_h and in turn
|
48
|
+
# Model.document methods are called. Append your own defined extractor
|
49
|
+
# vars to omit them from the model (database object) when indexing.
|
50
|
+
# Each var should be a String starting with an '@' char e.g. "@data" etc.
|
51
|
+
attr_reader :to_h_ignore_vars
|
52
|
+
|
41
53
|
# Set of Symbols representing the defined Document extractors. Is
|
42
54
|
# read-only. Use Wgit::Document.define_extractor for a new extractor.
|
43
55
|
attr_reader :extractors
|
@@ -76,9 +88,9 @@ module Wgit
|
|
76
88
|
# false if the Document content is an image etc.
|
77
89
|
def initialize(url_or_obj, html = '', encode: true)
|
78
90
|
if url_or_obj.is_a?(String)
|
79
|
-
init_from_strings(url_or_obj, html, encode: encode)
|
91
|
+
init_from_strings(url_or_obj, html, encode:)
|
80
92
|
else
|
81
|
-
init_from_object(url_or_obj, encode: encode)
|
93
|
+
init_from_object(url_or_obj, encode:)
|
82
94
|
end
|
83
95
|
end
|
84
96
|
|
@@ -89,9 +101,9 @@ module Wgit
|
|
89
101
|
#
|
90
102
|
# @return [String] An xpath String to obtain a webpage's text elements.
|
91
103
|
def self.text_elements_xpath
|
92
|
-
|
104
|
+
@text_elements.each_with_index.reduce('') do |xpath, (el, i)|
|
93
105
|
xpath += ' | ' unless i.zero?
|
94
|
-
xpath
|
106
|
+
xpath + format('//%s/text()', el)
|
95
107
|
end
|
96
108
|
end
|
97
109
|
|
@@ -210,7 +222,7 @@ module Wgit
|
|
210
222
|
#
|
211
223
|
# @return [String] A short textual representation of this Document.
|
212
224
|
def inspect
|
213
|
-
"#<Wgit::Document url=\"#{@url}\"
|
225
|
+
"#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
|
214
226
|
end
|
215
227
|
|
216
228
|
# Determines if both the url and html match. Use
|
@@ -241,10 +253,10 @@ module Wgit
|
|
241
253
|
# Provide the `link:` parameter to get the correct base URL for that type
|
242
254
|
# of link. For example, a link of `#top` would always return @url because
|
243
255
|
# it applies to that page, not a different one. Query strings work in the
|
244
|
-
# same way. Use this parameter if manually
|
256
|
+
# same way. Use this parameter if manually joining Url's e.g.
|
245
257
|
#
|
246
258
|
# relative_link = Wgit::Url.new('?q=hello')
|
247
|
-
# absolute_link = doc.base_url(link: relative_link).
|
259
|
+
# absolute_link = doc.base_url(link: relative_link).join(relative_link)
|
248
260
|
#
|
249
261
|
# This is similar to how Wgit::Document#internal_absolute_links works.
|
250
262
|
#
|
@@ -264,7 +276,7 @@ module Wgit
|
|
264
276
|
be relative"
|
265
277
|
end
|
266
278
|
|
267
|
-
get_base = -> { @base.relative? ? @url.to_origin.
|
279
|
+
get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
|
268
280
|
|
269
281
|
if link
|
270
282
|
link = Wgit::Url.new(link)
|
@@ -288,11 +300,11 @@ be relative"
|
|
288
300
|
# returned Hash.
|
289
301
|
# @return [Hash] Containing self's instance vars.
|
290
302
|
def to_h(include_html: false, include_score: true)
|
291
|
-
ignore =
|
303
|
+
ignore = Wgit::Document.to_h_ignore_vars.dup
|
304
|
+
ignore << '@html' unless include_html
|
292
305
|
ignore << '@score' unless include_score
|
293
|
-
ignore << '@parser' # Always ignore the Nokogiri object.
|
294
306
|
|
295
|
-
Wgit::Utils.to_h(self, ignore: ignore)
|
307
|
+
Wgit::Utils.to_h(self, ignore:)
|
296
308
|
end
|
297
309
|
|
298
310
|
# Converts this Document's #to_h return value to a JSON String.
|
@@ -301,7 +313,7 @@ be relative"
|
|
301
313
|
# returned JSON String.
|
302
314
|
# @return [String] This Document represented as a JSON String.
|
303
315
|
def to_json(include_html: false)
|
304
|
-
h = to_h(include_html: include_html)
|
316
|
+
h = to_h(include_html:)
|
305
317
|
JSON.generate(h)
|
306
318
|
end
|
307
319
|
|
@@ -323,7 +335,7 @@ be relative"
|
|
323
335
|
else
|
324
336
|
next unless instance_variable_get(var).respond_to?(:length)
|
325
337
|
|
326
|
-
hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
|
338
|
+
hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
|
327
339
|
end
|
328
340
|
end
|
329
341
|
|
@@ -431,7 +443,6 @@ be relative"
|
|
431
443
|
end
|
432
444
|
end
|
433
445
|
.reject { |link| link.relative?(host: @url.to_origin) }
|
434
|
-
.map(&:omit_trailing_slash)
|
435
446
|
|
436
447
|
Wgit::Utils.sanitize(links)
|
437
448
|
end
|
@@ -507,10 +518,7 @@ be relative"
|
|
507
518
|
query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
|
508
519
|
)
|
509
520
|
orig_text = @text
|
510
|
-
@text = search(
|
511
|
-
query, case_sensitive: case_sensitive,
|
512
|
-
whole_sentence: whole_sentence, sentence_limit: sentence_limit
|
513
|
-
)
|
521
|
+
@text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
|
514
522
|
|
515
523
|
orig_text
|
516
524
|
end
|
@@ -533,11 +541,17 @@ be relative"
|
|
533
541
|
# @return [String, Object] The value found in the html or the default value
|
534
542
|
# (singleton ? nil : []).
|
535
543
|
def extract(xpath, singleton: true, text_content_only: true, &block)
|
536
|
-
send(
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
544
|
+
send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
|
545
|
+
end
|
546
|
+
|
547
|
+
# Works with the default extractors to extract and check the HTML meta tags
|
548
|
+
# instructing Wgit not to index this document (save it to a Database). If
|
549
|
+
# the default extractors are removed, this method will always return false.
|
550
|
+
#
|
551
|
+
# @return [Boolean] True if this document shouldn't be saved to a Database,
|
552
|
+
# false otherwise.
|
553
|
+
def no_index?
|
554
|
+
[@meta_robots, @meta_wgit].include?('noindex')
|
541
555
|
end
|
542
556
|
|
543
557
|
protected
|
@@ -581,7 +595,7 @@ be relative"
|
|
581
595
|
result = singleton ? result.content : result.map(&:content)
|
582
596
|
end
|
583
597
|
|
584
|
-
Wgit::Utils.sanitize(result)
|
598
|
+
result = Wgit::Utils.sanitize(result)
|
585
599
|
result = yield(result, self, :document) if block_given?
|
586
600
|
result
|
587
601
|
end
|
@@ -608,7 +622,7 @@ be relative"
|
|
608
622
|
default = singleton ? nil : []
|
609
623
|
result = obj.fetch(key.to_s, default)
|
610
624
|
|
611
|
-
Wgit::Utils.sanitize(result)
|
625
|
+
result = Wgit::Utils.sanitize(result)
|
612
626
|
result = yield(result, obj, :object) if block_given?
|
613
627
|
result
|
614
628
|
end
|
@@ -628,13 +642,13 @@ be relative"
|
|
628
642
|
@parser = init_nokogiri
|
629
643
|
@score = 0.0
|
630
644
|
|
631
|
-
@html = Wgit::Utils.sanitize(@html, encode: encode)
|
645
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
632
646
|
|
633
647
|
# Dynamically run the init_*_from_html methods.
|
634
648
|
Document.private_instance_methods(false).each do |method|
|
635
649
|
if method.to_s.start_with?('init_') &&
|
636
|
-
method.to_s.end_with?('_from_html')
|
637
|
-
send(method)
|
650
|
+
method.to_s.end_with?('_from_html') && method != __method__
|
651
|
+
send(method)
|
638
652
|
end
|
639
653
|
end
|
640
654
|
end
|
@@ -649,13 +663,13 @@ be relative"
|
|
649
663
|
@parser = init_nokogiri
|
650
664
|
@score = obj.fetch('score', 0.0)
|
651
665
|
|
652
|
-
@html = Wgit::Utils.sanitize(@html, encode: encode)
|
666
|
+
@html = Wgit::Utils.sanitize(@html, encode:)
|
653
667
|
|
654
668
|
# Dynamically run the init_*_from_object methods.
|
655
669
|
Document.private_instance_methods(false).each do |method|
|
656
670
|
if method.to_s.start_with?('init_') &&
|
657
|
-
method.to_s.end_with?('_from_object')
|
658
|
-
send(method, obj)
|
671
|
+
method.to_s.end_with?('_from_object') && method != __method__
|
672
|
+
send(method, obj)
|
659
673
|
end
|
660
674
|
end
|
661
675
|
end
|
@@ -668,7 +682,7 @@ be relative"
|
|
668
682
|
def init_var(var, value)
|
669
683
|
# instance_var_name starts with @, var_name doesn't.
|
670
684
|
var = var.to_s
|
671
|
-
var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
|
685
|
+
var_name = (var.start_with?('@') ? var[1..] : var).to_sym
|
672
686
|
instance_var_name = "@#{var_name}".to_sym
|
673
687
|
|
674
688
|
instance_variable_set(instance_var_name, value)
|
@@ -677,10 +691,10 @@ be relative"
|
|
677
691
|
var_name
|
678
692
|
end
|
679
693
|
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
694
|
+
alias_method :content, :html
|
695
|
+
alias_method :statistics, :stats
|
696
|
+
alias_method :internal_urls, :internal_links
|
697
|
+
alias_method :internal_absolute_urls, :internal_absolute_links
|
698
|
+
alias_method :external_urls, :external_links
|
685
699
|
end
|
686
700
|
end
|
data/lib/wgit/document_extractors.rb
CHANGED
@@ -2,6 +2,20 @@
|
|
2
2
|
|
3
3
|
### Default Document Extractors ###
|
4
4
|
|
5
|
+
# No index.
|
6
|
+
Wgit::Document.define_extractor(
|
7
|
+
:meta_robots,
|
8
|
+
'//meta[@name="robots"]/@content',
|
9
|
+
singleton: true,
|
10
|
+
text_content_only: true
|
11
|
+
)
|
12
|
+
Wgit::Document.define_extractor(
|
13
|
+
:meta_wgit,
|
14
|
+
'//meta[@name="wgit"]/@content',
|
15
|
+
singleton: true,
|
16
|
+
text_content_only: true
|
17
|
+
)
|
18
|
+
|
5
19
|
# Base.
|
6
20
|
Wgit::Document.define_extractor(
|
7
21
|
:base,
|
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
|
|
45
59
|
) do |keywords, _source, type|
|
46
60
|
if keywords && (type == :document)
|
47
61
|
keywords = keywords.split(',')
|
48
|
-
Wgit::Utils.sanitize(keywords)
|
62
|
+
keywords = Wgit::Utils.sanitize(keywords)
|
49
63
|
end
|
50
64
|
keywords
|
51
65
|
end
|
data/lib/wgit/dsl.rb
CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
|
|
101
101
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
102
102
|
|
103
103
|
urls.map! { |url| Wgit::Url.parse(url) }
|
104
|
-
crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
|
104
|
+
crawler.crawl_urls(*urls, follow_redirects:, &block)
|
105
105
|
end
|
106
106
|
|
107
107
|
# Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
|
@@ -135,9 +135,7 @@ the 'start' function".freeze
|
|
135
135
|
raise DSL_ERROR__NO_START_URL if urls.empty?
|
136
136
|
|
137
137
|
xpath = follow || :default
|
138
|
-
opts = {
|
139
|
-
follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
|
140
|
-
}
|
138
|
+
opts = { follow: xpath, allow_paths:, disallow_paths: }
|
141
139
|
|
142
140
|
urls.reduce([]) do |externals, url|
|
143
141
|
externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
|
@@ -189,7 +187,7 @@ the 'start' function".freeze
|
|
189
187
|
db = Wgit::Database.new(connection_string)
|
190
188
|
indexer = Wgit::Indexer.new(db, crawler)
|
191
189
|
|
192
|
-
indexer.index_www(max_sites: max_sites, max_data: max_data)
|
190
|
+
indexer.index_www(max_sites:, max_data:)
|
193
191
|
end
|
194
192
|
|
195
193
|
# Indexes a single website using `Wgit::Indexer#index_site` underneath.
|
@@ -226,8 +224,7 @@ the 'start' function".freeze
|
|
226
224
|
indexer = Wgit::Indexer.new(db, crawler)
|
227
225
|
xpath = follow || :default
|
228
226
|
crawl_opts = {
|
229
|
-
insert_externals: insert_externals, follow: xpath,
|
230
|
-
allow_paths: allow_paths, disallow_paths: disallow_paths
|
227
|
+
insert_externals:, follow: xpath, allow_paths:, disallow_paths:
|
231
228
|
}
|
232
229
|
|
233
230
|
urls.reduce(0) do |total, url|
|
@@ -261,9 +258,11 @@ the 'start' function".freeze
|
|
261
258
|
indexer = Wgit::Indexer.new(db, crawler)
|
262
259
|
|
263
260
|
urls.map! { |url| Wgit::Url.parse(url) }
|
264
|
-
indexer.index_urls(*urls, insert_externals: insert_externals, &block)
|
261
|
+
indexer.index_urls(*urls, insert_externals:, &block)
|
265
262
|
end
|
266
263
|
|
264
|
+
### DATABASE METHODS ###
|
265
|
+
|
267
266
|
# Performs a search of the database's indexed documents and pretty prints
|
268
267
|
# the results in a search engine-esque format. See `Wgit::Database#search!`
|
269
268
|
# and `Wgit::Document#search!` for details of how the search works.
|
@@ -285,7 +284,7 @@ the 'start' function".freeze
|
|
285
284
|
# database containing only its matching `#text`.
|
286
285
|
# @return [Array<Wgit::Document>] The search results with matching text.
|
287
286
|
def search(
|
288
|
-
query, connection_string: @dsl_conn_str, stream:
|
287
|
+
query, connection_string: @dsl_conn_str, stream: $stdout,
|
289
288
|
case_sensitive: false, whole_sentence: true,
|
290
289
|
limit: 10, skip: 0, sentence_limit: 80, &block
|
291
290
|
)
|
@@ -294,15 +293,12 @@ the 'start' function".freeze
|
|
294
293
|
|
295
294
|
results = db.search!(
|
296
295
|
query,
|
297
|
-
case_sensitive: case_sensitive,
|
298
|
-
whole_sentence: whole_sentence,
|
299
|
-
limit: limit,
|
300
|
-
skip: skip,
|
301
|
-
sentence_limit: sentence_limit,
|
302
|
-
&block
|
296
|
+
case_sensitive:, whole_sentence:,
|
297
|
+
limit:, skip:,
|
298
|
+
sentence_limit:, &block
|
303
299
|
)
|
304
300
|
|
305
|
-
Wgit::Utils.
|
301
|
+
Wgit::Utils.pprint_search_results(results, stream:)
|
306
302
|
|
307
303
|
results
|
308
304
|
end
|
@@ -317,9 +313,9 @@ the 'start' function".freeze
|
|
317
313
|
db.clear_db
|
318
314
|
end
|
319
315
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
316
|
+
alias_method :crawl_url, :crawl
|
317
|
+
alias_method :crawl_r, :crawl_site
|
318
|
+
alias_method :index_r, :index_site
|
319
|
+
alias_method :start_urls, :start
|
324
320
|
end
|
325
321
|
end
|