wgit 0.10.7 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/document.rb CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
5
  require 'json'
6
- require 'set'
7
6
 
8
7
  module Wgit
9
8
  # Class modeling/serialising a HTML web document, although other MIME types
@@ -18,9 +17,9 @@ module Wgit
18
17
  include Assertable
19
18
 
20
19
  # Regex for the allowed var names when defining an extractor.
21
- REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
20
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
22
21
 
23
- # Set of text elements used to build Document#text.
22
+ # Set of text elements used to build the xpath for Document#text.
24
23
  @text_elements = Set.new(%i[
25
24
  a abbr address article aside b bdi bdo blockquote button caption cite
26
25
  code data dd del details dfn div dl dt em figcaption figure footer h1 h2
@@ -29,6 +28,13 @@ module Wgit
29
28
  summary sup td textarea th time u ul var wbr
30
29
  ])
31
30
 
31
+ # Instance vars to be ignored by Document#to_h and in turn Model.document.
32
+ @to_h_ignore_vars = [
33
+ '@parser', # Always ignore the Nokogiri object.
34
+ '@meta_robots', # Used by #no_index?, ignore.
35
+ '@meta_wgit' # Used by #no_index?, ignore.
36
+ ]
37
+
32
38
  # Set of Symbols representing the defined Document extractors.
33
39
  @extractors = Set.new
34
40
 
@@ -38,6 +44,12 @@ module Wgit
38
44
  # README.md for how to add to this Set dynamically.
39
45
  attr_reader :text_elements
40
46
 
47
+ # Array of instance vars to ignore when Document#to_h and in turn
48
+ # Model.document methods are called. Append your own defined extractor
49
+ # vars to omit them from the model (database object) when indexing.
50
+ # Each var should be a String starting with an '@' char e.g. "@data" etc.
51
+ attr_reader :to_h_ignore_vars
52
+
41
53
  # Set of Symbols representing the defined Document extractors. Is
42
54
  # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
55
  attr_reader :extractors
@@ -76,9 +88,9 @@ module Wgit
76
88
  # false if the Document content is an image etc.
77
89
  def initialize(url_or_obj, html = '', encode: true)
78
90
  if url_or_obj.is_a?(String)
79
- init_from_strings(url_or_obj, html, encode: encode)
91
+ init_from_strings(url_or_obj, html, encode:)
80
92
  else
81
- init_from_object(url_or_obj, encode: encode)
93
+ init_from_object(url_or_obj, encode:)
82
94
  end
83
95
  end
84
96
 
@@ -89,9 +101,9 @@ module Wgit
89
101
  #
90
102
  # @return [String] An xpath String to obtain a webpage's text elements.
91
103
  def self.text_elements_xpath
92
- Wgit::Document.text_elements.each_with_index.reduce('') do |xpath, (el, i)|
104
+ @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
93
105
  xpath += ' | ' unless i.zero?
94
- xpath += format('//%s/text()', el)
106
+ xpath + format('//%s/text()', el)
95
107
  end
96
108
  end
97
109
 
@@ -192,13 +204,27 @@ module Wgit
192
204
  Document.send(:remove_method, "init_#{var}_from_object")
193
205
 
194
206
  @extractors.delete(var.to_sym)
207
+
195
208
  true
196
209
  rescue NameError
197
210
  false
198
211
  end
199
212
 
213
+ # Removes all default and defined extractors by calling
214
+ # `Document.remove_extractor` underneath. See its documentation.
215
+ def self.remove_extractors
216
+ @extractors.each { |var| remove_extractor(var) }
217
+ end
218
+
200
219
  ### Document Instance Methods ###
201
220
 
221
+ # Overrides String#inspect to shorten the printed output of a Document.
222
+ #
223
+ # @return [String] A short textual representation of this Document.
224
+ def inspect
225
+ "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
226
+ end
227
+
202
228
  # Determines if both the url and html match. Use
203
229
  # doc.object_id == other.object_id for exact object comparison.
204
230
  #
@@ -227,10 +253,10 @@ module Wgit
227
253
  # Provide the `link:` parameter to get the correct base URL for that type
228
254
  # of link. For example, a link of `#top` would always return @url because
229
255
  # it applies to that page, not a different one. Query strings work in the
230
- # same way. Use this parameter if manually concatting Url's e.g.
256
+ # same way. Use this parameter if manually joining Url's e.g.
231
257
  #
232
258
  # relative_link = Wgit::Url.new('?q=hello')
233
- # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
259
+ # absolute_link = doc.base_url(link: relative_link).join(relative_link)
234
260
  #
235
261
  # This is similar to how Wgit::Document#internal_absolute_links works.
236
262
  #
@@ -250,7 +276,7 @@ module Wgit
250
276
  be relative"
251
277
  end
252
278
 
253
- get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
279
+ get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
254
280
 
255
281
  if link
256
282
  link = Wgit::Url.new(link)
@@ -274,11 +300,11 @@ be relative"
274
300
  # returned Hash.
275
301
  # @return [Hash] Containing self's instance vars.
276
302
  def to_h(include_html: false, include_score: true)
277
- ignore = include_html ? [] : ['@html']
303
+ ignore = Wgit::Document.to_h_ignore_vars.dup
304
+ ignore << '@html' unless include_html
278
305
  ignore << '@score' unless include_score
279
- ignore << '@parser' # Always ignore the Nokogiri object.
280
306
 
281
- Wgit::Utils.to_h(self, ignore: ignore)
307
+ Wgit::Utils.to_h(self, ignore:)
282
308
  end
283
309
 
284
310
  # Converts this Document's #to_h return value to a JSON String.
@@ -287,7 +313,7 @@ be relative"
287
313
  # returned JSON String.
288
314
  # @return [String] This Document represented as a JSON String.
289
315
  def to_json(include_html: false)
290
- h = to_h(include_html: include_html)
316
+ h = to_h(include_html:)
291
317
  JSON.generate(h)
292
318
  end
293
319
 
@@ -309,7 +335,7 @@ be relative"
309
335
  else
310
336
  next unless instance_variable_get(var).respond_to?(:length)
311
337
 
312
- hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
338
+ hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
313
339
  end
314
340
  end
315
341
 
@@ -417,7 +443,6 @@ be relative"
417
443
  end
418
444
  end
419
445
  .reject { |link| link.relative?(host: @url.to_origin) }
420
- .map(&:omit_trailing_slash)
421
446
 
422
447
  Wgit::Utils.sanitize(links)
423
448
  end
@@ -493,10 +518,7 @@ be relative"
493
518
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
494
519
  )
495
520
  orig_text = @text
496
- @text = search(
497
- query, case_sensitive: case_sensitive,
498
- whole_sentence: whole_sentence, sentence_limit: sentence_limit
499
- )
521
+ @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
500
522
 
501
523
  orig_text
502
524
  end
@@ -519,11 +541,17 @@ be relative"
519
541
  # @return [String, Object] The value found in the html or the default value
520
542
  # (singleton ? nil : []).
521
543
  def extract(xpath, singleton: true, text_content_only: true, &block)
522
- send(
523
- :extract_from_html, xpath,
524
- singleton: singleton, text_content_only: text_content_only,
525
- &block
526
- )
544
+ send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
545
+ end
546
+
547
+ # Works with the default extractors to extract and check the HTML meta tags
548
+ # instructing Wgit not to index this document (save it to a Database). If
549
+ # the default extractors are removed, this method will always return false.
550
+ #
551
+ # @return [Boolean] True if this document shouldn't be saved to a Database,
552
+ # false otherwise.
553
+ def no_index?
554
+ [@meta_robots, @meta_wgit].include?('noindex')
527
555
  end
528
556
 
529
557
  protected
@@ -567,7 +595,7 @@ be relative"
567
595
  result = singleton ? result.content : result.map(&:content)
568
596
  end
569
597
 
570
- Wgit::Utils.sanitize(result)
598
+ result = Wgit::Utils.sanitize(result)
571
599
  result = yield(result, self, :document) if block_given?
572
600
  result
573
601
  end
@@ -594,7 +622,7 @@ be relative"
594
622
  default = singleton ? nil : []
595
623
  result = obj.fetch(key.to_s, default)
596
624
 
597
- Wgit::Utils.sanitize(result)
625
+ result = Wgit::Utils.sanitize(result)
598
626
  result = yield(result, obj, :object) if block_given?
599
627
  result
600
628
  end
@@ -614,13 +642,13 @@ be relative"
614
642
  @parser = init_nokogiri
615
643
  @score = 0.0
616
644
 
617
- Wgit::Utils.sanitize(@html, encode: encode)
645
+ @html = Wgit::Utils.sanitize(@html, encode:)
618
646
 
619
647
  # Dynamically run the init_*_from_html methods.
620
648
  Document.private_instance_methods(false).each do |method|
621
649
  if method.to_s.start_with?('init_') &&
622
- method.to_s.end_with?('_from_html')
623
- send(method) unless method == __method__
650
+ method.to_s.end_with?('_from_html') && method != __method__
651
+ send(method)
624
652
  end
625
653
  end
626
654
  end
@@ -635,13 +663,13 @@ be relative"
635
663
  @parser = init_nokogiri
636
664
  @score = obj.fetch('score', 0.0)
637
665
 
638
- Wgit::Utils.sanitize(@html, encode: encode)
666
+ @html = Wgit::Utils.sanitize(@html, encode:)
639
667
 
640
668
  # Dynamically run the init_*_from_object methods.
641
669
  Document.private_instance_methods(false).each do |method|
642
670
  if method.to_s.start_with?('init_') &&
643
- method.to_s.end_with?('_from_object')
644
- send(method, obj) unless method == __method__
671
+ method.to_s.end_with?('_from_object') && method != __method__
672
+ send(method, obj)
645
673
  end
646
674
  end
647
675
  end
@@ -654,7 +682,7 @@ be relative"
654
682
  def init_var(var, value)
655
683
  # instance_var_name starts with @, var_name doesn't.
656
684
  var = var.to_s
657
- var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
685
+ var_name = (var.start_with?('@') ? var[1..] : var).to_sym
658
686
  instance_var_name = "@#{var_name}".to_sym
659
687
 
660
688
  instance_variable_set(instance_var_name, value)
@@ -663,10 +691,10 @@ be relative"
663
691
  var_name
664
692
  end
665
693
 
666
- alias content html
667
- alias statistics stats
668
- alias internal_urls internal_links
669
- alias internal_absolute_urls internal_absolute_links
670
- alias external_urls external_links
694
+ alias_method :content, :html
695
+ alias_method :statistics, :stats
696
+ alias_method :internal_urls, :internal_links
697
+ alias_method :internal_absolute_urls, :internal_absolute_links
698
+ alias_method :external_urls, :external_links
671
699
  end
672
700
  end
@@ -2,6 +2,20 @@
2
2
 
3
3
  ### Default Document Extractors ###
4
4
 
5
+ # No index.
6
+ Wgit::Document.define_extractor(
7
+ :meta_robots,
8
+ '//meta[@name="robots"]/@content',
9
+ singleton: true,
10
+ text_content_only: true
11
+ )
12
+ Wgit::Document.define_extractor(
13
+ :meta_wgit,
14
+ '//meta[@name="wgit"]/@content',
15
+ singleton: true,
16
+ text_content_only: true
17
+ )
18
+
5
19
  # Base.
6
20
  Wgit::Document.define_extractor(
7
21
  :base,
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
45
59
  ) do |keywords, _source, type|
46
60
  if keywords && (type == :document)
47
61
  keywords = keywords.split(',')
48
- Wgit::Utils.sanitize(keywords)
62
+ keywords = Wgit::Utils.sanitize(keywords)
49
63
  end
50
64
  keywords
51
65
  end
data/lib/wgit/dsl.rb CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
101
101
  raise DSL_ERROR__NO_START_URL if urls.empty?
102
102
 
103
103
  urls.map! { |url| Wgit::Url.parse(url) }
104
- crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
104
+ crawler.crawl_urls(*urls, follow_redirects:, &block)
105
105
  end
106
106
 
107
107
  # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,9 +135,7 @@ the 'start' function".freeze
135
135
  raise DSL_ERROR__NO_START_URL if urls.empty?
136
136
 
137
137
  xpath = follow || :default
138
- opts = {
139
- follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
140
- }
138
+ opts = { follow: xpath, allow_paths:, disallow_paths: }
141
139
 
142
140
  urls.reduce([]) do |externals, url|
143
141
  externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
@@ -189,7 +187,7 @@ the 'start' function".freeze
189
187
  db = Wgit::Database.new(connection_string)
190
188
  indexer = Wgit::Indexer.new(db, crawler)
191
189
 
192
- indexer.index_www(max_sites: max_sites, max_data: max_data)
190
+ indexer.index_www(max_sites:, max_data:)
193
191
  end
194
192
 
195
193
  # Indexes a single website using `Wgit::Indexer#index_site` underneath.
@@ -226,8 +224,7 @@ the 'start' function".freeze
226
224
  indexer = Wgit::Indexer.new(db, crawler)
227
225
  xpath = follow || :default
228
226
  crawl_opts = {
229
- insert_externals: insert_externals, follow: xpath,
230
- allow_paths: allow_paths, disallow_paths: disallow_paths
227
+ insert_externals:, follow: xpath, allow_paths:, disallow_paths:
231
228
  }
232
229
 
233
230
  urls.reduce(0) do |total, url|
@@ -261,9 +258,11 @@ the 'start' function".freeze
261
258
  indexer = Wgit::Indexer.new(db, crawler)
262
259
 
263
260
  urls.map! { |url| Wgit::Url.parse(url) }
264
- indexer.index_urls(*urls, insert_externals: insert_externals, &block)
261
+ indexer.index_urls(*urls, insert_externals:, &block)
265
262
  end
266
263
 
264
+ ### DATABASE METHODS ###
265
+
267
266
  # Performs a search of the database's indexed documents and pretty prints
268
267
  # the results in a search engine-esque format. See `Wgit::Database#search!`
269
268
  # and `Wgit::Document#search!` for details of how the search works.
@@ -285,7 +284,7 @@ the 'start' function".freeze
285
284
  # database containing only its matching `#text`.
286
285
  # @return [Array<Wgit::Document>] The search results with matching text.
287
286
  def search(
288
- query, connection_string: @dsl_conn_str, stream: STDOUT,
287
+ query, connection_string: @dsl_conn_str, stream: $stdout,
289
288
  case_sensitive: false, whole_sentence: true,
290
289
  limit: 10, skip: 0, sentence_limit: 80, &block
291
290
  )
@@ -294,15 +293,12 @@ the 'start' function".freeze
294
293
 
295
294
  results = db.search!(
296
295
  query,
297
- case_sensitive: case_sensitive,
298
- whole_sentence: whole_sentence,
299
- limit: limit,
300
- skip: skip,
301
- sentence_limit: sentence_limit,
302
- &block
296
+ case_sensitive:, whole_sentence:,
297
+ limit:, skip:,
298
+ sentence_limit:, &block
303
299
  )
304
300
 
305
- Wgit::Utils.printf_search_results(results, stream: stream)
301
+ Wgit::Utils.pprint_search_results(results, stream:)
306
302
 
307
303
  results
308
304
  end
@@ -317,9 +313,9 @@ the 'start' function".freeze
317
313
  db.clear_db
318
314
  end
319
315
 
320
- alias crawl_url crawl
321
- alias crawl_r crawl_site
322
- alias index_r index_site
323
- alias start_urls start
316
+ alias_method :crawl_url, :crawl
317
+ alias_method :crawl_r, :crawl_site
318
+ alias_method :index_r, :index_site
319
+ alias_method :start_urls, :start
324
320
  end
325
321
  end