wgit 0.10.8 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wgit/document.rb CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
  require_relative 'assertable'
  require 'nokogiri'
  require 'json'
- require 'set'

  module Wgit
  # Class modeling/serialising a HTML web document, although other MIME types
@@ -18,9 +17,9 @@ module Wgit
  include Assertable

  # Regex for the allowed var names when defining an extractor.
- REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/

- # Set of text elements used to build Document#text.
+ # Set of text elements used to build the xpath for Document#text.
  @text_elements = Set.new(%i[
  a abbr address article aside b bdi bdo blockquote button caption cite
  code data dd del details dfn div dl dt em figcaption figure footer h1 h2
@@ -29,6 +28,13 @@ module Wgit
  summary sup td textarea th time u ul var wbr
  ])

+ # Instance vars to be ignored by Document#to_h and in turn Model.document.
+ @to_h_ignore_vars = [
+ '@parser', # Always ignore the Nokogiri object.
+ '@meta_robots', # Used by #no_index?, ignore.
+ '@meta_wgit' # Used by #no_index?, ignore.
+ ]
+
  # Set of Symbols representing the defined Document extractors.
  @extractors = Set.new

@@ -38,6 +44,12 @@ module Wgit
  # README.md for how to add to this Set dynamically.
  attr_reader :text_elements

+ # Array of instance vars to ignore when Document#to_h and in turn
+ # Model.document methods are called. Append your own defined extractor
+ # vars to omit them from the model (database object) when indexing.
+ # Each var should be a String starting with an '@' char e.g. "@data" etc.
+ attr_reader :to_h_ignore_vars
+
  # Set of Symbols representing the defined Document extractors. Is
  # read-only. Use Wgit::Document.define_extractor for a new extractor.
  attr_reader :extractors
@@ -76,9 +88,9 @@ module Wgit
  # false if the Document content is an image etc.
  def initialize(url_or_obj, html = '', encode: true)
  if url_or_obj.is_a?(String)
- init_from_strings(url_or_obj, html, encode: encode)
+ init_from_strings(url_or_obj, html, encode:)
  else
- init_from_object(url_or_obj, encode: encode)
+ init_from_object(url_or_obj, encode:)
  end
  end

@@ -89,9 +101,9 @@ module Wgit
  #
  # @return [String] An xpath String to obtain a webpage's text elements.
  def self.text_elements_xpath
- Wgit::Document.text_elements.each_with_index.reduce('') do |xpath, (el, i)|
+ @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
  xpath += ' | ' unless i.zero?
- xpath += format('//%s/text()', el)
+ xpath + format('//%s/text()', el)
  end
  end

@@ -210,7 +222,7 @@ module Wgit
  #
  # @return [String] A short textual representation of this Document.
  def inspect
- "#<Wgit::Document url=\"#{@url}\" html=#{size} bytes>"
+ "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
  end

  # Determines if both the url and html match. Use
@@ -241,10 +253,10 @@ module Wgit
  # Provide the `link:` parameter to get the correct base URL for that type
  # of link. For example, a link of `#top` would always return @url because
  # it applies to that page, not a different one. Query strings work in the
- # same way. Use this parameter if manually concatting Url's e.g.
+ # same way. Use this parameter if manually joining Url's e.g.
  #
  # relative_link = Wgit::Url.new('?q=hello')
- # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
+ # absolute_link = doc.base_url(link: relative_link).join(relative_link)
  #
  # This is similar to how Wgit::Document#internal_absolute_links works.
  #
@@ -264,7 +276,7 @@ module Wgit
  be relative"
  end

- get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
+ get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }

  if link
  link = Wgit::Url.new(link)
@@ -288,11 +300,11 @@ be relative"
  # returned Hash.
  # @return [Hash] Containing self's instance vars.
  def to_h(include_html: false, include_score: true)
- ignore = include_html ? [] : ['@html']
+ ignore = Wgit::Document.to_h_ignore_vars.dup
+ ignore << '@html' unless include_html
  ignore << '@score' unless include_score
- ignore << '@parser' # Always ignore the Nokogiri object.

- Wgit::Utils.to_h(self, ignore: ignore)
+ Wgit::Utils.to_h(self, ignore:)
  end

  # Converts this Document's #to_h return value to a JSON String.
@@ -301,7 +313,7 @@ be relative"
  # returned JSON String.
  # @return [String] This Document represented as a JSON String.
  def to_json(include_html: false)
- h = to_h(include_html: include_html)
+ h = to_h(include_html:)
  JSON.generate(h)
  end

@@ -323,7 +335,7 @@ be relative"
  else
  next unless instance_variable_get(var).respond_to?(:length)

- hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
+ hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
  end
  end

@@ -431,7 +443,6 @@ be relative"
  end
  end
  .reject { |link| link.relative?(host: @url.to_origin) }
- .map(&:omit_trailing_slash)

  Wgit::Utils.sanitize(links)
  end
@@ -507,10 +518,7 @@ be relative"
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
  )
  orig_text = @text
- @text = search(
- query, case_sensitive: case_sensitive,
- whole_sentence: whole_sentence, sentence_limit: sentence_limit
- )
+ @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)

  orig_text
  end
@@ -533,11 +541,17 @@ be relative"
  # @return [String, Object] The value found in the html or the default value
  # (singleton ? nil : []).
  def extract(xpath, singleton: true, text_content_only: true, &block)
- send(
- :extract_from_html, xpath,
- singleton: singleton, text_content_only: text_content_only,
- &block
- )
+ send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
+ end
+
+ # Works with the default extractors to extract and check the HTML meta tags
+ # instructing Wgit not to index this document (save it to a Database). If
+ # the default extractors are removed, this method will always return false.
+ #
+ # @return [Boolean] True if this document shouldn't be saved to a Database,
+ # false otherwise.
+ def no_index?
+ [@meta_robots, @meta_wgit].include?('noindex')
  end

  protected
@@ -581,7 +595,7 @@ be relative"
  result = singleton ? result.content : result.map(&:content)
  end

- Wgit::Utils.sanitize(result)
+ result = Wgit::Utils.sanitize(result)
  result = yield(result, self, :document) if block_given?
  result
  end
@@ -608,7 +622,7 @@ be relative"
  default = singleton ? nil : []
  result = obj.fetch(key.to_s, default)

- Wgit::Utils.sanitize(result)
+ result = Wgit::Utils.sanitize(result)
  result = yield(result, obj, :object) if block_given?
  result
  end
@@ -628,13 +642,13 @@ be relative"
  @parser = init_nokogiri
  @score = 0.0

- Wgit::Utils.sanitize(@html, encode: encode)
+ @html = Wgit::Utils.sanitize(@html, encode:)

  # Dynamically run the init_*_from_html methods.
  Document.private_instance_methods(false).each do |method|
  if method.to_s.start_with?('init_') &&
- method.to_s.end_with?('_from_html')
- send(method) unless method == __method__
+ method.to_s.end_with?('_from_html') && method != __method__
+ send(method)
  end
  end
  end
@@ -649,13 +663,13 @@ be relative"
  @parser = init_nokogiri
  @score = obj.fetch('score', 0.0)

- Wgit::Utils.sanitize(@html, encode: encode)
+ @html = Wgit::Utils.sanitize(@html, encode:)

  # Dynamically run the init_*_from_object methods.
  Document.private_instance_methods(false).each do |method|
  if method.to_s.start_with?('init_') &&
- method.to_s.end_with?('_from_object')
- send(method, obj) unless method == __method__
+ method.to_s.end_with?('_from_object') && method != __method__
+ send(method, obj)
  end
  end
  end
@@ -668,7 +682,7 @@ be relative"
  def init_var(var, value)
  # instance_var_name starts with @, var_name doesn't.
  var = var.to_s
- var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
+ var_name = (var.start_with?('@') ? var[1..] : var).to_sym
  instance_var_name = "@#{var_name}".to_sym

  instance_variable_set(instance_var_name, value)
@@ -677,10 +691,10 @@ be relative"
  var_name
  end

- alias content html
- alias statistics stats
- alias internal_urls internal_links
- alias internal_absolute_urls internal_absolute_links
- alias external_urls external_links
+ alias_method :content, :html
+ alias_method :statistics, :stats
+ alias_method :internal_urls, :internal_links
+ alias_method :internal_absolute_urls, :internal_absolute_links
+ alias_method :external_urls, :external_links
  end
  end
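
The `to_h` changes above add a class-level `Wgit::Document.to_h_ignore_vars` Array: any instance var listed there (as a String starting with '@') is omitted from `#to_h` and therefore from the database model built by `Model.document`. A minimal usage sketch, assuming a hypothetical `:li_count` extractor and example URL:

    require 'wgit'

    # Hypothetical extractor whose value we don't want persisted.
    Wgit::Document.define_extractor(:li_count, '//li', singleton: false)
    Wgit::Document.to_h_ignore_vars << '@li_count'

    doc = Wgit::Document.new('http://example.com', '<html><li>a</li><li>b</li></html>')
    doc.li_count # => ["a", "b"] (still extracted as normal)
    doc.to_h     # => Hash with no li_count entry (@parser and @html are omitted by default)

Note also that the repeated `encode: encode` to `encode:` style swaps throughout this diff use Ruby 3.1's keyword/hash value omission shorthand, so the new code requires Ruby 3.1 or newer to parse.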
data/lib/wgit/document_extractors.rb CHANGED
@@ -2,6 +2,20 @@

  ### Default Document Extractors ###

+ # No index.
+ Wgit::Document.define_extractor(
+ :meta_robots,
+ '//meta[@name="robots"]/@content',
+ singleton: true,
+ text_content_only: true
+ )
+ Wgit::Document.define_extractor(
+ :meta_wgit,
+ '//meta[@name="wgit"]/@content',
+ singleton: true,
+ text_content_only: true
+ )
+
  # Base.
  Wgit::Document.define_extractor(
  :base,
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
  ) do |keywords, _source, type|
  if keywords && (type == :document)
  keywords = keywords.split(',')
- Wgit::Utils.sanitize(keywords)
+ keywords = Wgit::Utils.sanitize(keywords)
  end
  keywords
  end
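
The two new extractors above populate `@meta_robots` and `@meta_wgit`, which back the `Document#no_index?` predicate added in the document.rb diff. A small sketch, assuming an example URL and inline HTML:

    require 'wgit'

    html = '<html><head><meta name="robots" content="noindex"></head></html>'
    doc  = Wgit::Document.new('http://example.com', html)

    doc.meta_robots # => "noindex"
    doc.no_index?   # => true, so indexing code can choose not to save this page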
data/lib/wgit/dsl.rb CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
  raise DSL_ERROR__NO_START_URL if urls.empty?

  urls.map! { |url| Wgit::Url.parse(url) }
- crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+ crawler.crawl_urls(*urls, follow_redirects:, &block)
  end

  # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,9 +135,7 @@ the 'start' function".freeze
  raise DSL_ERROR__NO_START_URL if urls.empty?

  xpath = follow || :default
- opts = {
- follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
- }
+ opts = { follow: xpath, allow_paths:, disallow_paths: }

  urls.reduce([]) do |externals, url|
  externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
@@ -189,7 +187,7 @@ the 'start' function".freeze
  db = Wgit::Database.new(connection_string)
  indexer = Wgit::Indexer.new(db, crawler)

- indexer.index_www(max_sites: max_sites, max_data: max_data)
+ indexer.index_www(max_sites:, max_data:)
  end

  # Indexes a single website using `Wgit::Indexer#index_site` underneath.
@@ -226,8 +224,7 @@ the 'start' function".freeze
  indexer = Wgit::Indexer.new(db, crawler)
  xpath = follow || :default
  crawl_opts = {
- insert_externals: insert_externals, follow: xpath,
- allow_paths: allow_paths, disallow_paths: disallow_paths
+ insert_externals:, follow: xpath, allow_paths:, disallow_paths:
  }

  urls.reduce(0) do |total, url|
@@ -261,9 +258,11 @@ the 'start' function".freeze
  indexer = Wgit::Indexer.new(db, crawler)

  urls.map! { |url| Wgit::Url.parse(url) }
- indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+ indexer.index_urls(*urls, insert_externals:, &block)
  end

+ ### DATABASE METHODS ###
+
  # Performs a search of the database's indexed documents and pretty prints
  # the results in a search engine-esque format. See `Wgit::Database#search!`
  # and `Wgit::Document#search!` for details of how the search works.
@@ -285,7 +284,7 @@ the 'start' function".freeze
  # database containing only its matching `#text`.
  # @return [Array<Wgit::Document>] The search results with matching text.
  def search(
- query, connection_string: @dsl_conn_str, stream: STDOUT,
+ query, connection_string: @dsl_conn_str, stream: $stdout,
  case_sensitive: false, whole_sentence: true,
  limit: 10, skip: 0, sentence_limit: 80, &block
  )
@@ -294,15 +293,12 @@ the 'start' function".freeze

  results = db.search!(
  query,
- case_sensitive: case_sensitive,
- whole_sentence: whole_sentence,
- limit: limit,
- skip: skip,
- sentence_limit: sentence_limit,
- &block
+ case_sensitive:, whole_sentence:,
+ limit:, skip:,
+ sentence_limit:, &block
  )

- Wgit::Utils.printf_search_results(results, stream: stream)
+ Wgit::Utils.pprint_search_results(results, stream:)

  results
  end
@@ -317,9 +313,9 @@ the 'start' function".freeze
  db.clear_db
  end

- alias crawl_url crawl
- alias crawl_r crawl_site
- alias index_r index_site
- alias start_urls start
+ alias_method :crawl_url, :crawl
+ alias_method :crawl_r, :crawl_site
+ alias_method :index_r, :index_site
+ alias_method :start_urls, :start
  end
  end
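
In the DSL, `search` now writes to `$stdout` by default and prints via the renamed `Wgit::Utils.pprint_search_results` (formerly `printf_search_results`). A usage sketch, assuming a running MongoDB instance that already contains indexed documents (the connection string is hypothetical):

    require 'wgit'

    include Wgit::DSL

    results = search(
      'ruby web crawler',
      connection_string: 'mongodb://localhost:27017/wgit',
      limit: 5
    )
    # Pretty prints the top 5 matches to $stdout and returns them as an
    # Array of Wgit::Document.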