wgit 0.10.8 → 0.11.0

data/lib/wgit/document.rb CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
 require_relative 'assertable'
 require 'nokogiri'
 require 'json'
-require 'set'
 
 module Wgit
   # Class modeling/serialising a HTML web document, although other MIME types
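Note: dropping `require 'set'` presumably relies on `Set` being autoloaded (Ruby 3.2+) or already loaded elsewhere in the gem; this is an assumption about the motivation rather than a documented reason:

```ruby
# On Ruby >= 3.2 the Set constant autoloads on first use, no require needed.
Set.new(%i[a b c]).include?(:a) # => true
```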
@@ -18,9 +17,9 @@ module Wgit
     include Assertable
 
     # Regex for the allowed var names when defining an extractor.
-    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
+    REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
 
-    # Set of text elements used to build Document#text.
+    # Set of text elements used to build the xpath for Document#text.
     @text_elements = Set.new(%i[
       a abbr address article aside b bdi bdo blockquote button caption cite
       code data dd del details dfn div dl dt em figcaption figure footer h1 h2
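Note: the dropped `.freeze` is redundant on modern Ruby, since regexp literals have been frozen by default since Ruby 3.0:

```ruby
/[a-z0-9_]+/.frozen? # => true on Ruby 3+, so an explicit .freeze is a no-op
```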
@@ -29,6 +28,13 @@ module Wgit
       summary sup td textarea th time u ul var wbr
     ])
 
+    # Instance vars to be ignored by Document#to_h and in turn Model.document.
+    @to_h_ignore_vars = [
+      '@parser',      # Always ignore the Nokogiri object.
+      '@meta_robots', # Used by #no_index?, ignore.
+      '@meta_wgit'    # Used by #no_index?, ignore.
+    ]
+
     # Set of Symbols representing the defined Document extractors.
     @extractors = Set.new
 
@@ -38,6 +44,12 @@ module Wgit
       # README.md for how to add to this Set dynamically.
       attr_reader :text_elements
 
+      # Array of instance vars to ignore when Document#to_h and in turn
+      # Model.document methods are called. Append your own defined extractor
+      # vars to omit them from the model (database object) when indexing.
+      # Each var should be a String starting with an '@' char e.g. "@data" etc.
+      attr_reader :to_h_ignore_vars
+
      # Set of Symbols representing the defined Document extractors. Is
      # read-only. Use Wgit::Document.define_extractor for a new extractor.
      attr_reader :extractors
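A minimal sketch of the new `to_h_ignore_vars` accessor, assuming a user-defined (hypothetical) `:table_of_contents` extractor whose value shouldn't be persisted when indexing:

```ruby
require 'wgit'

# Hypothetical extractor whose value we don't want in the database model.
Wgit::Document.define_extractor(:table_of_contents, '//nav[@id="toc"]')
Wgit::Document.to_h_ignore_vars << '@table_of_contents'

doc = Wgit::Document.new(Wgit::Url.new('https://example.com'), '<html></html>')
doc.to_h # Omits table_of_contents, along with @parser, @meta_robots and @meta_wgit.
```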
@@ -76,9 +88,9 @@ module Wgit
     # false if the Document content is an image etc.
     def initialize(url_or_obj, html = '', encode: true)
       if url_or_obj.is_a?(String)
-        init_from_strings(url_or_obj, html, encode: encode)
+        init_from_strings(url_or_obj, html, encode:)
       else
-        init_from_object(url_or_obj, encode: encode)
+        init_from_object(url_or_obj, encode:)
       end
     end
 
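Note: `encode:` (and the other bare keywords used throughout this release) is the Ruby 3.1+ shorthand for passing a keyword argument whose value is a local variable of the same name. A standalone illustration, with hypothetical names:

```ruby
def resize(width:, height:)
  "#{width}x#{height}"
end

width  = 1920
height = 1080
resize(width: width, height: height) # => "1920x1080" (long form)
resize(width:, height:)              # => "1920x1080" (shorthand, Ruby >= 3.1)
```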
@@ -89,9 +101,9 @@ module Wgit
     #
     # @return [String] An xpath String to obtain a webpage's text elements.
     def self.text_elements_xpath
-      Wgit::Document.text_elements.each_with_index.reduce('') do |xpath, (el, i)|
+      @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
         xpath += ' | ' unless i.zero?
-        xpath += format('//%s/text()', el)
+        xpath + format('//%s/text()', el)
       end
     end
 
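Note: inside `#reduce` the block's return value becomes the accumulator for the next element, so the final expression `xpath + format(...)` is what builds the string; the earlier `+=` only updates the local copy for that iteration. The same pattern in isolation:

```ruby
%i[p h1].each_with_index.reduce('') do |xpath, (el, i)|
  xpath += ' | ' unless i.zero?     # Local update, used on the next line.
  xpath + format('//%s/text()', el) # Block return value => next accumulator.
end
# => "//p/text() | //h1/text()"
```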
@@ -210,7 +222,7 @@
     #
     # @return [String] A short textual representation of this Document.
     def inspect
-      "#<Wgit::Document url=\"#{@url}\" html=#{size} bytes>"
+      "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
     end
 
     # Determines if both the url and html match. Use
@@ -241,10 +253,10 @@
     # Provide the `link:` parameter to get the correct base URL for that type
     # of link. For example, a link of `#top` would always return @url because
     # it applies to that page, not a different one. Query strings work in the
-    # same way. Use this parameter if manually concatting Url's e.g.
+    # same way. Use this parameter if manually joining Url's e.g.
     #
     #   relative_link = Wgit::Url.new('?q=hello')
-    #   absolute_link = doc.base_url(link: relative_link).concat(relative_link)
+    #   absolute_link = doc.base_url(link: relative_link).join(relative_link)
     #
     # This is similar to how Wgit::Document#internal_absolute_links works.
     #
@@ -264,7 +276,7 @@
 be relative"
       end
 
-      get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
+      get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
 
       if link
         link = Wgit::Url.new(link)
@@ -288,11 +300,11 @@ be relative"
     #   returned Hash.
     # @return [Hash] Containing self's instance vars.
     def to_h(include_html: false, include_score: true)
-      ignore = include_html ? [] : ['@html']
+      ignore = Wgit::Document.to_h_ignore_vars.dup
+      ignore << '@html' unless include_html
       ignore << '@score' unless include_score
-      ignore << '@parser' # Always ignore the Nokogiri object.
 
-      Wgit::Utils.to_h(self, ignore: ignore)
+      Wgit::Utils.to_h(self, ignore:)
     end
 
     # Converts this Document's #to_h return value to a JSON String.
@@ -301,7 +313,7 @@ be relative"
     #   returned JSON String.
     # @return [String] This Document represented as a JSON String.
     def to_json(include_html: false)
-      h = to_h(include_html: include_html)
+      h = to_h(include_html:)
       JSON.generate(h)
     end
 
@@ -323,7 +335,7 @@ be relative"
         else
          next unless instance_variable_get(var).respond_to?(:length)
 
-          hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
+          hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
        end
      end
 
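Note: `var[1..]` (an endless range, available since Ruby 2.6) is equivalent to `var[1..-1]` here; both drop the leading `@`:

```ruby
'@title'[1..-1] # => "title"
'@title'[1..]   # => "title" (endless range, same result)
```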
@@ -431,7 +443,6 @@ be relative"
         end
       end
         .reject { |link| link.relative?(host: @url.to_origin) }
-        .map(&:omit_trailing_slash)
 
       Wgit::Utils.sanitize(links)
     end
@@ -507,10 +518,7 @@ be relative"
       query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
     )
       orig_text = @text
-      @text = search(
-        query, case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence, sentence_limit: sentence_limit
-      )
+      @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
 
       orig_text
     end
@@ -533,11 +541,17 @@ be relative"
     # @return [String, Object] The value found in the html or the default value
     #   (singleton ? nil : []).
     def extract(xpath, singleton: true, text_content_only: true, &block)
-      send(
-        :extract_from_html, xpath,
-        singleton: singleton, text_content_only: text_content_only,
-        &block
-      )
+      send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
+    end
+
+    # Works with the default extractors to extract and check the HTML meta tags
+    # instructing Wgit not to index this document (save it to a Database). If
+    # the default extractors are removed, this method will always return false.
+    #
+    # @return [Boolean] True if this document shouldn't be saved to a Database,
+    #   false otherwise.
+    def no_index?
+      [@meta_robots, @meta_wgit].include?('noindex')
     end
 
     protected
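A small sketch of the new `#no_index?` in use, assuming the default `meta_robots`/`meta_wgit` extractors added in this release are still defined:

```ruby
require 'wgit'

html = '<html><head><meta name="robots" content="noindex"></head></html>'
doc  = Wgit::Document.new(Wgit::Url.new('https://example.com'), html)

doc.no_index? # => true, so an indexer can skip saving this page to a Database.
```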
@@ -581,7 +595,7 @@ be relative"
         result = singleton ? result.content : result.map(&:content)
       end
 
-      Wgit::Utils.sanitize(result)
+      result = Wgit::Utils.sanitize(result)
       result = yield(result, self, :document) if block_given?
       result
     end
@@ -608,7 +622,7 @@ be relative"
       default = singleton ? nil : []
       result = obj.fetch(key.to_s, default)
 
-      Wgit::Utils.sanitize(result)
+      result = Wgit::Utils.sanitize(result)
       result = yield(result, obj, :object) if block_given?
       result
     end
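Note: the reassignment pattern (here, in the init methods and in the keywords extractor below) suggests `Wgit::Utils.sanitize` returns the sanitized value rather than mutating its argument in place; this is an assumption drawn from the diff rather than from the Utils source. The calling pattern is simply:

```ruby
require 'wgit'

text = '  Hello World  '
text = Wgit::Utils.sanitize(text) # Capture the returned, sanitized value.
```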
@@ -628,13 +642,13 @@ be relative"
       @parser = init_nokogiri
       @score = 0.0
 
-      Wgit::Utils.sanitize(@html, encode: encode)
+      @html = Wgit::Utils.sanitize(@html, encode:)
 
       # Dynamically run the init_*_from_html methods.
       Document.private_instance_methods(false).each do |method|
         if method.to_s.start_with?('init_') &&
-           method.to_s.end_with?('_from_html')
-          send(method) unless method == __method__
+           method.to_s.end_with?('_from_html') && method != __method__
+          send(method)
         end
       end
     end
@@ -649,13 +663,13 @@ be relative"
       @parser = init_nokogiri
       @score = obj.fetch('score', 0.0)
 
-      Wgit::Utils.sanitize(@html, encode: encode)
+      @html = Wgit::Utils.sanitize(@html, encode:)
 
       # Dynamically run the init_*_from_object methods.
       Document.private_instance_methods(false).each do |method|
         if method.to_s.start_with?('init_') &&
-           method.to_s.end_with?('_from_object')
-          send(method, obj) unless method == __method__
+           method.to_s.end_with?('_from_object') && method != __method__
+          send(method, obj)
        end
      end
    end
@@ -668,7 +682,7 @@ be relative"
     def init_var(var, value)
       # instance_var_name starts with @, var_name doesn't.
       var = var.to_s
-      var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
+      var_name = (var.start_with?('@') ? var[1..] : var).to_sym
       instance_var_name = "@#{var_name}".to_sym
 
       instance_variable_set(instance_var_name, value)
@@ -677,10 +691,10 @@ be relative"
       var_name
     end
 
-    alias content html
-    alias statistics stats
-    alias internal_urls internal_links
-    alias internal_absolute_urls internal_absolute_links
-    alias external_urls external_links
+    alias_method :content, :html
+    alias_method :statistics, :stats
+    alias_method :internal_urls, :internal_links
+    alias_method :internal_absolute_urls, :internal_absolute_links
+    alias_method :external_urls, :external_links
   end
 end
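Note: the switch from the `alias` keyword to `alias_method` (also applied in the DSL below) is largely stylistic; `alias_method` is an ordinary method call that takes symbols or strings, so it also works with dynamically built names. A standalone sketch:

```ruby
class Page
  def html = '<html></html>' # Endless method definition (Ruby 3+).

  alias_method :content, :html
end

Page.new.content # => "<html></html>"
```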
data/lib/wgit/document_extractors.rb CHANGED
@@ -2,6 +2,20 @@
 
 ### Default Document Extractors ###
 
+# No index.
+Wgit::Document.define_extractor(
+  :meta_robots,
+  '//meta[@name="robots"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+Wgit::Document.define_extractor(
+  :meta_wgit,
+  '//meta[@name="wgit"]/@content',
+  singleton: true,
+  text_content_only: true
+)
+
 # Base.
 Wgit::Document.define_extractor(
   :base,
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
 ) do |keywords, _source, type|
   if keywords && (type == :document)
     keywords = keywords.split(',')
-    Wgit::Utils.sanitize(keywords)
+    keywords = Wgit::Utils.sanitize(keywords)
   end
   keywords
 end
data/lib/wgit/dsl.rb CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
       urls.map! { |url| Wgit::Url.parse(url) }
-      crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
+      crawler.crawl_urls(*urls, follow_redirects:, &block)
     end
 
     # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,9 +135,7 @@ the 'start' function".freeze
       raise DSL_ERROR__NO_START_URL if urls.empty?
 
       xpath = follow || :default
-      opts = {
-        follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
-      }
+      opts = { follow: xpath, allow_paths:, disallow_paths: }
 
       urls.reduce([]) do |externals, url|
         externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
@@ -189,7 +187,7 @@ the 'start' function".freeze
       db = Wgit::Database.new(connection_string)
       indexer = Wgit::Indexer.new(db, crawler)
 
-      indexer.index_www(max_sites: max_sites, max_data: max_data)
+      indexer.index_www(max_sites:, max_data:)
     end
 
     # Indexes a single website using `Wgit::Indexer#index_site` underneath.
@@ -226,8 +224,7 @@ the 'start' function".freeze
       indexer = Wgit::Indexer.new(db, crawler)
       xpath = follow || :default
       crawl_opts = {
-        insert_externals: insert_externals, follow: xpath,
-        allow_paths: allow_paths, disallow_paths: disallow_paths
+        insert_externals:, follow: xpath, allow_paths:, disallow_paths:
       }
 
       urls.reduce(0) do |total, url|
@@ -261,9 +258,11 @@ the 'start' function".freeze
       indexer = Wgit::Indexer.new(db, crawler)
 
       urls.map! { |url| Wgit::Url.parse(url) }
-      indexer.index_urls(*urls, insert_externals: insert_externals, &block)
+      indexer.index_urls(*urls, insert_externals:, &block)
     end
 
+    ### DATABASE METHODS ###
+
     # Performs a search of the database's indexed documents and pretty prints
     # the results in a search engine-esque format. See `Wgit::Database#search!`
     # and `Wgit::Document#search!` for details of how the search works.
@@ -285,7 +284,7 @@ the 'start' function".freeze
     #   database containing only its matching `#text`.
     # @return [Array<Wgit::Document>] The search results with matching text.
     def search(
-      query, connection_string: @dsl_conn_str, stream: STDOUT,
+      query, connection_string: @dsl_conn_str, stream: $stdout,
       case_sensitive: false, whole_sentence: true,
       limit: 10, skip: 0, sentence_limit: 80, &block
     )
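Note: `$stdout` is generally preferred over the `STDOUT` constant because the global can be reassigned (handy for capturing output in tests), while the constant always refers to the process's original stream. A minimal sketch:

```ruby
require 'stringio'

captured = StringIO.new
$stdout  = captured # Anything written via $stdout (e.g. puts) now goes here.
puts 'hello'
$stdout  = STDOUT   # Restore the original stream.

captured.string # => "hello\n"
```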
@@ -294,15 +293,12 @@ the 'start' function".freeze
 
       results = db.search!(
         query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        limit: limit,
-        skip: skip,
-        sentence_limit: sentence_limit,
-        &block
+        case_sensitive:, whole_sentence:,
+        limit:, skip:,
+        sentence_limit:, &block
       )
 
-      Wgit::Utils.printf_search_results(results, stream: stream)
+      Wgit::Utils.pprint_search_results(results, stream:)
 
       results
     end
@@ -317,9 +313,9 @@ the 'start' function".freeze
       db.clear_db
     end
 
-    alias crawl_url crawl
-    alias crawl_r crawl_site
-    alias index_r index_site
-    alias start_urls start
+    alias_method :crawl_url, :crawl
+    alias_method :crawl_r, :crawl_site
+    alias_method :index_r, :index_site
+    alias_method :start_urls, :start
   end
 end