wgit 0.10.7 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wgit/document.rb CHANGED
@@ -3,7 +3,6 @@ require_relative 'utils'
3
3
  require_relative 'assertable'
4
4
  require 'nokogiri'
5
5
  require 'json'
6
- require 'set'
7
6
 
8
7
  module Wgit
9
8
  # Class modeling/serialising a HTML web document, although other MIME types
@@ -18,9 +17,9 @@ module Wgit
18
17
  include Assertable
19
18
 
20
19
  # Regex for the allowed var names when defining an extractor.
21
- REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/.freeze
20
+ REGEX_EXTRACTOR_NAME = /[a-z0-9_]+/
22
21
 
23
- # Set of text elements used to build Document#text.
22
+ # Set of text elements used to build the xpath for Document#text.
24
23
  @text_elements = Set.new(%i[
25
24
  a abbr address article aside b bdi bdo blockquote button caption cite
26
25
  code data dd del details dfn div dl dt em figcaption figure footer h1 h2
@@ -29,6 +28,13 @@ module Wgit
29
28
  summary sup td textarea th time u ul var wbr
30
29
  ])
31
30
 
31
+ # Instance vars to be ignored by Document#to_h and in turn Model.document.
32
+ @to_h_ignore_vars = [
33
+ '@parser', # Always ignore the Nokogiri object.
34
+ '@meta_robots', # Used by #no_index?, ignore.
35
+ '@meta_wgit' # Used by #no_index?, ignore.
36
+ ]
37
+
32
38
  # Set of Symbols representing the defined Document extractors.
33
39
  @extractors = Set.new
34
40
 
@@ -38,6 +44,12 @@ module Wgit
38
44
  # README.md for how to add to this Set dynamically.
39
45
  attr_reader :text_elements
40
46
 
47
+ # Array of instance vars to ignore when Document#to_h and in turn
48
+ # Model.document methods are called. Append your own defined extractor
49
+ # vars to omit them from the model (database object) when indexing.
50
+ # Each var should be a String starting with an '@' char e.g. "@data" etc.
51
+ attr_reader :to_h_ignore_vars
52
+
41
53
  # Set of Symbols representing the defined Document extractors. Is
42
54
  # read-only. Use Wgit::Document.define_extractor for a new extractor.
43
55
  attr_reader :extractors
@@ -76,9 +88,9 @@ module Wgit
76
88
  # false if the Document content is an image etc.
77
89
  def initialize(url_or_obj, html = '', encode: true)
78
90
  if url_or_obj.is_a?(String)
79
- init_from_strings(url_or_obj, html, encode: encode)
91
+ init_from_strings(url_or_obj, html, encode:)
80
92
  else
81
- init_from_object(url_or_obj, encode: encode)
93
+ init_from_object(url_or_obj, encode:)
82
94
  end
83
95
  end
84
96
 
@@ -89,9 +101,9 @@ module Wgit
89
101
  #
90
102
  # @return [String] An xpath String to obtain a webpage's text elements.
91
103
  def self.text_elements_xpath
92
- Wgit::Document.text_elements.each_with_index.reduce('') do |xpath, (el, i)|
104
+ @text_elements.each_with_index.reduce('') do |xpath, (el, i)|
93
105
  xpath += ' | ' unless i.zero?
94
- xpath += format('//%s/text()', el)
106
+ xpath + format('//%s/text()', el)
95
107
  end
96
108
  end
97
109
 
@@ -192,13 +204,27 @@ module Wgit
192
204
  Document.send(:remove_method, "init_#{var}_from_object")
193
205
 
194
206
  @extractors.delete(var.to_sym)
207
+
195
208
  true
196
209
  rescue NameError
197
210
  false
198
211
  end
199
212
 
213
+ # Removes all default and defined extractors by calling
214
+ # `Document.remove_extractor` underneath. See its documentation.
215
+ def self.remove_extractors
216
+ @extractors.each { |var| remove_extractor(var) }
217
+ end
218
+
200
219
  ### Document Instance Methods ###
201
220
 
221
+ # Overrides Object#inspect to shorten the printed output of a Document.
222
+ #
223
+ # @return [String] A short textual representation of this Document.
224
+ def inspect
225
+ "#<Wgit::Document url=\"#{@url}\" html_size=#{size}>"
226
+ end
227
+
202
228
  # Determines if both the url and html match. Use
203
229
  # doc.object_id == other.object_id for exact object comparison.
204
230
  #
@@ -227,10 +253,10 @@ module Wgit
227
253
  # Provide the `link:` parameter to get the correct base URL for that type
228
254
  # of link. For example, a link of `#top` would always return @url because
229
255
  # it applies to that page, not a different one. Query strings work in the
230
- # same way. Use this parameter if manually concatting Url's e.g.
256
+ # same way. Use this parameter if manually joining Url's e.g.
231
257
  #
232
258
  # relative_link = Wgit::Url.new('?q=hello')
233
- # absolute_link = doc.base_url(link: relative_link).concat(relative_link)
259
+ # absolute_link = doc.base_url(link: relative_link).join(relative_link)
234
260
  #
235
261
  # This is similar to how Wgit::Document#internal_absolute_links works.
236
262
  #
@@ -250,7 +276,7 @@ module Wgit
250
276
  be relative"
251
277
  end
252
278
 
253
- get_base = -> { @base.relative? ? @url.to_origin.concat(@base) : @base }
279
+ get_base = -> { @base.relative? ? @url.to_origin.join(@base) : @base }
254
280
 
255
281
  if link
256
282
  link = Wgit::Url.new(link)
@@ -274,11 +300,11 @@ be relative"
274
300
  # returned Hash.
275
301
  # @return [Hash] Containing self's instance vars.
276
302
  def to_h(include_html: false, include_score: true)
277
- ignore = include_html ? [] : ['@html']
303
+ ignore = Wgit::Document.to_h_ignore_vars.dup
304
+ ignore << '@html' unless include_html
278
305
  ignore << '@score' unless include_score
279
- ignore << '@parser' # Always ignore the Nokogiri object.
280
306
 
281
- Wgit::Utils.to_h(self, ignore: ignore)
307
+ Wgit::Utils.to_h(self, ignore:)
282
308
  end
283
309
 
284
310
  # Converts this Document's #to_h return value to a JSON String.
@@ -287,7 +313,7 @@ be relative"
287
313
  # returned JSON String.
288
314
  # @return [String] This Document represented as a JSON String.
289
315
  def to_json(include_html: false)
290
- h = to_h(include_html: include_html)
316
+ h = to_h(include_html:)
291
317
  JSON.generate(h)
292
318
  end
293
319
 
@@ -309,7 +335,7 @@ be relative"
309
335
  else
310
336
  next unless instance_variable_get(var).respond_to?(:length)
311
337
 
312
- hash[var[1..-1].to_sym] = instance_variable_get(var).send(:length)
338
+ hash[var[1..].to_sym] = instance_variable_get(var).send(:length)
313
339
  end
314
340
  end
315
341
 
@@ -417,7 +443,6 @@ be relative"
417
443
  end
418
444
  end
419
445
  .reject { |link| link.relative?(host: @url.to_origin) }
420
- .map(&:omit_trailing_slash)
421
446
 
422
447
  Wgit::Utils.sanitize(links)
423
448
  end
@@ -493,10 +518,7 @@ be relative"
493
518
  query, case_sensitive: false, whole_sentence: true, sentence_limit: 80
494
519
  )
495
520
  orig_text = @text
496
- @text = search(
497
- query, case_sensitive: case_sensitive,
498
- whole_sentence: whole_sentence, sentence_limit: sentence_limit
499
- )
521
+ @text = search(query, case_sensitive:, whole_sentence:, sentence_limit:)
500
522
 
501
523
  orig_text
502
524
  end
@@ -519,11 +541,17 @@ be relative"
519
541
  # @return [String, Object] The value found in the html or the default value
520
542
  # (singleton ? nil : []).
521
543
  def extract(xpath, singleton: true, text_content_only: true, &block)
522
- send(
523
- :extract_from_html, xpath,
524
- singleton: singleton, text_content_only: text_content_only,
525
- &block
526
- )
544
+ send(:extract_from_html, xpath, singleton:, text_content_only:, &block)
545
+ end
546
+
547
+ # Works with the default extractors to extract and check the HTML meta tags
548
+ # instructing Wgit not to index this document (save it to a Database). If
549
+ # the default extractors are removed, this method will always return false.
550
+ #
551
+ # @return [Boolean] True if this document shouldn't be saved to a Database,
552
+ # false otherwise.
553
+ def no_index?
554
+ [@meta_robots, @meta_wgit].include?('noindex')
527
555
  end
528
556
 
529
557
  protected
@@ -567,7 +595,7 @@ be relative"
567
595
  result = singleton ? result.content : result.map(&:content)
568
596
  end
569
597
 
570
- Wgit::Utils.sanitize(result)
598
+ result = Wgit::Utils.sanitize(result)
571
599
  result = yield(result, self, :document) if block_given?
572
600
  result
573
601
  end
@@ -594,7 +622,7 @@ be relative"
594
622
  default = singleton ? nil : []
595
623
  result = obj.fetch(key.to_s, default)
596
624
 
597
- Wgit::Utils.sanitize(result)
625
+ result = Wgit::Utils.sanitize(result)
598
626
  result = yield(result, obj, :object) if block_given?
599
627
  result
600
628
  end
@@ -614,13 +642,13 @@ be relative"
614
642
  @parser = init_nokogiri
615
643
  @score = 0.0
616
644
 
617
- Wgit::Utils.sanitize(@html, encode: encode)
645
+ @html = Wgit::Utils.sanitize(@html, encode:)
618
646
 
619
647
  # Dynamically run the init_*_from_html methods.
620
648
  Document.private_instance_methods(false).each do |method|
621
649
  if method.to_s.start_with?('init_') &&
622
- method.to_s.end_with?('_from_html')
623
- send(method) unless method == __method__
650
+ method.to_s.end_with?('_from_html') && method != __method__
651
+ send(method)
624
652
  end
625
653
  end
626
654
  end
@@ -635,13 +663,13 @@ be relative"
635
663
  @parser = init_nokogiri
636
664
  @score = obj.fetch('score', 0.0)
637
665
 
638
- Wgit::Utils.sanitize(@html, encode: encode)
666
+ @html = Wgit::Utils.sanitize(@html, encode:)
639
667
 
640
668
  # Dynamically run the init_*_from_object methods.
641
669
  Document.private_instance_methods(false).each do |method|
642
670
  if method.to_s.start_with?('init_') &&
643
- method.to_s.end_with?('_from_object')
644
- send(method, obj) unless method == __method__
671
+ method.to_s.end_with?('_from_object') && method != __method__
672
+ send(method, obj)
645
673
  end
646
674
  end
647
675
  end
@@ -654,7 +682,7 @@ be relative"
654
682
  def init_var(var, value)
655
683
  # instance_var_name starts with @, var_name doesn't.
656
684
  var = var.to_s
657
- var_name = (var.start_with?('@') ? var[1..-1] : var).to_sym
685
+ var_name = (var.start_with?('@') ? var[1..] : var).to_sym
658
686
  instance_var_name = "@#{var_name}".to_sym
659
687
 
660
688
  instance_variable_set(instance_var_name, value)
@@ -663,10 +691,10 @@ be relative"
663
691
  var_name
664
692
  end
665
693
 
666
- alias content html
667
- alias statistics stats
668
- alias internal_urls internal_links
669
- alias internal_absolute_urls internal_absolute_links
670
- alias external_urls external_links
694
+ alias_method :content, :html
695
+ alias_method :statistics, :stats
696
+ alias_method :internal_urls, :internal_links
697
+ alias_method :internal_absolute_urls, :internal_absolute_links
698
+ alias_method :external_urls, :external_links
671
699
  end
672
700
  end
@@ -2,6 +2,20 @@
2
2
 
3
3
  ### Default Document Extractors ###
4
4
 
5
+ # No index.
6
+ Wgit::Document.define_extractor(
7
+ :meta_robots,
8
+ '//meta[@name="robots"]/@content',
9
+ singleton: true,
10
+ text_content_only: true
11
+ )
12
+ Wgit::Document.define_extractor(
13
+ :meta_wgit,
14
+ '//meta[@name="wgit"]/@content',
15
+ singleton: true,
16
+ text_content_only: true
17
+ )
18
+
5
19
  # Base.
6
20
  Wgit::Document.define_extractor(
7
21
  :base,
@@ -45,7 +59,7 @@ Wgit::Document.define_extractor(
45
59
  ) do |keywords, _source, type|
46
60
  if keywords && (type == :document)
47
61
  keywords = keywords.split(',')
48
- Wgit::Utils.sanitize(keywords)
62
+ keywords = Wgit::Utils.sanitize(keywords)
49
63
  end
50
64
  keywords
51
65
  end
data/lib/wgit/dsl.rb CHANGED
@@ -101,7 +101,7 @@ the 'start' function".freeze
101
101
  raise DSL_ERROR__NO_START_URL if urls.empty?
102
102
 
103
103
  urls.map! { |url| Wgit::Url.parse(url) }
104
- crawler.crawl_urls(*urls, follow_redirects: follow_redirects, &block)
104
+ crawler.crawl_urls(*urls, follow_redirects:, &block)
105
105
  end
106
106
 
107
107
  # Crawls an entire site using `Wgit::Crawler#crawl_site` underneath. If no
@@ -135,9 +135,7 @@ the 'start' function".freeze
135
135
  raise DSL_ERROR__NO_START_URL if urls.empty?
136
136
 
137
137
  xpath = follow || :default
138
- opts = {
139
- follow: xpath, allow_paths: allow_paths, disallow_paths: disallow_paths
140
- }
138
+ opts = { follow: xpath, allow_paths:, disallow_paths: }
141
139
 
142
140
  urls.reduce([]) do |externals, url|
143
141
  externals + crawler.crawl_site(Wgit::Url.parse(url), **opts, &block)
@@ -189,7 +187,7 @@ the 'start' function".freeze
189
187
  db = Wgit::Database.new(connection_string)
190
188
  indexer = Wgit::Indexer.new(db, crawler)
191
189
 
192
- indexer.index_www(max_sites: max_sites, max_data: max_data)
190
+ indexer.index_www(max_sites:, max_data:)
193
191
  end
194
192
 
195
193
  # Indexes a single website using `Wgit::Indexer#index_site` underneath.
@@ -226,8 +224,7 @@ the 'start' function".freeze
226
224
  indexer = Wgit::Indexer.new(db, crawler)
227
225
  xpath = follow || :default
228
226
  crawl_opts = {
229
- insert_externals: insert_externals, follow: xpath,
230
- allow_paths: allow_paths, disallow_paths: disallow_paths
227
+ insert_externals:, follow: xpath, allow_paths:, disallow_paths:
231
228
  }
232
229
 
233
230
  urls.reduce(0) do |total, url|
@@ -261,9 +258,11 @@ the 'start' function".freeze
261
258
  indexer = Wgit::Indexer.new(db, crawler)
262
259
 
263
260
  urls.map! { |url| Wgit::Url.parse(url) }
264
- indexer.index_urls(*urls, insert_externals: insert_externals, &block)
261
+ indexer.index_urls(*urls, insert_externals:, &block)
265
262
  end
266
263
 
264
+ ### DATABASE METHODS ###
265
+
267
266
  # Performs a search of the database's indexed documents and pretty prints
268
267
  # the results in a search engine-esque format. See `Wgit::Database#search!`
269
268
  # and `Wgit::Document#search!` for details of how the search works.
@@ -285,7 +284,7 @@ the 'start' function".freeze
285
284
  # database containing only its matching `#text`.
286
285
  # @return [Array<Wgit::Document>] The search results with matching text.
287
286
  def search(
288
- query, connection_string: @dsl_conn_str, stream: STDOUT,
287
+ query, connection_string: @dsl_conn_str, stream: $stdout,
289
288
  case_sensitive: false, whole_sentence: true,
290
289
  limit: 10, skip: 0, sentence_limit: 80, &block
291
290
  )
@@ -294,15 +293,12 @@ the 'start' function".freeze
294
293
 
295
294
  results = db.search!(
296
295
  query,
297
- case_sensitive: case_sensitive,
298
- whole_sentence: whole_sentence,
299
- limit: limit,
300
- skip: skip,
301
- sentence_limit: sentence_limit,
302
- &block
296
+ case_sensitive:, whole_sentence:,
297
+ limit:, skip:,
298
+ sentence_limit:, &block
303
299
  )
304
300
 
305
- Wgit::Utils.printf_search_results(results, stream: stream)
301
+ Wgit::Utils.pprint_search_results(results, stream:)
306
302
 
307
303
  results
308
304
  end
@@ -317,9 +313,9 @@ the 'start' function".freeze
317
313
  db.clear_db
318
314
  end
319
315
 
320
- alias crawl_url crawl
321
- alias crawl_r crawl_site
322
- alias index_r index_site
323
- alias start_urls start
316
+ alias_method :crawl_url, :crawl
317
+ alias_method :crawl_r, :crawl_site
318
+ alias_method :index_r, :index_site
319
+ alias_method :start_urls, :start
324
320
  end
325
321
  end