wgit 0.10.3 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 720cf6b84698fbd54c109319f05557ee2e29bdbda59ec23278422dc5ddc77f2f
4
- data.tar.gz: d4304bce849b404b9d2d7faa4d9a3f7969784f649a83152605b51b2e0bd21ac4
3
+ metadata.gz: 4598dcfc047ce3915ba5a871837be5efc54201d61b4967cf53070bec2af4dd52
4
+ data.tar.gz: 604010011024af6f2d4dfcc87e6c4c1d73f8e4811938281119fccb79792818c1
5
5
  SHA512:
6
- metadata.gz: a8743ec17b3caaa9b6c5dd5c9b9b18902561927dfd992003f25db88334cc2b4364a4c6ce2dea34629f801d5d7dbe9761b15e7f2f034e00ba526db36ce828dcaf
7
- data.tar.gz: 00cf954a86c8b0d96f2e694359c1c75e3193e0e6d146ffba19b3857bef4c15ca93d25f1310ebebf815de8da93ede1b97e325dc54aade699219b9ab35f2976e49
6
+ metadata.gz: 44b098e2a97191801787386e9d2060dcdeacc625c3453976679fc276a73b2bf0614713764a55f7074073018e898f2e43dc1a7f4f803339a86158052f59dcabcb
7
+ data.tar.gz: 8645c7095bb14590cf83c21905c9f5ed524e1047254e6526b8fe46a53f3989395472300d27fb65f899951a5f4b80ee9928accd23164b10e1a834975bf045db47
data/CHANGELOG.md CHANGED
@@ -9,6 +9,33 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.10.6
13
+ ### Added
14
+ - `Wgit::DSL` method `#crawl_url` (aliased to `#crawl`).
15
+ ### Changed/Removed
16
+ - Added a `&block` param to `Wgit::Document#extract`, which gets passed to `#extract_from_html`.
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
21
+ ## v0.10.5
22
+ ### Added
23
+ - `Database#last_result` getter method to return the most recent raw mongo result.
24
+ ### Changed/Removed
25
+ - ...
26
+ ### Fixed
27
+ - ...
28
+ ---
29
+
30
+ ## v0.10.4
31
+ ### Added
32
+ - `Database#search_text` method which returns a Hash of `url => text_results` instead of `Wgit::Documents` (like `#search`).
33
+ ### Changed/Removed
34
+ - ...
35
+ ### Fixed
36
+ - ...
37
+ ---
38
+
12
39
  ## v0.10.3
13
40
  ### Added
14
41
  - ...
@@ -45,6 +45,9 @@ module Wgit
45
45
  # A custom setter method is also provided for changing the search logic.
46
46
  attr_reader :text_index
47
47
 
48
+ # The raw MongoDB result of the most recent operation.
49
+ attr_reader :last_result
50
+
48
51
  # Initializes a connected database client using the provided
49
52
  # connection_string or ENV['WGIT_CONNECTION_STRING'].
50
53
  #
@@ -185,6 +188,8 @@ module Wgit
185
188
  result = @client[collection].replace_one(query, data_hash, upsert: true)
186
189
 
187
190
  result.matched_count.zero?
191
+ ensure
192
+ @last_result = result
188
193
  end
189
194
 
190
195
  ### Retrieve Data ###
@@ -294,16 +299,12 @@ module Wgit
294
299
  results = retrieve(DOCUMENTS_COLLECTION, query,
295
300
  sort: sort_proj, projection: sort_proj,
296
301
  limit: limit, skip: skip)
297
- return [] if results.count < 1 # respond_to? :empty? == false
298
302
 
299
- # results.respond_to? :map! is false so we use map and overwrite the var.
300
- results = results.map do |mongo_doc|
303
+ results.map do |mongo_doc|
301
304
  doc = Wgit::Document.new(mongo_doc)
302
305
  yield(doc) if block_given?
303
306
  doc
304
307
  end
305
-
306
- results
307
308
  end
308
309
 
309
310
  # Searches the database's Documents for the given query and then searches
@@ -348,6 +349,58 @@ module Wgit
348
349
  results
349
350
  end
350
351
 
352
+ # Searches the database's Documents for the given query and then searches
353
+ # each result in turn using `doc.search`. Instead of an Array of Documents,
354
+ # this method returns a Hash of the docs url => search_results creating a
355
+ # search engine like result set for quick access to text matches.
356
+ #
357
+ # @param query [String] The text query to search with.
358
+ # @param case_sensitive [Boolean] Whether character case must match.
359
+ # @param whole_sentence [Boolean] Whether the query must match as a whole
360
+ # sentence; if false, each word is searched for separately.
361
+ # @param limit [Integer] The max number of results to return.
362
+ # @param skip [Integer] The number of results to skip.
363
+ # @param sentence_limit [Integer] The max length of each search result
364
+ # sentence.
365
+ # @param top_result_only [Boolean] Whether to return all of the documents
366
+ # search results or just the top (most relevant) result.
367
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
368
+ # DB.
369
+ # @return [Hash<String, String | Array<String>>] The search results obtained
370
+ # from the DB having mapped the docs url => search_results. The format of
371
+ # search_results depends on the value of `top_result_only`.
372
+ def search_text(
373
+ query, case_sensitive: false, whole_sentence: true,
374
+ limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
375
+ )
376
+ results = search(
377
+ query,
378
+ case_sensitive: case_sensitive,
379
+ whole_sentence: whole_sentence,
380
+ limit: limit,
381
+ skip: skip
382
+ )
383
+
384
+ results
385
+ .map do |doc|
386
+ yield(doc) if block_given?
387
+
388
+ results = doc.search(
389
+ query,
390
+ case_sensitive: case_sensitive,
391
+ whole_sentence: whole_sentence,
392
+ sentence_limit: sentence_limit
393
+ )
394
+
395
+ # Only return result if its text has a match - compact is called below.
396
+ next nil if results.empty?
397
+
398
+ [doc.url, (top_result_only ? results.first : results)]
399
+ end
400
+ .compact
401
+ .to_h
402
+ end
403
+
351
404
  # Returns statistics about the database.
352
405
  #
353
406
  # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
@@ -454,21 +507,30 @@ module Wgit
454
507
  # 0 or 1 because urls are unique.
455
508
  def delete(obj)
456
509
  collection, query = get_type_info(obj)
457
- @client[collection].delete_one(query).n
510
+ result = @client[collection].delete_one(query)
511
+ result.n
512
+ ensure
513
+ @last_result = result
458
514
  end
459
515
 
460
516
  # Deletes everything in the urls collection.
461
517
  #
462
518
  # @return [Integer] The number of deleted records.
463
519
  def clear_urls
464
- @client[URLS_COLLECTION].delete_many({}).n
520
+ result = @client[URLS_COLLECTION].delete_many({})
521
+ result.n
522
+ ensure
523
+ @last_result = result
465
524
  end
466
525
 
467
526
  # Deletes everything in the documents collection.
468
527
  #
469
528
  # @return [Integer] The number of deleted records.
470
529
  def clear_docs
471
- @client[DOCUMENTS_COLLECTION].delete_many({}).n
530
+ result = @client[DOCUMENTS_COLLECTION].delete_many({})
531
+ result.n
532
+ ensure
533
+ @last_result = result
472
534
  end
473
535
 
474
536
  # Deletes everything in the urls and documents collections. This will nuke
@@ -536,6 +598,8 @@ module Wgit
536
598
  else
537
599
  raise 'data must be a Hash or an Array of Hashes'
538
600
  end
601
+ ensure
602
+ @last_result = result
539
603
  end
540
604
 
541
605
  # Return if the write to the DB succeeded or not.
@@ -572,8 +636,8 @@ module Wgit
572
636
  sort: {}, projection: {},
573
637
  limit: 0, skip: 0)
574
638
  assert_type(query, Hash)
575
- @client[collection.to_sym].find(query).projection(projection)
576
- .skip(skip).limit(limit).sort(sort)
639
+ @last_result = @client[collection.to_sym].find(query).projection(projection)
640
+ .skip(skip).limit(limit).sort(sort)
577
641
  end
578
642
 
579
643
  # Mutate/update one or more Url or Document records in the DB.
@@ -593,6 +657,8 @@ module Wgit
593
657
  raise 'DB write(s) (update) failed' unless write_succeeded?(result)
594
658
 
595
659
  result.n
660
+ ensure
661
+ @last_result = result
596
662
  end
597
663
 
598
664
  alias num_objects num_records
data/lib/wgit/document.rb CHANGED
@@ -89,24 +89,18 @@ module Wgit
89
89
  #
90
90
  # @return [String] An xpath String to obtain a webpage's text elements.
91
91
  def self.text_elements_xpath
92
- xpath = ''
93
- return xpath if Wgit::Document.text_elements.empty?
94
-
95
- el_xpath = '//%s/text()'
96
- Wgit::Document.text_elements.each_with_index do |el, i|
97
- xpath += ' | ' unless i.zero?
98
- xpath += format(el_xpath, el)
92
+ Wgit::Document.text_elements.each_with_index.reduce("") do |xpath, (el, i)|
93
+ xpath += " | " unless i.zero?
94
+ xpath += format("//%s/text()", el)
99
95
  end
100
-
101
- xpath
102
96
  end
103
97
 
104
98
  # Defines a content extractor, which extracts HTML elements/content
105
99
  # into instance variables upon Document initialization. See the default
106
100
  # extractors defined in 'document_extractors.rb' as examples. Defining an
107
101
  # extractor means that every subsequently crawled/initialized document
108
- # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
- # content extraction.
102
+ # will attempt to extract the xpath's content. Use `#extract` for a one off
103
+ # content extraction on any document.
110
104
  #
111
105
  # Note that defined extractors work for both Documents initialized from
112
106
  # HTML (via Wgit::Crawler methods) and from database objects.
@@ -115,9 +109,9 @@ module Wgit
115
109
  #
116
110
  # When initialising from HTML, a singleton value of true will only
117
111
  # ever return the first result found; otherwise all the results are
118
- # returned in an Array. When initialising from a database object, the value
119
- # is taken as is and singleton is only used to define the default empty
120
- # value. If a value cannot be found (in either the HTML or database
112
+ # returned in an Enumerable. When initialising from a database object, the
113
+ # value is taken as is and singleton is only used to define the default
114
+ # empty value. If a value cannot be found (in either the HTML or database
121
115
  # object), then a default will be used. The default value is:
122
116
  # `singleton ? nil : []`.
123
117
  #
@@ -134,12 +128,14 @@ module Wgit
134
128
  # @param opts [Hash] The options to define an extractor with. The
135
129
  # options are only used when initializing from HTML, not the database.
136
130
  # @option opts [Boolean] :singleton The singleton option determines
137
- # whether or not the result(s) should be in an Array. If multiple
131
+ # whether or not the result(s) should be in an Enumerable. If multiple
138
132
  # results are found and singleton is true then the first result will be
139
133
  # used. Defaults to true.
140
134
  # @option opts [Boolean] :text_content_only The text_content_only option
141
- # if true will use the text content of the Nokogiri result object,
142
- # otherwise the Nokogiri object itself is returned. Defaults to true.
135
+ # if true will use the text #content of the Nokogiri result object,
136
+ # otherwise the Nokogiri object itself is returned. The type of Nokogiri
137
+ # object returned depends on the given xpath query. See the Nokogiri
138
+ # documentation for more information. Defaults to true.
143
139
  # @yield The block is executed when a Wgit::Document is initialized,
144
140
  # regardless of the source. Use it (optionally) to process the result
145
141
  # value.
@@ -453,7 +449,7 @@ be relative"
453
449
 
454
450
  if query.is_a?(Regexp)
455
451
  regex = query
456
- else # respond_to? #to_s == true
452
+ else # query.respond_to? :to_s == true
457
453
  query = query.to_s
458
454
  query = query.gsub(' ', '|') unless whole_sentence
459
455
  regex = Regexp.new(query, !case_sensitive)
@@ -509,16 +505,24 @@ be relative"
509
505
  # parameter.
510
506
  #
511
507
  # @param xpath [String, #call] Used to find the value/object in @html.
512
- # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
- # Object) : results (Array).
508
+ # @param singleton [Boolean] singleton ? results.first (single Object) :
509
+ # results (Enumerable).
514
510
  # @param text_content_only [Boolean] text_content_only ? result.content
515
511
  # (String) : result (Nokogiri Object).
512
+ # @yield (Optionally) Pass a block to read/write the result value before
513
+ # it's returned.
514
+ # @yieldparam value [Object] The result value to be returned.
515
+ # @yieldparam source [Wgit::Document, Object] This Document instance.
516
+ # @yieldparam type [Symbol] The `source` type, which is `:document`.
517
+ # @yieldreturn [Object] The return value of the block gets returned. Return
518
+ # the block's `value` param unchanged if you simply want to inspect it.
516
519
  # @return [String, Object] The value found in the html or the default value
517
520
  # (singleton ? nil : []).
518
- def extract(xpath, singleton: true, text_content_only: true)
521
+ def extract(xpath, singleton: true, text_content_only: true, &block)
519
522
  send(
520
523
  :extract_from_html, xpath,
521
- singleton: singleton, text_content_only: text_content_only
524
+ singleton: singleton, text_content_only: text_content_only,
525
+ &block
522
526
  )
523
527
  end
524
528
 
@@ -542,27 +546,25 @@ be relative"
542
546
  # parameter.
543
547
  #
544
548
  # @param xpath [String, #call] Used to find the value/object in @html.
545
- # @param singleton [Boolean] singleton ? results.first (single Nokogiri
546
- # Object) : results (Array).
549
+ # @param singleton [Boolean] singleton ? results.first (single Object) :
550
+ # results (Enumerable).
547
551
  # @param text_content_only [Boolean] text_content_only ? result.content
548
552
  # (String) : result (Nokogiri Object).
549
- # @yield The block is executed when a Wgit::Document is initialized,
550
- # regardless of the source. Use it (optionally) to process the result
551
- # value.
553
+ # @yield (Optionally) Pass a block to read/write the result value before
554
+ # it's returned.
552
555
  # @yieldparam value [Object] The result value to be returned.
553
- # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
- # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
- # `:object`.
556
+ # @yieldparam source [Wgit::Document, Object] This Document instance.
557
+ # @yieldparam type [Symbol] The `source` type, which is `:document`.
556
558
  # @yieldreturn [Object] The return value of the block gets returned. Return
557
559
  # the block's `value` param unchanged if you simply want to inspect it.
558
560
  # @return [String, Object] The value found in the html or the default value
559
561
  # (singleton ? nil : []).
560
562
  def extract_from_html(xpath, singleton: true, text_content_only: true)
561
563
  xpath = xpath.call if xpath.respond_to?(:call)
562
- result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
564
+ result = singleton ? at_xpath(xpath) : xpath(xpath)
563
565
 
564
- if text_content_only
565
- result = singleton ? result&.content : result.map(&:content)
566
+ if result && text_content_only
567
+ result = singleton ? result.content : result.map(&:content)
566
568
  end
567
569
 
568
570
  Wgit::Utils.sanitize(result)
data/lib/wgit/dsl.rb CHANGED
@@ -317,6 +317,7 @@ the 'start' function".freeze
317
317
  db.clear_db
318
318
  end
319
319
 
320
+ alias crawl_url crawl
320
321
  alias crawl_r crawl_site
321
322
  alias index_r index_site
322
323
  alias start_urls start
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.3'
9
+ VERSION = '0.10.6'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.3
4
+ version: 0.10.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-25 00:00:00.000000000 Z
11
+ date: 2022-07-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable