wgit 0.10.3 → 0.10.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 720cf6b84698fbd54c109319f05557ee2e29bdbda59ec23278422dc5ddc77f2f
4
- data.tar.gz: d4304bce849b404b9d2d7faa4d9a3f7969784f649a83152605b51b2e0bd21ac4
3
+ metadata.gz: 4598dcfc047ce3915ba5a871837be5efc54201d61b4967cf53070bec2af4dd52
4
+ data.tar.gz: 604010011024af6f2d4dfcc87e6c4c1d73f8e4811938281119fccb79792818c1
5
5
  SHA512:
6
- metadata.gz: a8743ec17b3caaa9b6c5dd5c9b9b18902561927dfd992003f25db88334cc2b4364a4c6ce2dea34629f801d5d7dbe9761b15e7f2f034e00ba526db36ce828dcaf
7
- data.tar.gz: 00cf954a86c8b0d96f2e694359c1c75e3193e0e6d146ffba19b3857bef4c15ca93d25f1310ebebf815de8da93ede1b97e325dc54aade699219b9ab35f2976e49
6
+ metadata.gz: 44b098e2a97191801787386e9d2060dcdeacc625c3453976679fc276a73b2bf0614713764a55f7074073018e898f2e43dc1a7f4f803339a86158052f59dcabcb
7
+ data.tar.gz: 8645c7095bb14590cf83c21905c9f5ed524e1047254e6526b8fe46a53f3989395472300d27fb65f899951a5f4b80ee9928accd23164b10e1a834975bf045db47
data/CHANGELOG.md CHANGED
@@ -9,6 +9,33 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.10.6
13
+ ### Added
14
+ - `Wgit::DSL` method `#crawl_url` (aliased to `#crawl`).
15
+ ### Changed/Removed
16
+ - Added a `&block` param to `Wgit::Document#extract`, which gets passed to `#extract_from_html`.
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
21
+ ## v0.10.5
22
+ ### Added
23
+ - `Database#last_result` getter method to return the most recent raw mongo result.
24
+ ### Changed/Removed
25
+ - ...
26
+ ### Fixed
27
+ - ...
28
+ ---
29
+
30
+ ## v0.10.4
31
+ ### Added
32
+ - `Database#search_text` method which returns a Hash of `url => text_results` instead of `Wgit::Documents` (like `#search`).
33
+ ### Changed/Removed
34
+ - ...
35
+ ### Fixed
36
+ - ...
37
+ ---
38
+
12
39
  ## v0.10.3
13
40
  ### Added
14
41
  - ...
@@ -45,6 +45,9 @@ module Wgit
45
45
  # A custom setter method is also provided for changing the search logic.
46
46
  attr_reader :text_index
47
47
 
48
+ # The raw MongoDB result of the most recent operation.
49
+ attr_reader :last_result
50
+
48
51
  # Initializes a connected database client using the provided
49
52
  # connection_string or ENV['WGIT_CONNECTION_STRING'].
50
53
  #
@@ -185,6 +188,8 @@ module Wgit
185
188
  result = @client[collection].replace_one(query, data_hash, upsert: true)
186
189
 
187
190
  result.matched_count.zero?
191
+ ensure
192
+ @last_result = result
188
193
  end
189
194
 
190
195
  ### Retrieve Data ###
@@ -294,16 +299,12 @@ module Wgit
294
299
  results = retrieve(DOCUMENTS_COLLECTION, query,
295
300
  sort: sort_proj, projection: sort_proj,
296
301
  limit: limit, skip: skip)
297
- return [] if results.count < 1 # respond_to? :empty? == false
298
302
 
299
- # results.respond_to? :map! is false so we use map and overwrite the var.
300
- results = results.map do |mongo_doc|
303
+ results.map do |mongo_doc|
301
304
  doc = Wgit::Document.new(mongo_doc)
302
305
  yield(doc) if block_given?
303
306
  doc
304
307
  end
305
-
306
- results
307
308
  end
308
309
 
309
310
  # Searches the database's Documents for the given query and then searches
@@ -348,6 +349,58 @@ module Wgit
348
349
  results
349
350
  end
350
351
 
352
+ # Searches the database's Documents for the given query and then searches
353
+ # each result in turn using `doc.search`. Instead of an Array of Documents,
354
+ # this method returns a Hash of the docs url => search_results creating a
355
+ # search engine like result set for quick access to text matches.
356
+ #
357
+ # @param query [String] The text query to search with.
358
+ # @param case_sensitive [Boolean] Whether character case must match.
359
+ # @param whole_sentence [Boolean] Whether multiple words should be searched
360
+ # for separately.
361
+ # @param limit [Integer] The max number of results to return.
362
+ # @param skip [Integer] The number of results to skip.
363
+ # @param sentence_limit [Integer] The max length of each search result
364
+ # sentence.
365
+ # @param top_result_only [Boolean] Whether to return all of the documents
366
+ # search results or just the top (most relevant) result.
367
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
368
+ # DB.
369
+ # @return [Hash<String, String | Array<String>>] The search results obtained
370
+ # from the DB having mapped the docs url => search_results. The format of
371
+ # search_results depends on the value of `top_result_only`.
372
+ def search_text(
373
+ query, case_sensitive: false, whole_sentence: true,
374
+ limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
375
+ )
376
+ results = search(
377
+ query,
378
+ case_sensitive: case_sensitive,
379
+ whole_sentence: whole_sentence,
380
+ limit: limit,
381
+ skip: skip
382
+ )
383
+
384
+ results
385
+ .map do |doc|
386
+ yield(doc) if block_given?
387
+
388
+ results = doc.search(
389
+ query,
390
+ case_sensitive: case_sensitive,
391
+ whole_sentence: whole_sentence,
392
+ sentence_limit: sentence_limit
393
+ )
394
+
395
+ # Only return result if its text has a match - compact is called below.
396
+ next nil if results.empty?
397
+
398
+ [doc.url, (top_result_only ? results.first : results)]
399
+ end
400
+ .compact
401
+ .to_h
402
+ end
403
+
351
404
  # Returns statistics about the database.
352
405
  #
353
406
  # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
@@ -454,21 +507,30 @@ module Wgit
454
507
  # 0 or 1 because urls are unique.
455
508
  def delete(obj)
456
509
  collection, query = get_type_info(obj)
457
- @client[collection].delete_one(query).n
510
+ result = @client[collection].delete_one(query)
511
+ result.n
512
+ ensure
513
+ @last_result = result
458
514
  end
459
515
 
460
516
  # Deletes everything in the urls collection.
461
517
  #
462
518
  # @return [Integer] The number of deleted records.
463
519
  def clear_urls
464
- @client[URLS_COLLECTION].delete_many({}).n
520
+ result = @client[URLS_COLLECTION].delete_many({})
521
+ result.n
522
+ ensure
523
+ @last_result = result
465
524
  end
466
525
 
467
526
  # Deletes everything in the documents collection.
468
527
  #
469
528
  # @return [Integer] The number of deleted records.
470
529
  def clear_docs
471
- @client[DOCUMENTS_COLLECTION].delete_many({}).n
530
+ result = @client[DOCUMENTS_COLLECTION].delete_many({})
531
+ result.n
532
+ ensure
533
+ @last_result = result
472
534
  end
473
535
 
474
536
  # Deletes everything in the urls and documents collections. This will nuke
@@ -536,6 +598,8 @@ module Wgit
536
598
  else
537
599
  raise 'data must be a Hash or an Array of Hashes'
538
600
  end
601
+ ensure
602
+ @last_result = result
539
603
  end
540
604
 
541
605
  # Return if the write to the DB succeeded or not.
@@ -572,8 +636,8 @@ module Wgit
572
636
  sort: {}, projection: {},
573
637
  limit: 0, skip: 0)
574
638
  assert_type(query, Hash)
575
- @client[collection.to_sym].find(query).projection(projection)
576
- .skip(skip).limit(limit).sort(sort)
639
+ @last_result = @client[collection.to_sym].find(query).projection(projection)
640
+ .skip(skip).limit(limit).sort(sort)
577
641
  end
578
642
 
579
643
  # Mutate/update one or more Url or Document records in the DB.
@@ -593,6 +657,8 @@ module Wgit
593
657
  raise 'DB write(s) (update) failed' unless write_succeeded?(result)
594
658
 
595
659
  result.n
660
+ ensure
661
+ @last_result = result
596
662
  end
597
663
 
598
664
  alias num_objects num_records
data/lib/wgit/document.rb CHANGED
@@ -89,24 +89,18 @@ module Wgit
89
89
  #
90
90
  # @return [String] An xpath String to obtain a webpage's text elements.
91
91
  def self.text_elements_xpath
92
- xpath = ''
93
- return xpath if Wgit::Document.text_elements.empty?
94
-
95
- el_xpath = '//%s/text()'
96
- Wgit::Document.text_elements.each_with_index do |el, i|
97
- xpath += ' | ' unless i.zero?
98
- xpath += format(el_xpath, el)
92
+ Wgit::Document.text_elements.each_with_index.reduce("") do |xpath, (el, i)|
93
+ xpath += " | " unless i.zero?
94
+ xpath += format("//%s/text()", el)
99
95
  end
100
-
101
- xpath
102
96
  end
103
97
 
104
98
  # Defines a content extractor, which extracts HTML elements/content
105
99
  # into instance variables upon Document initialization. See the default
106
100
  # extractors defined in 'document_extractors.rb' as examples. Defining an
107
101
  # extractor means that every subsequently crawled/initialized document
108
- # will attempt to extract the xpath's content. Use `#xpath` for a one off
109
- # content extraction.
102
+ # will attempt to extract the xpath's content. Use `#extract` for a one off
103
+ # content extraction on any document.
110
104
  #
111
105
  # Note that defined extractors work for both Documents initialized from
112
106
  # HTML (via Wgit::Crawler methods) and from database objects.
@@ -115,9 +109,9 @@ module Wgit
115
109
  #
116
110
  # When initialising from HTML, a singleton value of true will only
117
111
  # ever return the first result found; otherwise all the results are
118
- # returned in an Array. When initialising from a database object, the value
119
- # is taken as is and singleton is only used to define the default empty
120
- # value. If a value cannot be found (in either the HTML or database
112
+ # returned in an Enumerable. When initialising from a database object, the
113
+ # value is taken as is and singleton is only used to define the default
114
+ # empty value. If a value cannot be found (in either the HTML or database
121
115
  # object), then a default will be used. The default value is:
122
116
  # `singleton ? nil : []`.
123
117
  #
@@ -134,12 +128,14 @@ module Wgit
134
128
  # @param opts [Hash] The options to define an extractor with. The
135
129
  # options are only used when initializing from HTML, not the database.
136
130
  # @option opts [Boolean] :singleton The singleton option determines
137
- # whether or not the result(s) should be in an Array. If multiple
131
+ # whether or not the result(s) should be in an Enumerable. If multiple
138
132
  # results are found and singleton is true then the first result will be
139
133
  # used. Defaults to true.
140
134
  # @option opts [Boolean] :text_content_only The text_content_only option
141
- # if true will use the text content of the Nokogiri result object,
142
- # otherwise the Nokogiri object itself is returned. Defaults to true.
135
+ # if true will use the text #content of the Nokogiri result object,
136
+ # otherwise the Nokogiri object itself is returned. The type of Nokogiri
137
+ # object returned depends on the given xpath query. See the Nokogiri
138
+ # documentation for more information. Defaults to true.
143
139
  # @yield The block is executed when a Wgit::Document is initialized,
144
140
  # regardless of the source. Use it (optionally) to process the result
145
141
  # value.
@@ -453,7 +449,7 @@ be relative"
453
449
 
454
450
  if query.is_a?(Regexp)
455
451
  regex = query
456
- else # respond_to? #to_s == true
452
+ else # query.respond_to? :to_s == true
457
453
  query = query.to_s
458
454
  query = query.gsub(' ', '|') unless whole_sentence
459
455
  regex = Regexp.new(query, !case_sensitive)
@@ -509,16 +505,24 @@ be relative"
509
505
  # parameter.
510
506
  #
511
507
  # @param xpath [String, #call] Used to find the value/object in @html.
512
- # @param singleton [Boolean] singleton ? results.first (single Nokogiri
513
- # Object) : results (Array).
508
+ # @param singleton [Boolean] singleton ? results.first (single Object) :
509
+ # results (Enumerable).
514
510
  # @param text_content_only [Boolean] text_content_only ? result.content
515
511
  # (String) : result (Nokogiri Object).
512
+ # @yield (Optionally) Pass a block to read/write the result value before
513
+ # it's returned.
514
+ # @yieldparam value [Object] The result value to be returned.
515
+ # @yieldparam source [Wgit::Document, Object] This Document instance.
516
+ # @yieldparam type [Symbol] The `source` type, which is `:document`.
517
+ # @yieldreturn [Object] The return value of the block gets returned. Return
518
+ # the block's `value` param unchanged if you simply want to inspect it.
516
519
  # @return [String, Object] The value found in the html or the default value
517
520
  # (singleton ? nil : []).
518
- def extract(xpath, singleton: true, text_content_only: true)
521
+ def extract(xpath, singleton: true, text_content_only: true, &block)
519
522
  send(
520
523
  :extract_from_html, xpath,
521
- singleton: singleton, text_content_only: text_content_only
524
+ singleton: singleton, text_content_only: text_content_only,
525
+ &block
522
526
  )
523
527
  end
524
528
 
@@ -542,27 +546,25 @@ be relative"
542
546
  # parameter.
543
547
  #
544
548
  # @param xpath [String, #call] Used to find the value/object in @html.
545
- # @param singleton [Boolean] singleton ? results.first (single Nokogiri
546
- # Object) : results (Array).
549
+ # @param singleton [Boolean] singleton ? results.first (single Object) :
550
+ # results (Enumerable).
547
551
  # @param text_content_only [Boolean] text_content_only ? result.content
548
552
  # (String) : result (Nokogiri Object).
549
- # @yield The block is executed when a Wgit::Document is initialized,
550
- # regardless of the source. Use it (optionally) to process the result
551
- # value.
553
+ # @yield (Optionally) Pass a block to read/write the result value before
554
+ # it's returned.
552
555
  # @yieldparam value [Object] The result value to be returned.
553
- # @yieldparam source [Wgit::Document, Object] The source of the `value`.
554
- # @yieldparam type [Symbol] The `source` type, either `:document` or (DB)
555
- # `:object`.
556
+ # @yieldparam source [Wgit::Document, Object] This Document instance.
557
+ # @yieldparam type [Symbol] The `source` type, which is `:document`.
556
558
  # @yieldreturn [Object] The return value of the block gets returned. Return
557
559
  # the block's `value` param unchanged if you simply want to inspect it.
558
560
  # @return [String, Object] The value found in the html or the default value
559
561
  # (singleton ? nil : []).
560
562
  def extract_from_html(xpath, singleton: true, text_content_only: true)
561
563
  xpath = xpath.call if xpath.respond_to?(:call)
562
- result = singleton ? @parser.at_xpath(xpath) : @parser.xpath(xpath)
564
+ result = singleton ? at_xpath(xpath) : xpath(xpath)
563
565
 
564
- if text_content_only
565
- result = singleton ? result&.content : result.map(&:content)
566
+ if result && text_content_only
567
+ result = singleton ? result.content : result.map(&:content)
566
568
  end
567
569
 
568
570
  Wgit::Utils.sanitize(result)
data/lib/wgit/dsl.rb CHANGED
@@ -317,6 +317,7 @@ the 'start' function".freeze
317
317
  db.clear_db
318
318
  end
319
319
 
320
+ alias crawl_url crawl
320
321
  alias crawl_r crawl_site
321
322
  alias index_r index_site
322
323
  alias start_urls start
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.3'
9
+ VERSION = '0.10.6'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.3
4
+ version: 0.10.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-25 00:00:00.000000000 Z
11
+ date: 2022-07-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable