wgit 0.10.0 → 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b6719bb2015379133ef2c9b417cada1826deab254f6fa1adaa093314f8fece99
4
- data.tar.gz: 5ced648c0dff501bf0191aebfc0188d535f4ee657a072e1dbccd68ebbc6ac881
3
+ metadata.gz: 13a092ae05f0338598d0cc26d85ed828a86d60240eb996bacdbd7511aaae25a2
4
+ data.tar.gz: 3835b8aa3c06d49c1230dd42316c30315e39b0b33f0c433ff8df60eb6be2ecaa
5
5
  SHA512:
6
- metadata.gz: 4a7782b4ccf6fa69fad9bb63d7d421fa548603ad5a35304db554bdcdf6deafe305395aba1ac9f35bcd095bc6cf4049ce70e56645faf1457e2e1313d48d1eb7f8
7
- data.tar.gz: 8b8bb1454a131201e262eda060c6ae8490266a7675910026a0dd6ae0b2b55f2accf140d473edf135078f68cbe1048c4bb86f2dc5a6d4cf08a006f8fc20ac49b5
6
+ metadata.gz: 41a88d424925e3ba670104e69d027a5656c66616ab59bc85c62ec3359afd78230073e5a8f50697cbfc553b8dc9a012cf2ad2f2d857f7840fc668e0e43cdbfb33
7
+ data.tar.gz: 17bd6ddaba4cb53bd5cff4f9b36d5930f4bf496ee588163d190e10274438844f1a5e764c08fc7f82dc6c5c6b2efebaa52dc624be134e38a7d9d3ff9b970d1430
data/CHANGELOG.md CHANGED
@@ -9,6 +9,42 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.10.4
13
+ ### Added
14
+ - `Database#search_text` method which returns a Hash of `url => text_results`, instead of the Array of `Wgit::Document`s that `#search` returns.
15
+ ### Changed/Removed
16
+ - ...
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
21
+ ## v0.10.3
22
+ ### Added
23
+ - ...
24
+ ### Changed/Removed
25
+ - Changed `Database#create_collections` and `#create_unique_indexes` by removing `rescue nil` from their database operations. Now any underlying errors with the database client are not masked.
26
+ ### Fixed
27
+ - ...
28
+ ---
29
+
30
+ ## v0.10.2
31
+ ### Added
32
+ - `Wgit::Base#setup` and `#teardown` methods (lifecycle hooks) that can be overridden by subclasses.
33
+ ### Changed/Removed
34
+ - ...
35
+ ### Fixed
36
+ - ...
37
+ ---
38
+
39
+ ## v0.10.1
40
+ ### Added
41
+ - Support for Ruby 3.
42
+ ### Changed/Removed
43
+ - Removed support for Ruby 2.5 (as it's too old).
44
+ ### Fixed
45
+ - ...
46
+ ---
47
+
12
48
  ## v0.10.0
13
49
  ### Added
14
50
  - `Wgit::Url#scheme_relative?` method.
data/README.md CHANGED
@@ -160,7 +160,7 @@ Only MRI Ruby is tested and supported, but Wgit may work with other Ruby impleme
160
160
 
161
161
  Currently, the required MRI Ruby version is:
162
162
 
163
- `~> 2.5` (a.k.a.) `>= 2.5 && < 3`
163
+ `ruby '>= 2.6', '< 4'`
164
164
 
165
165
  ### Using Bundler
166
166
 
data/lib/wgit/base.rb CHANGED
@@ -4,16 +4,25 @@ module Wgit
4
4
  class Base
5
5
  extend Wgit::DSL
6
6
 
7
+ # Runs once before the crawl/index is run. Override as needed.
8
+ def setup; end
9
+
10
+ # Runs once after the crawl/index is complete. Override as needed.
11
+ def teardown; end
12
+
7
13
  # Runs the crawl/index passing each crawled `Wgit::Document` and the given
8
14
  # block to the subclass's `#parse` method.
9
15
  def self.run(&block)
16
+ crawl_method = @method || :crawl
10
17
  obj = new
18
+
11
19
  unless obj.respond_to?(:parse)
12
20
  raise "#{obj.class} must respond_to? #parse(doc, &block)"
13
21
  end
14
22
 
15
- crawl_method = @method || :crawl
23
+ obj.setup
16
24
  send(crawl_method) { |doc| obj.parse(doc, &block) }
25
+ obj.teardown
17
26
 
18
27
  obj
19
28
  end
@@ -91,29 +91,27 @@ module Wgit
91
91
 
92
92
  ### DDL ###
93
93
 
94
- # Creates the urls and documents collections if they don't already exist.
95
- # This method is therefore idempotent.
94
+ # Creates the 'urls' and 'documents' collections.
96
95
  #
97
96
  # @return [nil] Always returns nil.
98
97
  def create_collections
99
- db.client[URLS_COLLECTION].create rescue nil
100
- db.client[DOCUMENTS_COLLECTION].create rescue nil
98
+ @client[URLS_COLLECTION].create
99
+ @client[DOCUMENTS_COLLECTION].create
101
100
 
102
101
  nil
103
102
  end
104
103
 
105
- # Creates the urls and documents unique 'url' indexes if they don't already
106
- # exist. This method is therefore idempotent.
104
+ # Creates the urls and documents unique 'url' indexes.
107
105
  #
108
106
  # @return [nil] Always returns nil.
109
107
  def create_unique_indexes
110
108
  @client[URLS_COLLECTION].indexes.create_one(
111
109
  { url: 1 }, name: UNIQUE_INDEX, unique: true
112
- ) rescue nil
110
+ )
113
111
 
114
112
  @client[DOCUMENTS_COLLECTION].indexes.create_one(
115
113
  { 'url.url' => 1 }, name: UNIQUE_INDEX, unique: true
116
- ) rescue nil
114
+ )
117
115
 
118
116
  nil
119
117
  end
@@ -350,6 +348,58 @@ module Wgit
350
348
  results
351
349
  end
352
350
 
351
+ # Searches the database's Documents for the given query and then searches
352
+ # each result in turn using `doc.search`. Instead of an Array of Documents,
353
+ # this method returns a Hash of the docs url => search_results creating a
354
+ # search engine like result set for quick access to text matches.
355
+ #
356
+ # @param query [String] The text query to search with.
357
+ # @param case_sensitive [Boolean] Whether character case must match.
358
+ # @param whole_sentence [Boolean] Whether multiple words should be matched
359
+ # together as a whole sentence (true) or searched for separately (false).
360
+ # @param limit [Integer] The max number of results to return.
361
+ # @param skip [Integer] The number of results to skip.
362
+ # @param sentence_limit [Integer] The max length of each search result
363
+ # sentence.
364
+ # @param top_result_only [Boolean] Whether to return all of the documents
365
+ # search results or just the top (most relevant) result.
366
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
367
+ # DB.
368
+ # @return [Hash<String, String | Array<String>>] The search results obtained
369
+ # from the DB having mapped the docs url => search_results. The format of
370
+ # search_results depends on the value of `top_result_only`.
371
+ def search_text(
372
+ query, case_sensitive: false, whole_sentence: true,
373
+ limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
374
+ )
375
+ results = search(
376
+ query,
377
+ case_sensitive: case_sensitive,
378
+ whole_sentence: whole_sentence,
379
+ limit: limit,
380
+ skip: skip
381
+ )
382
+
383
+ results
384
+ .map do |doc|
385
+ yield(doc) if block_given?
386
+
387
+ results = doc.search(
388
+ query,
389
+ case_sensitive: case_sensitive,
390
+ whole_sentence: whole_sentence,
391
+ sentence_limit: sentence_limit
392
+ )
393
+
394
+ # Only return result if its text has a match - compact is called below.
395
+ next nil if results.empty?
396
+
397
+ [doc.url, (top_result_only ? results.first : results)]
398
+ end
399
+ .compact
400
+ .to_h
401
+ end
402
+
353
403
  # Returns statistics about the database.
354
404
  #
355
405
  # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
data/lib/wgit/document.rb CHANGED
@@ -453,7 +453,7 @@ be relative"
453
453
 
454
454
  if query.is_a?(Regexp)
455
455
  regex = query
456
- else # respond_to? #to_s == true
456
+ else # query.respond_to? :to_s == true
457
457
  query = query.to_s
458
458
  query = query.gsub(' ', '|') unless whole_sentence
459
459
  regex = Regexp.new(query, !case_sensitive)
data/lib/wgit/indexer.rb CHANGED
@@ -80,8 +80,8 @@ database capacity, exiting.")
80
80
  urls_count += write_urls_to_db(ext_links)
81
81
  end
82
82
 
83
- Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
84
- overall for this iteration.")
83
+ Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
84
+ url(s) overall for this iteration.")
85
85
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
86
86
  the next iteration.")
87
87
 
@@ -136,8 +136,8 @@ the next iteration.")
136
136
  Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
137
137
  end
138
138
 
139
- Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
140
- the site: #{url}")
139
+ Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
140
+ for the site: #{url}")
141
141
 
142
142
  total_pages_indexed
143
143
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.0'
9
+ VERSION = '0.10.4'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.10.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-20 00:00:00.000000000 Z
11
+ date: 2021-11-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -249,16 +249,19 @@ require_paths:
249
249
  - lib
250
250
  required_ruby_version: !ruby/object:Gem::Requirement
251
251
  requirements:
252
- - - "~>"
252
+ - - ">="
253
+ - !ruby/object:Gem::Version
254
+ version: '2.6'
255
+ - - "<"
253
256
  - !ruby/object:Gem::Version
254
- version: '2.5'
257
+ version: '4'
255
258
  required_rubygems_version: !ruby/object:Gem::Requirement
256
259
  requirements:
257
260
  - - ">="
258
261
  - !ruby/object:Gem::Version
259
262
  version: '0'
260
263
  requirements: []
261
- rubygems_version: 3.1.2
264
+ rubygems_version: 3.2.22
262
265
  signing_key:
263
266
  specification_version: 4
264
267
  summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically