wgit 0.10.0 → 0.10.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b6719bb2015379133ef2c9b417cada1826deab254f6fa1adaa093314f8fece99
4
- data.tar.gz: 5ced648c0dff501bf0191aebfc0188d535f4ee657a072e1dbccd68ebbc6ac881
3
+ metadata.gz: 13a092ae05f0338598d0cc26d85ed828a86d60240eb996bacdbd7511aaae25a2
4
+ data.tar.gz: 3835b8aa3c06d49c1230dd42316c30315e39b0b33f0c433ff8df60eb6be2ecaa
5
5
  SHA512:
6
- metadata.gz: 4a7782b4ccf6fa69fad9bb63d7d421fa548603ad5a35304db554bdcdf6deafe305395aba1ac9f35bcd095bc6cf4049ce70e56645faf1457e2e1313d48d1eb7f8
7
- data.tar.gz: 8b8bb1454a131201e262eda060c6ae8490266a7675910026a0dd6ae0b2b55f2accf140d473edf135078f68cbe1048c4bb86f2dc5a6d4cf08a006f8fc20ac49b5
6
+ metadata.gz: 41a88d424925e3ba670104e69d027a5656c66616ab59bc85c62ec3359afd78230073e5a8f50697cbfc553b8dc9a012cf2ad2f2d857f7840fc668e0e43cdbfb33
7
+ data.tar.gz: 17bd6ddaba4cb53bd5cff4f9b36d5930f4bf496ee588163d190e10274438844f1a5e764c08fc7f82dc6c5c6b2efebaa52dc624be134e38a7d9d3ff9b970d1430
data/CHANGELOG.md CHANGED
@@ -9,6 +9,42 @@
9
9
  - ...
10
10
  ---
11
11
 
12
+ ## v0.10.4
13
+ ### Added
14
+ - `Database#search_text` method which returns a Hash of `url => text_results` instead of an Array of `Wgit::Document`s (like `#search`).
15
+ ### Changed/Removed
16
+ - ...
17
+ ### Fixed
18
+ - ...
19
+ ---
20
+
21
+ ## v0.10.3
22
+ ### Added
23
+ - ...
24
+ ### Changed/Removed
25
+ - Changed `Database#create_collections` and `#create_unique_indexes` by removing `rescue nil` from their database operations. Now any underlying errors with the database client are not masked.
26
+ ### Fixed
27
+ - ...
28
+ ---
29
+
30
+ ## v0.10.2
31
+ ### Added
32
+ - `Wgit::Base#setup` and `#teardown` methods (lifecycle hooks) that can be overridden by subclasses.
33
+ ### Changed/Removed
34
+ - ...
35
+ ### Fixed
36
+ - ...
37
+ ---
38
+
39
+ ## v0.10.1
40
+ ### Added
41
+ - Support for Ruby 3.
42
+ ### Changed/Removed
43
+ - Removed support for Ruby 2.5 (as it's too old).
44
+ ### Fixed
45
+ - ...
46
+ ---
47
+
12
48
  ## v0.10.0
13
49
  ### Added
14
50
  - `Wgit::Url#scheme_relative?` method.
data/README.md CHANGED
@@ -160,7 +160,7 @@ Only MRI Ruby is tested and supported, but Wgit may work with other Ruby impleme
160
160
 
161
161
  Currently, the required MRI Ruby version is:
162
162
 
163
- `~> 2.5` (a.k.a.) `>= 2.5 && < 3`
163
+ `ruby '>= 2.6', '< 4'`
164
164
 
165
165
  ### Using Bundler
166
166
 
data/lib/wgit/base.rb CHANGED
@@ -4,16 +4,25 @@ module Wgit
4
4
  class Base
5
5
  extend Wgit::DSL
6
6
 
7
+ # Runs once before the crawl/index is run. Override as needed.
8
+ def setup; end
9
+
10
+ # Runs once after the crawl/index is complete. Override as needed.
11
+ def teardown; end
12
+
7
13
  # Runs the crawl/index passing each crawled `Wgit::Document` and the given
8
14
  # block to the subclass's `#parse` method.
9
15
  def self.run(&block)
16
+ crawl_method = @method || :crawl
10
17
  obj = new
18
+
11
19
  unless obj.respond_to?(:parse)
12
20
  raise "#{obj.class} must respond_to? #parse(doc, &block)"
13
21
  end
14
22
 
15
- crawl_method = @method || :crawl
23
+ obj.setup
16
24
  send(crawl_method) { |doc| obj.parse(doc, &block) }
25
+ obj.teardown
17
26
 
18
27
  obj
19
28
  end
@@ -91,29 +91,27 @@ module Wgit
91
91
 
92
92
  ### DDL ###
93
93
 
94
- # Creates the urls and documents collections if they don't already exist.
95
- # This method is therefore idempotent.
94
+ # Creates the 'urls' and 'documents' collections.
96
95
  #
97
96
  # @return [nil] Always returns nil.
98
97
  def create_collections
99
- db.client[URLS_COLLECTION].create rescue nil
100
- db.client[DOCUMENTS_COLLECTION].create rescue nil
98
+ @client[URLS_COLLECTION].create
99
+ @client[DOCUMENTS_COLLECTION].create
101
100
 
102
101
  nil
103
102
  end
104
103
 
105
- # Creates the urls and documents unique 'url' indexes if they don't already
106
- # exist. This method is therefore idempotent.
104
+ # Creates the urls and documents unique 'url' indexes.
107
105
  #
108
106
  # @return [nil] Always returns nil.
109
107
  def create_unique_indexes
110
108
  @client[URLS_COLLECTION].indexes.create_one(
111
109
  { url: 1 }, name: UNIQUE_INDEX, unique: true
112
- ) rescue nil
110
+ )
113
111
 
114
112
  @client[DOCUMENTS_COLLECTION].indexes.create_one(
115
113
  { 'url.url' => 1 }, name: UNIQUE_INDEX, unique: true
116
- ) rescue nil
114
+ )
117
115
 
118
116
  nil
119
117
  end
@@ -350,6 +348,58 @@ module Wgit
350
348
  results
351
349
  end
352
350
 
351
+ # Searches the database's Documents for the given query and then searches
352
+ # each result in turn using `doc.search`. Instead of an Array of Documents,
353
+ # this method returns a Hash of the docs url => search_results creating a
354
+ # search engine like result set for quick access to text matches.
355
+ #
356
+ # @param query [String] The text query to search with.
357
+ # @param case_sensitive [Boolean] Whether character case must match.
358
+ # @param whole_sentence [Boolean] Whether the query is matched as one whole
359
+ # sentence (true) or its words are searched for separately (false).
360
+ # @param limit [Integer] The max number of results to return.
361
+ # @param skip [Integer] The number of results to skip.
362
+ # @param sentence_limit [Integer] The max length of each search result
363
+ # sentence.
364
+ # @param top_result_only [Boolean] Whether to return all of the document's
365
+ # search results or just the top (most relevant) result.
366
+ # @yield [doc] Given each search result (Wgit::Document) returned from the
367
+ # DB.
368
+ # @return [Hash<String, String | Array<String>>] The search results obtained
369
+ # from the DB having mapped the docs url => search_results. The format of
370
+ # search_results depends on the value of `top_result_only`.
371
+ def search_text(
372
+ query, case_sensitive: false, whole_sentence: true,
373
+ limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
374
+ )
375
+ results = search(
376
+ query,
377
+ case_sensitive: case_sensitive,
378
+ whole_sentence: whole_sentence,
379
+ limit: limit,
380
+ skip: skip
381
+ )
382
+
383
+ results
384
+ .map do |doc|
385
+ yield(doc) if block_given?
386
+
387
+ results = doc.search(
388
+ query,
389
+ case_sensitive: case_sensitive,
390
+ whole_sentence: whole_sentence,
391
+ sentence_limit: sentence_limit
392
+ )
393
+
394
+ # Only return result if its text has a match - compact is called below.
395
+ next nil if results.empty?
396
+
397
+ [doc.url, (top_result_only ? results.first : results)]
398
+ end
399
+ .compact
400
+ .to_h
401
+ end
402
+
353
403
  # Returns statistics about the database.
354
404
  #
355
405
  # @return [BSON::Document#[]#fetch] Similar to a Hash instance.
data/lib/wgit/document.rb CHANGED
@@ -453,7 +453,7 @@ be relative"
453
453
 
454
454
  if query.is_a?(Regexp)
455
455
  regex = query
456
- else # respond_to? #to_s == true
456
+ else # query.respond_to? :to_s == true
457
457
  query = query.to_s
458
458
  query = query.gsub(' ', '|') unless whole_sentence
459
459
  regex = Regexp.new(query, !case_sensitive)
data/lib/wgit/indexer.rb CHANGED
@@ -80,8 +80,8 @@ database capacity, exiting.")
80
80
  urls_count += write_urls_to_db(ext_links)
81
81
  end
82
82
 
83
- Wgit.logger.info("Crawled and indexed docs for #{docs_count} url(s) \
84
- overall for this iteration.")
83
+ Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
84
+ url(s) overall for this iteration.")
85
85
  Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
86
86
  the next iteration.")
87
87
 
@@ -136,8 +136,8 @@ the next iteration.")
136
136
  Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
137
137
  end
138
138
 
139
- Wgit.logger.info("Crawled and indexed #{total_pages_indexed} docs for \
140
- the site: #{url}")
139
+ Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
140
+ for the site: #{url}")
141
141
 
142
142
  total_pages_indexed
143
143
  end
data/lib/wgit/version.rb CHANGED
@@ -6,7 +6,7 @@
6
6
  # @author Michael Telford
7
7
  module Wgit
8
8
  # The current gem version of Wgit.
9
- VERSION = '0.10.0'
9
+ VERSION = '0.10.4'
10
10
 
11
11
  # Returns the current gem version of Wgit as a String.
12
12
  def self.version
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.10.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-20 00:00:00.000000000 Z
11
+ date: 2021-11-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -249,16 +249,19 @@ require_paths:
249
249
  - lib
250
250
  required_ruby_version: !ruby/object:Gem::Requirement
251
251
  requirements:
252
- - - "~>"
252
+ - - ">="
253
+ - !ruby/object:Gem::Version
254
+ version: '2.6'
255
+ - - "<"
253
256
  - !ruby/object:Gem::Version
254
- version: '2.5'
257
+ version: '4'
255
258
  required_rubygems_version: !ruby/object:Gem::Requirement
256
259
  requirements:
257
260
  - - ">="
258
261
  - !ruby/object:Gem::Version
259
262
  version: '0'
260
263
  requirements: []
261
- rubygems_version: 3.1.2
264
+ rubygems_version: 3.2.22
262
265
  signing_key:
263
266
  specification_version: 4
264
267
  summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically