wgit 0.10.0 → 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +1 -1
- data/lib/wgit/base.rb +10 -1
- data/lib/wgit/database/database.rb +58 -8
- data/lib/wgit/document.rb +1 -1
- data/lib/wgit/indexer.rb +4 -4
- data/lib/wgit/version.rb +1 -1
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13a092ae05f0338598d0cc26d85ed828a86d60240eb996bacdbd7511aaae25a2
|
4
|
+
data.tar.gz: 3835b8aa3c06d49c1230dd42316c30315e39b0b33f0c433ff8df60eb6be2ecaa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41a88d424925e3ba670104e69d027a5656c66616ab59bc85c62ec3359afd78230073e5a8f50697cbfc553b8dc9a012cf2ad2f2d857f7840fc668e0e43cdbfb33
|
7
|
+
data.tar.gz: 17bd6ddaba4cb53bd5cff4f9b36d5930f4bf496ee588163d190e10274438844f1a5e764c08fc7f82dc6c5c6b2efebaa52dc624be134e38a7d9d3ff9b970d1430
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,42 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.10.4
|
13
|
+
### Added
|
14
|
+
- `Database#search_text` method which returns a Hash of `url => text_results` instead of `Wgit::Documents` (like `#search`).
|
15
|
+
### Changed/Removed
|
16
|
+
- ...
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
21
|
+
## v0.10.3
|
22
|
+
### Added
|
23
|
+
- ...
|
24
|
+
### Changed/Removed
|
25
|
+
- Changed `Database#create_collections` and `#create_unique_indexes` by removing `rescue nil` from their database operations. Now any underlying errors with the database client are not masked.
|
26
|
+
### Fixed
|
27
|
+
- ...
|
28
|
+
---
|
29
|
+
|
30
|
+
## v0.10.2
|
31
|
+
### Added
|
32
|
+
- `Wgit::Base#setup` and `#teardown` methods (lifecycle hooks) that can be overridden by subclasses.
|
33
|
+
### Changed/Removed
|
34
|
+
- ...
|
35
|
+
### Fixed
|
36
|
+
- ...
|
37
|
+
---
|
38
|
+
|
39
|
+
## v0.10.1
|
40
|
+
### Added
|
41
|
+
- Support for Ruby 3.
|
42
|
+
### Changed/Removed
|
43
|
+
- Removed support for Ruby 2.5 (as it's too old).
|
44
|
+
### Fixed
|
45
|
+
- ...
|
46
|
+
---
|
47
|
+
|
12
48
|
## v0.10.0
|
13
49
|
### Added
|
14
50
|
- `Wgit::Url#scheme_relative?` method.
|
data/README.md
CHANGED
data/lib/wgit/base.rb
CHANGED
@@ -4,16 +4,25 @@ module Wgit
|
|
4
4
|
class Base
|
5
5
|
extend Wgit::DSL
|
6
6
|
|
7
|
+
# Runs once before the crawl/index is run. Override as needed.
|
8
|
+
def setup; end
|
9
|
+
|
10
|
+
# Runs once after the crawl/index is complete. Override as needed.
|
11
|
+
def teardown; end
|
12
|
+
|
7
13
|
# Runs the crawl/index passing each crawled `Wgit::Document` and the given
|
8
14
|
# block to the subclass's `#parse` method.
|
9
15
|
def self.run(&block)
|
16
|
+
crawl_method = @method || :crawl
|
10
17
|
obj = new
|
18
|
+
|
11
19
|
unless obj.respond_to?(:parse)
|
12
20
|
raise "#{obj.class} must respond_to? #parse(doc, &block)"
|
13
21
|
end
|
14
22
|
|
15
|
-
|
23
|
+
obj.setup
|
16
24
|
send(crawl_method) { |doc| obj.parse(doc, &block) }
|
25
|
+
obj.teardown
|
17
26
|
|
18
27
|
obj
|
19
28
|
end
|
@@ -91,29 +91,27 @@ module Wgit
|
|
91
91
|
|
92
92
|
### DDL ###
|
93
93
|
|
94
|
-
# Creates the urls and documents collections
|
95
|
-
# This method is therefore idempotent.
|
94
|
+
# Creates the 'urls' and 'documents' collections.
|
96
95
|
#
|
97
96
|
# @return [nil] Always returns nil.
|
98
97
|
def create_collections
|
99
|
-
|
100
|
-
|
98
|
+
@client[URLS_COLLECTION].create
|
99
|
+
@client[DOCUMENTS_COLLECTION].create
|
101
100
|
|
102
101
|
nil
|
103
102
|
end
|
104
103
|
|
105
|
-
# Creates the urls and documents unique 'url' indexes
|
106
|
-
# exist. This method is therefore idempotent.
|
104
|
+
# Creates the urls and documents unique 'url' indexes.
|
107
105
|
#
|
108
106
|
# @return [nil] Always returns nil.
|
109
107
|
def create_unique_indexes
|
110
108
|
@client[URLS_COLLECTION].indexes.create_one(
|
111
109
|
{ url: 1 }, name: UNIQUE_INDEX, unique: true
|
112
|
-
)
|
110
|
+
)
|
113
111
|
|
114
112
|
@client[DOCUMENTS_COLLECTION].indexes.create_one(
|
115
113
|
{ 'url.url' => 1 }, name: UNIQUE_INDEX, unique: true
|
116
|
-
)
|
114
|
+
)
|
117
115
|
|
118
116
|
nil
|
119
117
|
end
|
@@ -350,6 +348,58 @@ module Wgit
|
|
350
348
|
results
|
351
349
|
end
|
352
350
|
|
351
|
+
# Searches the database's Documents for the given query and then searches
|
352
|
+
# each result in turn using `doc.search`. Instead of an Array of Documents,
|
353
|
+
# this method returns a Hash of the docs url => search_results creating a
|
354
|
+
# search engine like result set for quick access to text matches.
|
355
|
+
#
|
356
|
+
# @param query [String] The text query to search with.
|
357
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
358
|
+
# @param whole_sentence [Boolean] Whether the query must match as a whole
|
359
|
+
# sentence; if false, each word of the query is searched for separately.
|
360
|
+
# @param limit [Integer] The max number of results to return.
|
361
|
+
# @param skip [Integer] The number of results to skip.
|
362
|
+
# @param sentence_limit [Integer] The max length of each search result
|
363
|
+
# sentence.
|
364
|
+
# @param top_result_only [Boolean] Whether to return all of the documents
|
365
|
+
# search results or just the top (most relevant) result.
|
366
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
367
|
+
# DB.
|
368
|
+
# @return [Hash<String, String | Array<String>>] The search results obtained
|
369
|
+
# from the DB having mapped the docs url => search_results. The format of
|
370
|
+
# search_results depends on the value of `top_result_only`.
|
371
|
+
def search_text(
|
372
|
+
query, case_sensitive: false, whole_sentence: true,
|
373
|
+
limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
|
374
|
+
)
|
375
|
+
results = search(
|
376
|
+
query,
|
377
|
+
case_sensitive: case_sensitive,
|
378
|
+
whole_sentence: whole_sentence,
|
379
|
+
limit: limit,
|
380
|
+
skip: skip
|
381
|
+
)
|
382
|
+
|
383
|
+
results
|
384
|
+
.map do |doc|
|
385
|
+
yield(doc) if block_given?
|
386
|
+
|
387
|
+
results = doc.search(
|
388
|
+
query,
|
389
|
+
case_sensitive: case_sensitive,
|
390
|
+
whole_sentence: whole_sentence,
|
391
|
+
sentence_limit: sentence_limit
|
392
|
+
)
|
393
|
+
|
394
|
+
# Only return result if its text has a match - compact is called below.
|
395
|
+
next nil if results.empty?
|
396
|
+
|
397
|
+
[doc.url, (top_result_only ? results.first : results)]
|
398
|
+
end
|
399
|
+
.compact
|
400
|
+
.to_h
|
401
|
+
end
|
402
|
+
|
353
403
|
# Returns statistics about the database.
|
354
404
|
#
|
355
405
|
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
data/lib/wgit/document.rb
CHANGED
@@ -453,7 +453,7 @@ be relative"
|
|
453
453
|
|
454
454
|
if query.is_a?(Regexp)
|
455
455
|
regex = query
|
456
|
-
else # respond_to?
|
456
|
+
else # query.respond_to? :to_s == true
|
457
457
|
query = query.to_s
|
458
458
|
query = query.gsub(' ', '|') unless whole_sentence
|
459
459
|
regex = Regexp.new(query, !case_sensitive)
|
data/lib/wgit/indexer.rb
CHANGED
@@ -80,8 +80,8 @@ database capacity, exiting.")
|
|
80
80
|
urls_count += write_urls_to_db(ext_links)
|
81
81
|
end
|
82
82
|
|
83
|
-
Wgit.logger.info("Crawled and indexed
|
84
|
-
overall for this iteration.")
|
83
|
+
Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
|
84
|
+
url(s) overall for this iteration.")
|
85
85
|
Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
|
86
86
|
the next iteration.")
|
87
87
|
|
@@ -136,8 +136,8 @@ the next iteration.")
|
|
136
136
|
Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
|
137
137
|
end
|
138
138
|
|
139
|
-
Wgit.logger.info("Crawled and indexed #{total_pages_indexed}
|
140
|
-
the site: #{url}")
|
139
|
+
Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
|
140
|
+
for the site: #{url}")
|
141
141
|
|
142
142
|
total_pages_indexed
|
143
143
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.
|
4
|
+
version: 0.10.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -249,16 +249,19 @@ require_paths:
|
|
249
249
|
- lib
|
250
250
|
required_ruby_version: !ruby/object:Gem::Requirement
|
251
251
|
requirements:
|
252
|
-
- - "
|
252
|
+
- - ">="
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '2.6'
|
255
|
+
- - "<"
|
253
256
|
- !ruby/object:Gem::Version
|
254
|
-
version: '
|
257
|
+
version: '4'
|
255
258
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
256
259
|
requirements:
|
257
260
|
- - ">="
|
258
261
|
- !ruby/object:Gem::Version
|
259
262
|
version: '0'
|
260
263
|
requirements: []
|
261
|
-
rubygems_version: 3.
|
264
|
+
rubygems_version: 3.2.22
|
262
265
|
signing_key:
|
263
266
|
specification_version: 4
|
264
267
|
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|