wgit 0.10.0 → 0.10.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/README.md +1 -1
- data/lib/wgit/base.rb +10 -1
- data/lib/wgit/database/database.rb +58 -8
- data/lib/wgit/document.rb +1 -1
- data/lib/wgit/indexer.rb +4 -4
- data/lib/wgit/version.rb +1 -1
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13a092ae05f0338598d0cc26d85ed828a86d60240eb996bacdbd7511aaae25a2
|
4
|
+
data.tar.gz: 3835b8aa3c06d49c1230dd42316c30315e39b0b33f0c433ff8df60eb6be2ecaa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41a88d424925e3ba670104e69d027a5656c66616ab59bc85c62ec3359afd78230073e5a8f50697cbfc553b8dc9a012cf2ad2f2d857f7840fc668e0e43cdbfb33
|
7
|
+
data.tar.gz: 17bd6ddaba4cb53bd5cff4f9b36d5930f4bf496ee588163d190e10274438844f1a5e764c08fc7f82dc6c5c6b2efebaa52dc624be134e38a7d9d3ff9b970d1430
|
data/CHANGELOG.md
CHANGED
@@ -9,6 +9,42 @@
|
|
9
9
|
- ...
|
10
10
|
---
|
11
11
|
|
12
|
+
## v0.10.4
|
13
|
+
### Added
|
14
|
+
- `Database#search_text` method which returns a Hash of `url => text_results` instead of `Wgit::Documents` (like `#search`).
|
15
|
+
### Changed/Removed
|
16
|
+
- ...
|
17
|
+
### Fixed
|
18
|
+
- ...
|
19
|
+
---
|
20
|
+
|
21
|
+
## v0.10.3
|
22
|
+
### Added
|
23
|
+
- ...
|
24
|
+
### Changed/Removed
|
25
|
+
- Changed `Database#create_collections` and `#create_unique_indexes` by removing `rescue nil` from their database operations. Now any underlying errors with the database client are not masked.
|
26
|
+
### Fixed
|
27
|
+
- ...
|
28
|
+
---
|
29
|
+
|
30
|
+
## v0.10.2
|
31
|
+
### Added
|
32
|
+
- `Wgit::Base#setup` and `#teardown` methods (lifecycle hooks) that can be overridden by subclasses.
|
33
|
+
### Changed/Removed
|
34
|
+
- ...
|
35
|
+
### Fixed
|
36
|
+
- ...
|
37
|
+
---
|
38
|
+
|
39
|
+
## v0.10.1
|
40
|
+
### Added
|
41
|
+
- Support for Ruby 3.
|
42
|
+
### Changed/Removed
|
43
|
+
- Removed support for Ruby 2.5 (as it's too old).
|
44
|
+
### Fixed
|
45
|
+
- ...
|
46
|
+
---
|
47
|
+
|
12
48
|
## v0.10.0
|
13
49
|
### Added
|
14
50
|
- `Wgit::Url#scheme_relative?` method.
|
data/README.md
CHANGED
data/lib/wgit/base.rb
CHANGED
@@ -4,16 +4,25 @@ module Wgit
|
|
4
4
|
class Base
|
5
5
|
extend Wgit::DSL
|
6
6
|
|
7
|
+
# Runs once before the crawl/index is run. Override as needed.
|
8
|
+
def setup; end
|
9
|
+
|
10
|
+
# Runs once after the crawl/index is complete. Override as needed.
|
11
|
+
def teardown; end
|
12
|
+
|
7
13
|
# Runs the crawl/index passing each crawled `Wgit::Document` and the given
|
8
14
|
# block to the subclass's `#parse` method.
|
9
15
|
def self.run(&block)
|
16
|
+
crawl_method = @method || :crawl
|
10
17
|
obj = new
|
18
|
+
|
11
19
|
unless obj.respond_to?(:parse)
|
12
20
|
raise "#{obj.class} must respond_to? #parse(doc, &block)"
|
13
21
|
end
|
14
22
|
|
15
|
-
|
23
|
+
obj.setup
|
16
24
|
send(crawl_method) { |doc| obj.parse(doc, &block) }
|
25
|
+
obj.teardown
|
17
26
|
|
18
27
|
obj
|
19
28
|
end
|
@@ -91,29 +91,27 @@ module Wgit
|
|
91
91
|
|
92
92
|
### DDL ###
|
93
93
|
|
94
|
-
# Creates the urls and documents collections
|
95
|
-
# This method is therefore idempotent.
|
94
|
+
# Creates the 'urls' and 'documents' collections.
|
96
95
|
#
|
97
96
|
# @return [nil] Always returns nil.
|
98
97
|
def create_collections
|
99
|
-
|
100
|
-
|
98
|
+
@client[URLS_COLLECTION].create
|
99
|
+
@client[DOCUMENTS_COLLECTION].create
|
101
100
|
|
102
101
|
nil
|
103
102
|
end
|
104
103
|
|
105
|
-
# Creates the urls and documents unique 'url' indexes
|
106
|
-
# exist. This method is therefore idempotent.
|
104
|
+
# Creates the urls and documents unique 'url' indexes.
|
107
105
|
#
|
108
106
|
# @return [nil] Always returns nil.
|
109
107
|
def create_unique_indexes
|
110
108
|
@client[URLS_COLLECTION].indexes.create_one(
|
111
109
|
{ url: 1 }, name: UNIQUE_INDEX, unique: true
|
112
|
-
)
|
110
|
+
)
|
113
111
|
|
114
112
|
@client[DOCUMENTS_COLLECTION].indexes.create_one(
|
115
113
|
{ 'url.url' => 1 }, name: UNIQUE_INDEX, unique: true
|
116
|
-
)
|
114
|
+
)
|
117
115
|
|
118
116
|
nil
|
119
117
|
end
|
@@ -350,6 +348,58 @@ module Wgit
|
|
350
348
|
results
|
351
349
|
end
|
352
350
|
|
351
|
+
# Searches the database's Documents for the given query and then searches
|
352
|
+
# each result in turn using `doc.search`. Instead of an Array of Documents,
|
353
|
+
# this method returns a Hash of the docs url => search_results creating a
|
354
|
+
# search engine like result set for quick access to text matches.
|
355
|
+
#
|
356
|
+
# @param query [String] The text query to search with.
|
357
|
+
# @param case_sensitive [Boolean] Whether character case must match.
|
358
|
+
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
359
|
+
# for together as one phrase (true) or separately (false).
|
360
|
+
# @param limit [Integer] The max number of results to return.
|
361
|
+
# @param skip [Integer] The number of results to skip.
|
362
|
+
# @param sentence_limit [Integer] The max length of each search result
|
363
|
+
# sentence.
|
364
|
+
# @param top_result_only [Boolean] Whether to return all of the document's
|
365
|
+
# search results or just the top (most relevant) result.
|
366
|
+
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
367
|
+
# DB.
|
368
|
+
# @return [Hash<String, String | Array<String>>] The search results obtained
|
369
|
+
# from the DB having mapped the docs url => search_results. The format of
|
370
|
+
# search_results depends on the value of `top_result_only`.
|
371
|
+
def search_text(
|
372
|
+
query, case_sensitive: false, whole_sentence: true,
|
373
|
+
limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
|
374
|
+
)
|
375
|
+
results = search(
|
376
|
+
query,
|
377
|
+
case_sensitive: case_sensitive,
|
378
|
+
whole_sentence: whole_sentence,
|
379
|
+
limit: limit,
|
380
|
+
skip: skip
|
381
|
+
)
|
382
|
+
|
383
|
+
results
|
384
|
+
.map do |doc|
|
385
|
+
yield(doc) if block_given?
|
386
|
+
|
387
|
+
results = doc.search(
|
388
|
+
query,
|
389
|
+
case_sensitive: case_sensitive,
|
390
|
+
whole_sentence: whole_sentence,
|
391
|
+
sentence_limit: sentence_limit
|
392
|
+
)
|
393
|
+
|
394
|
+
# Only return result if its text has a match - compact is called below.
|
395
|
+
next nil if results.empty?
|
396
|
+
|
397
|
+
[doc.url, (top_result_only ? results.first : results)]
|
398
|
+
end
|
399
|
+
.compact
|
400
|
+
.to_h
|
401
|
+
end
|
402
|
+
|
353
403
|
# Returns statistics about the database.
|
354
404
|
#
|
355
405
|
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
data/lib/wgit/document.rb
CHANGED
@@ -453,7 +453,7 @@ be relative"
|
|
453
453
|
|
454
454
|
if query.is_a?(Regexp)
|
455
455
|
regex = query
|
456
|
-
else # respond_to?
|
456
|
+
else # query.respond_to? :to_s == true
|
457
457
|
query = query.to_s
|
458
458
|
query = query.gsub(' ', '|') unless whole_sentence
|
459
459
|
regex = Regexp.new(query, !case_sensitive)
|
data/lib/wgit/indexer.rb
CHANGED
@@ -80,8 +80,8 @@ database capacity, exiting.")
|
|
80
80
|
urls_count += write_urls_to_db(ext_links)
|
81
81
|
end
|
82
82
|
|
83
|
-
Wgit.logger.info("Crawled and indexed
|
84
|
-
overall for this iteration.")
|
83
|
+
Wgit.logger.info("Crawled and indexed documents for #{docs_count} \
|
84
|
+
url(s) overall for this iteration.")
|
85
85
|
Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
|
86
86
|
the next iteration.")
|
87
87
|
|
@@ -136,8 +136,8 @@ the next iteration.")
|
|
136
136
|
Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
|
137
137
|
end
|
138
138
|
|
139
|
-
Wgit.logger.info("Crawled and indexed #{total_pages_indexed}
|
140
|
-
the site: #{url}")
|
139
|
+
Wgit.logger.info("Crawled and indexed #{total_pages_indexed} documents \
|
140
|
+
for the site: #{url}")
|
141
141
|
|
142
142
|
total_pages_indexed
|
143
143
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.0
|
4
|
+
version: 0.10.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -249,16 +249,19 @@ require_paths:
|
|
249
249
|
- lib
|
250
250
|
required_ruby_version: !ruby/object:Gem::Requirement
|
251
251
|
requirements:
|
252
|
-
- - "
|
252
|
+
- - ">="
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: '2.6'
|
255
|
+
- - "<"
|
253
256
|
- !ruby/object:Gem::Version
|
254
|
-
version: '
|
257
|
+
version: '4'
|
255
258
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
256
259
|
requirements:
|
257
260
|
- - ">="
|
258
261
|
- !ruby/object:Gem::Version
|
259
262
|
version: '0'
|
260
263
|
requirements: []
|
261
|
-
rubygems_version: 3.
|
264
|
+
rubygems_version: 3.2.22
|
262
265
|
signing_key:
|
263
266
|
specification_version: 4
|
264
267
|
summary: Wgit is a HTML web crawler, written in Ruby, that allows you to programmatically
|