wgit 0.10.7 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +44 -1
- data/CONTRIBUTING.md +1 -1
- data/README.md +22 -2
- data/bin/wgit +3 -1
- data/lib/wgit/assertable.rb +2 -2
- data/lib/wgit/crawler.rb +56 -34
- data/lib/wgit/database/database.rb +64 -52
- data/lib/wgit/document.rb +67 -39
- data/lib/wgit/document_extractors.rb +15 -1
- data/lib/wgit/dsl.rb +16 -20
- data/lib/wgit/indexer.rb +157 -63
- data/lib/wgit/logger.rb +1 -1
- data/lib/wgit/response.rb +21 -6
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +118 -51
- data/lib/wgit/utils.rb +81 -28
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +1 -0
- metadata +33 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5e80ab519d9f55f759df4a97873a3cb33be37f3270e408ad8f3f1cf96bd762bc
+  data.tar.gz: baf78c4fe1e30d49847dd44f1c4f3a05104db3ee57886cc2da329e3ad17ddd4f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4701b737d24d38b3a9cc27b4524556487c22ce135bfb63b162e75c8f08175b6558b2c96eca7a022bf368aef3a8d37d9d072ebbf75bdd80a29bf7c996795a406c
+  data.tar.gz: 026206a073b0e3465778db5e5a430e2830c64204d6d18ff38e049bfa3668dd62330b031367fb447e7554b0336e41893ea96eca52a844cdd7a5e94845b4e3a3b8
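A `.gem` file is itself a tar archive whose `metadata.gz` and `data.tar.gz` entries are what `checksums.yaml` above digests (the old hash values were not preserved in this diff). A minimal Ruby sketch to recompute the SHA256 values for comparison; the local filename is illustrative:

```ruby
require 'digest'
require 'rubygems/package'

# Recompute the SHA256 of each entry inside the downloaded gem and
# compare against the values listed in checksums.yaml above.
File.open('wgit-0.11.0.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)

    puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
  end
end
```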
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,6 @@
 # Wgit Change Log
 
-## v0.0.0 (TEMPLATE - DO NOT EDIT)
+## v0.0.0 [- BREAKING CHANGES] (TEMPLATE - DO NOT EDIT)
 ### Added
 - ...
 ### Changed/Removed
@@ -9,6 +9,49 @@
 - ...
 ---
 
+## v0.11.0 - BREAKING CHANGES
+This release is a biggie, with the main headline being the introduction of robots.txt support (see below). This release introduces several breaking changes, so take care when updating your current version of Wgit.
+### Added
+- Ability to prevent indexing via `robots.txt`, `noindex` values in HTML `meta` elements and the HTTP response header `X-Robots-Tag`. See the new class `Wgit::RobotsParser` and the updated `Wgit::Indexer#index_*` methods. Also see the [wiki article](https://github.com/michaeltelford/wgit/wiki/How-To-Prevent-Indexing) on the subject.
+- `Wgit::RobotsParser` class for parsing `robots.txt` files.
+- `Wgit::Response#no_index?` and `Wgit::Document#no_index?` methods (see the wiki article above).
+- Two new default extractors which extract robots meta elements for use in `Wgit::Document#no_index?`.
+- `Wgit::Document.to_h_ignore_vars` Array for user manipulation.
+- `Wgit::Utils.pprint` method to aid debugging.
+- `Wgit::Utils.sanitize_url` method.
+- `Wgit::Indexer#index_www(max_urls_per_iteration:, ...)` param.
+- `Wgit::Url#redirects` and `#redirects=` methods.
+- `Wgit::Url#redirects_journey`, used by `Wgit::Indexer` to insert a Url and its redirects.
+- `Wgit::Database#bulk_upsert`, which `Wgit::Indexer` now uses where possible. This reduces the total number of database calls made during an index operation.
+### Changed/Removed
+- Updated `Wgit::Indexer#index_*` methods to honour index prevention methods (see the [wiki article](https://github.com/michaeltelford/wgit/wiki/How-To-Prevent-Indexing)).
+- Updated `Wgit::Utils.sanitize*` methods so they no longer modify the receiver.
+- Updated `Wgit::Crawler#crawl_url` to always return the crawled `Wgit::Document`. If relying on `nil` in your code, you should now use `doc.empty?` instead.
+- Updated `Wgit::Indexer` method logs.
+- Updated/added custom class `#inspect` methods.
+- Renamed `Wgit::Utils.printf_search_results` to `pprint_search_results`.
+- Renamed `Wgit::Url#concat` to `#join`. The `#concat` method is now `String#concat`.
+- Updated `Wgit::Indexer` methods to now write external Urls to the Database as `doc.external_urls.map(&:to_origin)`, meaning `http://example.com/about` becomes `http://example.com`.
+- Updated the following methods to no longer omit trailing slashes from Urls: `Wgit::Url` - `#to_path`, `#omit_base`, `#omit_origin` and `Wgit::Document` - `#internal_links`, `#internal_absolute_links`, `#external_links`. For an average website, this results in ~30% fewer network requests when crawling.
+- Updated Ruby version to `3.3.0`.
+- Updated all bundle dependencies to their latest versions; see `Gemfile.lock` for exact versions.
+### Fixed
+- `Wgit::Crawler#crawl_site` now internally records all redirects for a given Url.
+- `Wgit::Crawler#crawl_site` infinite loop when using Wgit on a Ruby version > `3.0.2`.
+- Various other minor fixes/improvements throughout the code base.
+---
+
+## v0.10.8
+### Added
+- Custom `#inspect` methods for the `Wgit::Url` and `Wgit::Document` classes.
+- `Document.remove_extractors` method, which removes all default and defined extractors.
+
+### Changed/Removed
+- ...
+### Fixed
+- ...
+---
+
 ## v0.10.7
 ### Added
 - ...
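Of the additions above, the robots.txt support is the headline. A rough sketch of how it might be used directly; the `Wgit::RobotsParser` accessor names below are assumptions, since the class itself isn't shown in this diff:

```ruby
require 'wgit'

url     = Wgit::Url.new('http://quotes.toscrape.com')
crawler = Wgit::Crawler.new

# Fetch and parse the site's robots.txt manually. Note that the
# Wgit::Indexer#index_* methods now do all of this for you.
robots = crawler.crawl_url(url.join('/robots.txt'))
parser = Wgit::RobotsParser.new(robots.html) # Accessor names assumed.

unless parser.no_index?
  crawler.crawl_site(url, disallow_paths: parser.disallow_paths) do |doc|
    puts doc.url unless doc.empty? || doc.no_index?
  end
end
```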
data/CONTRIBUTING.md
CHANGED
@@ -12,7 +12,7 @@ Before you make a contribution, reach out to michael.telford@live.com about what
 - Write some code
 - Re-run the tests (which now hopefully pass)
 - Push your branch to your `origin` remote
-- Open a GitHub Pull Request (with the target branch
+- Open a GitHub Pull Request (with the target branch as wgit's (upstream) `master`)
 - Apply any requested changes
 - Wait for your PR to be merged
 
data/README.md
CHANGED
@@ -62,7 +62,23 @@ end
 puts JSON.generate(quotes)
 ```
 
-
+Which outputs:
+
+```text
+[
+  {
+    "quote": "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”",
+    "author": "Jane Austen"
+  },
+  {
+    "quote": "“A day without sunshine is like, you know, night.”",
+    "author": "Steve Martin"
+  },
+  ...
+]
+```
+
 Great! But what if we want to crawl and store the content in a database, so that it can be searched? Wgit makes it easy to index and search HTML using [MongoDB](https://www.mongodb.com/):
 
 ```ruby
 require 'wgit'
@@ -89,6 +105,8 @@ The `search` call (on the last line) will return and output the results:
 Quotes to Scrape
 “I am free of all prejudice. I hate everyone equally. ”
 http://quotes.toscrape.com/tag/humor/page/2/
+
+...
 ```
 
 Using a MongoDB [client](https://robomongo.org/), we can see that the two web pages have been indexed, along with their extracted *quotes* and *authors*:
@@ -146,6 +164,8 @@ indexer = Wgit::Indexer.new
 indexer.index_site(wiki, **opts)
 ```
 
+- Wgit's built-in indexing methods will, by default, honour a site's `robots.txt` rules. There's also a handy robots.txt parser that you can use in your own code.
+
 ## Why Not Wgit?
 
 So why might you not use Wgit, I hear you ask?
@@ -219,7 +239,7 @@ And you're good to go!
 
 ### Tooling
 
-Wgit uses the [`toys`](https://github.com/dazuma/toys) gem (instead of Rake) for task invocation. For a full list of available tasks a.k.a. tools, run `toys --tools`. You can search for a tool using `toys -s tool_name`. The most commonly used tools are listed below...
+Wgit uses the [`toys`](https://github.com/dazuma/toys) gem (instead of Rake) for task invocation. Always run `toys` as `bundle exec toys`. For a full list of available tasks a.k.a. tools, run `toys --tools`. You can search for a tool using `toys -s tool_name`. The most commonly used tools are listed below...
 
 Run `toys db` to see a list of database related tools, enabling you to run a Mongo DB instance locally using Docker. Run `toys test` to execute the tests.
 
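Tying the README's indexing and searching together, a rough end-to-end sketch. It assumes a local MongoDB instance and the `WGIT_CONNECTION_STRING` env var that wgit's docs describe; the `#db` accessor on the indexer is also an assumption, not shown in this diff:

```ruby
require 'wgit'

# Assumes MongoDB is running locally, e.g. started via the `toys db` tooling.
ENV['WGIT_CONNECTION_STRING'] ||= 'mongodb://localhost:27017/wgit'

indexer = Wgit::Indexer.new
indexer.index_site(Wgit::Url.new('http://quotes.toscrape.com'))

# Search the indexed pages, printing each matching document's URL.
indexer.db.search('prejudice') { |doc| puts doc.url }
```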
data/bin/wgit
CHANGED
@@ -5,6 +5,7 @@ require 'wgit'
 # Eval .wgit.rb file (if it exists somewhere).
 def eval_wgit(filepath = nil)
   puts 'Searching for .wgit.rb file in local and home directories...'
+  success = false
 
   [filepath, Dir.pwd, Dir.home].each do |dir|
     path = "#{dir}/.wgit.rb"
@@ -13,11 +14,12 @@ def eval_wgit(filepath = nil)
     puts "Eval'ing #{path}"
     puts 'Call `eval_wgit` after changes to re-eval the file'
     eval(File.read(path))
+    success = true
 
     break
   end
 
-
+  success
 end
 
 eval_wgit
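For context, `.wgit.rb` is an ordinary Ruby file that the executable above evals on startup (now reporting, via the boolean return value, whether one was found). A minimal illustrative example of such a file; its contents are entirely hypothetical:

```ruby
# ~/.wgit.rb - eval'd by the `wgit` executable on startup.
# Define any helpers you want available in your wgit session.
def my_crawler
  @my_crawler ||= Wgit::Crawler.new(timeout: 10)
end

puts 'Loaded custom wgit helpers'
```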
data/lib/wgit/assertable.rb
CHANGED
data/lib/wgit/crawler.rb
CHANGED
@@ -5,7 +5,6 @@ require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
-require 'set'
 require 'benchmark'
 require 'typhoeus'
 require 'ferrum'
@@ -70,6 +69,8 @@ module Wgit
     # @param parse_javascript [Boolean] Whether or not to parse the Javascript
     #   of the crawled document. Parsing requires Chrome/Chromium to be
     #   installed and in $PATH.
+    # @param parse_javascript_delay [Integer] The delay time given to a page's
+    #   JS to update the DOM. After the delay, the HTML is crawled.
     def initialize(redirect_limit: 5, timeout: 5, encode: true,
                    parse_javascript: false, parse_javascript_delay: 1)
       @redirect_limit = redirect_limit
@@ -86,8 +87,6 @@ module Wgit
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
-    # Note that each path must NOT start with a slash; the only exception being
-    # a `/` on its own with no other characters, referring to the index page.
     #
     # Only redirects to the same host are followed. For example, the Url
     # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
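A short illustration of the allow/disallow glob filtering described in the doc comments above; the site and glob patterns are made up:

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://github.com/michaeltelford/wgit/wiki')

# Only follow links matching the allow glob; drop any matching the disallow.
externals = crawler.crawl_site(
  url,
  allow_paths: 'michaeltelford/wgit/wiki/*',
  disallow_paths: '*/_history'
) do |doc|
  puts doc.url unless doc.empty?
end

puts "#{externals.size} external link(s) found"
```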
@@ -118,37 +117,35 @@
       url, follow: :default, allow_paths: nil, disallow_paths: nil, &block
     )
       doc = crawl_url(url, &block)
-      return nil if doc.nil?
+      return nil if doc.empty?
 
-      link_opts = {
-        xpath: follow,
-        allow_paths: allow_paths,
-        disallow_paths: disallow_paths
-      }
-      alt_url = url.end_with?('/') ? url.chop : url + '/'
+      total_pages = 1
+      link_opts = { xpath: follow, allow_paths:, disallow_paths: }
 
-      crawled   = Set.new([url, alt_url])
+      crawled   = Set.new(url.redirects_journey)
       externals = Set.new(doc.external_links)
       internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
       loop do
-        links = internals - crawled
+        links = subtract_links(internals, crawled)
         break if links.empty?
 
         links.each do |link|
-          orig_link = link.dup
           doc = crawl_url(link, follow_redirects: :host, &block)
 
-          crawled += [orig_link, link]
-          next if doc.nil?
+          crawled += link.redirects_journey
+          next if doc.empty?
 
-          internals += next_internal_links(doc, **link_opts)
-          externals += doc.external_links
+          total_pages += 1
+          internals += next_internal_links(doc, **link_opts)
+          externals += doc.external_links
        end
      end
 
+      Wgit.logger.debug("Crawled #{total_pages} documents for the site: #{url}")
+
       externals.to_a
     end
 
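To see what the reworked `#crawl_site` gives a caller, a hedged sketch; the printed shape of `#redirects` is an assumption, as that method isn't shown in this diff:

```ruby
require 'wgit'

crawler = Wgit::Crawler.new
url     = Wgit::Url.new('http://txti.es') # Illustrative site.

crawler.crawl_site(url) do |doc|
  # Every page is yielded; #empty? is true when its crawl failed.
  status = doc.empty? ? 'FAIL' : 'OK'
  puts "#{status} #{doc.url}"
end

# Each crawled Url now records its redirects, e.g. http -> https hops.
puts url.redirects.inspect # Assumed to be a Hash of from => to redirects.
```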
@@ -169,7 +166,7 @@ module Wgit
     def crawl_urls(*urls, follow_redirects: true, &block)
       raise 'You must provide at least one Url' if urls.empty?
 
-      opts = { follow_redirects: follow_redirects }
+      opts = { follow_redirects: }
       doc = nil
 
       Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
@@ -189,19 +186,19 @@ module Wgit
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     #   crawl was successful or not. Therefore, Document#url etc. can be used.
     #   Use `doc.empty?` to determine if the page is valid.
-    # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
-    #   crawl was unsuccessful.
+    # @return [Wgit::Document] The crawled HTML Document. Check if the crawl
+    #   was successful with doc.empty? (true if unsuccessful).
     def crawl_url(url, follow_redirects: true)
       # A String url isn't allowed because it's passed by value not reference,
       # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
       assert_type(url, Wgit::Url)
 
-      html = fetch(url, follow_redirects: follow_redirects)
+      html = fetch(url, follow_redirects:)
       doc = Wgit::Document.new(url, html, encode: @encode)
 
       yield(doc) if block_given?
 
-      doc if html
+      doc
     end
 
     protected
@@ -226,7 +223,7 @@ module Wgit
       response = Wgit::Response.new
       raise "Invalid url: #{url}" if url.invalid?
 
-      resolve(url, response, follow_redirects: follow_redirects)
+      resolve(url, response, follow_redirects:)
       get_browser_response(url, response) if @parse_javascript
 
       response.body_or_nil
@@ -238,6 +235,9 @@ module Wgit
       url.crawled = true # Sets date_crawled underneath.
       url.crawl_duration = response.total_time
 
+      # Don't override previous url.redirects if response is fully resolved.
+      url.redirects = response.redirects unless response.redirects.empty?
+
       @last_response = response
     end
 
@@ -253,7 +253,7 @@ module Wgit
     #   :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-      origin = url.
+      origin = url.to_origin # Record the origin before any redirects.
       follow_redirects, within = redirect?(follow_redirects)
 
       loop do
@@ -277,7 +277,7 @@ module Wgit
         if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.to_origin.concat(location) if location.relative?
+        location = url.to_origin.join(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -420,6 +420,27 @@ module Wgit
 
     private
 
+    # Manually does the following: `links = internals - crawled`.
+    # This is needed due to an apparent bug in Set<Url> (when upgrading from
+    # Ruby v3.0.2 to v3.3.0) causing an infinite crawl loop in #crawl_site.
+    # TODO: Check in future Ruby versions and remove this method when fixed.
+    def subtract_links(internals, crawled)
+      links = Set.new
+
+      internals.each do |internal_url|
+        already_crawled = false
+
+        crawled.each do |crawled_url|
+          already_crawled = internal_url == crawled_url
+          break if already_crawled
+        end
+
+        links.add(internal_url) unless already_crawled
+      end
+
+      links
+    end
+
     # Returns the next links used to continue crawling a site. The xpath value
     # is used to obtain the links. Any valid URL Strings will be converted into
     # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
@@ -431,7 +452,8 @@ module Wgit
           .compact
       end
 
-      if links.any? { |link| link.to_domain != doc.url.to_domain }
+      doc_domain = doc.url.to_domain
+      if links.any? { |link| link.to_domain != doc_domain }
         raise 'The links to follow must be within the site domain'
       end
 
@@ -458,12 +480,12 @@ module Wgit
 
     # Validate and filter by the given URL paths.
     def process_paths(links, allow_paths, disallow_paths)
-      if allow_paths
+      if allow_paths && !allow_paths.empty?
         paths = validate_paths(allow_paths)
         filter_links(links, :select!, paths)
       end
 
-      if disallow_paths
+      if disallow_paths && !disallow_paths.empty?
         paths = validate_paths(disallow_paths)
         filter_links(links, :reject!, paths)
       end
@@ -477,7 +499,7 @@ module Wgit
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.sanitize(paths, encode: false)
+      paths = Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
       paths.map do |path|
@@ -491,7 +513,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint.index? ? '/' : link.omit_base
+        link = link.to_endpoint.index? ? '/' : link.omit_base.omit_trailing_slash
 
         match = false
         paths.each do |pattern|
@@ -532,9 +554,9 @@ module Wgit
     )
     end
 
-    alias crawl crawl_urls
-    alias crawl_pages crawl_urls
-    alias crawl_page crawl_url
-    alias crawl_r crawl_site
+    alias_method :crawl, :crawl_urls
+    alias_method :crawl_pages, :crawl_urls
+    alias_method :crawl_page, :crawl_url
+    alias_method :crawl_r, :crawl_site
   end
 end
data/lib/wgit/database/database.rb
CHANGED
@@ -162,20 +162,20 @@ module Wgit
     #   Wgit::Document>] The records to insert/create.
     # @raise [StandardError] If data isn't valid.
     def insert(data)
-      data = data.dup # Avoid modifying by reference.
       collection = nil
+      request_obj = nil
 
-      if data.respond_to?(:map!)
-        data.map! do |obj|
+      if data.respond_to?(:map)
+        request_obj = data.map do |obj|
           collection, _, model = get_type_info(obj)
           model
         end
       else
         collection, _, model = get_type_info(data)
-        data = model
+        request_obj = model
       end
 
-      create(collection, data)
+      create(collection, request_obj)
     end
 
     # Inserts or updates the object in the database.
@@ -183,7 +183,7 @@ module Wgit
     # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
     # @return [Boolean] True if inserted, false if updated.
     def upsert(obj)
-      collection, query, model = get_type_info(obj.dup)
+      collection, query, model = get_type_info(obj)
       data_hash = model.merge(Wgit::Model.common_update_data)
       result = @client[collection].replace_one(query, data_hash, upsert: true)
 
@@ -192,6 +192,36 @@ module Wgit
       @last_result = result
     end
 
+    # Bulk upserts the objects in the database collection.
+    # You cannot mix collection objs types, all must be Urls or Documents.
+    #
+    # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
+    #   inserted/updated.
+    # @return [Integer] The total number of upserted objects.
+    def bulk_upsert(objs)
+      assert_arr_types(objs, [Wgit::Url, Wgit::Document])
+      raise 'objs is empty' if objs.empty?
+
+      collection = nil
+      request_objs = objs.map do |obj|
+        collection, query, model = get_type_info(obj)
+        data_hash = model.merge(Wgit::Model.common_update_data)
+
+        {
+          update_many: {
+            filter: query,
+            update: { '$set' => data_hash },
+            upsert: true
+          }
+        }
+      end
+
+      result = @client[collection].bulk_write(request_objs)
+      result.upserted_count + result.modified_count
+    ensure
+      @last_result = result
+    end
+
     ### Retrieve Data ###
 
     # Returns all Document records from the DB. Use #search to filter based on
@@ -205,14 +235,14 @@ module Wgit
     # @yield [doc] Given each Document object (Wgit::Document) returned from
     #   the DB.
     # @return [Array<Wgit::Document>] The Documents obtained from the DB.
-    def docs(limit: 0, skip: 0)
+    def docs(limit: 0, skip: 0, &block)
       results = retrieve(DOCUMENTS_COLLECTION, {},
-                         sort: { date_added: 1 }, limit: limit, skip: skip)
+                         sort: { date_added: 1 }, limit:, skip:)
       return [] if results.count < 1 # results#empty? doesn't exist.
 
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |doc_hash| Wgit::Document.new(doc_hash) }
-      results.each { |doc| yield(doc) } if block_given?
+      results.each(&block) if block_given?
 
       results
     end
@@ -227,17 +257,16 @@ module Wgit
     # @param skip [Integer] Skip n amount of Url's.
     # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
     # @return [Array<Wgit::Url>] The Urls obtained from the DB.
-    def urls(crawled: nil, limit: 0, skip: 0)
-      query = crawled.nil? ? {} : { crawled: crawled }
+    def urls(crawled: nil, limit: 0, skip: 0, &block)
+      query = crawled.nil? ? {} : { crawled: }
       sort = { date_added: 1 }
 
-      results = retrieve(URLS_COLLECTION, query,
-                         sort: sort, limit: limit, skip: skip)
+      results = retrieve(URLS_COLLECTION, query, sort:, limit:, skip:)
      return [] if results.count < 1 # results#empty? doesn't exist.
 
       # results.respond_to? :map! is false so we use map and overwrite the var.
       results = results.map { |url_doc| Wgit::Url.new(url_doc) }
-      results.each { |url| yield(url) } if block_given?
+      results.each(&block) if block_given?
 
       results
     end
@@ -249,7 +278,7 @@ module Wgit
     # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
     # @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
     def crawled_urls(limit: 0, skip: 0, &block)
-      urls(crawled: true, limit: limit, skip: skip, &block)
+      urls(crawled: true, limit:, skip:, &block)
     end
 
     # Returned Url records that haven't yet been crawled.
@@ -259,7 +288,7 @@ module Wgit
     # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
     # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
     def uncrawled_urls(limit: 0, skip: 0, &block)
-      urls(crawled: false, limit: limit, skip: skip, &block)
+      urls(crawled: false, limit:, skip:, &block)
     end
 
     # Searches the database's Documents for the given query.
@@ -286,19 +315,21 @@ module Wgit
       query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
     )
       query = query.to_s.strip
-      query.replace(
+      query.replace("\"#{query}\"") if whole_sentence
 
       # Sort based on the most search hits (aka "textScore").
       # We use the sort_proj hash as both a sort and a projection below.
       sort_proj = { score: { :$meta => 'textScore' } }
-      query = {
-        :$text => { :$search => query, :$caseSensitive => case_sensitive }
-      }
+      query = {
+        :$text => {
+          :$search => query,
+          :$caseSensitive => case_sensitive
+        }
+      }
 
       results = retrieve(DOCUMENTS_COLLECTION, query,
                          sort: sort_proj, projection: sort_proj,
-                         limit: limit, skip: skip)
+                         limit:, skip:)
 
       results.map do |mongo_doc|
         doc = Wgit::Document.new(mongo_doc)
@@ -328,21 +359,10 @@ module Wgit
       query, case_sensitive: false, whole_sentence: true,
       limit: 10, skip: 0, sentence_limit: 80
     )
-      results = search(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        limit: limit,
-        skip: skip
-      )
+      results = search(query, case_sensitive:, whole_sentence:, limit:, skip:)
 
       results.each do |doc|
-        doc.search!(
-          query,
-          case_sensitive: case_sensitive,
-          whole_sentence: whole_sentence,
-          sentence_limit: sentence_limit
-        )
+        doc.search!(query, case_sensitive:, whole_sentence:, sentence_limit:)
         yield(doc) if block_given?
       end
 
@@ -373,26 +393,16 @@ module Wgit
       query, case_sensitive: false, whole_sentence: true,
       limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
     )
-      results = search(
-        query,
-        case_sensitive: case_sensitive,
-        whole_sentence: whole_sentence,
-        limit: limit,
-        skip: skip
-      )
+      results = search(query, case_sensitive:, whole_sentence:, limit:, skip:)
 
       results
         .map do |doc|
          yield(doc) if block_given?
 
+          # Only return result if its text has a match - compact is called below.
          results = doc.search(
-            query,
-            case_sensitive: case_sensitive,
-            whole_sentence: whole_sentence,
-            sentence_limit: sentence_limit
+            query, case_sensitive:, whole_sentence:, sentence_limit:
          )
-
-          # Only return result if its text has a match - compact is called below.
          next nil if results.empty?
 
          [doc.url, (top_result_only ? results.first : results)]
@@ -443,7 +453,7 @@ module Wgit
     # @return [Boolean] True if url exists, otherwise false.
     def url?(url)
       assert_type(url, String) # This includes Wgit::Url's.
-      query = { url: url }
+      query = { url: }
       retrieve(URLS_COLLECTION, query, limit: 1).any?
     end
 
@@ -490,7 +500,7 @@ module Wgit
     # @raise [StandardError] If the obj is not valid.
     # @return [Integer] The number of updated records/objects.
     def update(obj)
-      collection, query, model = get_type_info(obj.dup)
+      collection, query, model = get_type_info(obj)
       data_hash = model.merge(Wgit::Model.common_update_data)
 
       mutate(collection, query, { '$set' => data_hash })
@@ -554,6 +564,8 @@ module Wgit
     # @return [Array<Symbol, Hash>] The collection type, query to get
     #   the record/obj from the database (if it exists) and the model of obj.
     def get_type_info(obj)
+      obj = obj.dup
+
       case obj
       when Wgit::Url
         collection = URLS_COLLECTION
@@ -661,7 +673,7 @@ module Wgit
       @last_result = result
     end
 
-    alias num_objects num_records
-    alias clear_db! clear_db
+    alias_method :num_objects, :num_records
+    alias_method :clear_db!, :clear_db
   end
 end
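Finally, a rough sketch of the new `Database#bulk_upsert` from the hunk above; the connection string and Urls are illustrative:

```ruby
require 'wgit'

db   = Wgit::Database.new('mongodb://localhost:27017/wgit')
urls = %w[https://example.com https://example.org].map { |u| Wgit::Url.new(u) }

# A single bulk_write round trip replaces N separate #upsert calls,
# which is how Wgit::Indexer now cuts down its database calls.
count = db.bulk_upsert(urls)
puts "Upserted #{count} url(s)"
```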