wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
data/lib/wgit/crawler.rb
CHANGED
@@ -5,7 +5,6 @@ require_relative 'document'
 require_relative 'utils'
 require_relative 'assertable'
 require_relative 'response'
-require 'set'
 require 'benchmark'
 require 'typhoeus'
 require 'ferrum'
@@ -55,6 +54,11 @@ module Wgit
     # The value should balance between a good UX and enough JS parse time.
     attr_accessor :parse_javascript_delay
 
+    # The opts Hash passed directly to the ferrum Chrome browser when
+    # `parse_javascript: true`.
+    # See https://github.com/rubycdp/ferrum for details.
+    attr_accessor :ferrum_opts
+
     # The Wgit::Response of the most recently crawled URL.
     attr_reader :last_response
 
@@ -70,13 +74,31 @@ module Wgit
     # @param parse_javascript [Boolean] Whether or not to parse the Javascript
     # of the crawled document. Parsing requires Chrome/Chromium to be
     # installed and in $PATH.
+    # @param parse_javascript_delay [Integer] The delay time given to a page's
+    # JS to update the DOM. After the delay, the HTML is crawled.
     def initialize(redirect_limit: 5, timeout: 5, encode: true,
-                   parse_javascript: false, parse_javascript_delay: 1
+                   parse_javascript: false, parse_javascript_delay: 1,
+                   ferrum_opts: {})
+      assert_type(redirect_limit, Integer)
+      assert_type(timeout, [Integer, Float])
+      assert_type(encode, [TrueClass, FalseClass])
+      assert_type(parse_javascript, [TrueClass, FalseClass])
+      assert_type(parse_javascript_delay, Integer)
+      assert_type(ferrum_opts, Hash)
+
       @redirect_limit = redirect_limit
       @timeout = timeout
       @encode = encode
       @parse_javascript = parse_javascript
      @parse_javascript_delay = parse_javascript_delay
+      @ferrum_opts = default_ferrum_opts.merge(ferrum_opts)
+    end
+
+    # Overrides String#inspect to shorten the printed output of a Crawler.
+    #
+    # @return [String] A short textual representation of this Crawler.
+    def inspect
+      "#<Wgit::Crawler timeout=#{@timeout} redirect_limit=#{@redirect_limit} encode=#{@encode} parse_javascript=#{@parse_javascript} parse_javascript_delay=#{@parse_javascript_delay} ferrum_opts=#{@ferrum_opts}>"
     end
 
     # Crawls an entire website's HTML pages by recursively going through
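The new `ferrum_opts:` keyword is merged over the crawler's defaults and handed to the headless Chrome browser when `parse_javascript: true`. A minimal construction sketch based on the signature above (the option values are illustrative, not gem defaults):

```ruby
require 'wgit'

# parse_javascript requires Chrome/Chromium in $PATH; ferrum_opts is
# forwarded to Ferrum::Browser.new.
crawler = Wgit::Crawler.new(
  timeout: 10,
  parse_javascript: true,
  parse_javascript_delay: 2,
  ferrum_opts: { headless: false } # e.g. watch the browser while debugging
)
```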
@@ -86,8 +108,6 @@ module Wgit
     #
     # Use the allow and disallow paths params to partially and selectively
     # crawl a site; the glob syntax is fully supported e.g. `'wiki/\*'` etc.
-    # Note that each path must NOT start with a slash; the only exception being
-    # a `/` on its own with no other characters, referring to the index page.
     #
     # Only redirects to the same host are followed. For example, the Url
     # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
@@ -104,6 +124,7 @@ module Wgit
     # the crawl. This changes how a site is crawled. Only links pointing to
     # the site domain are allowed. The `:default` is any `<a>` href returning
     # HTML.
+    # @param max_pages [Integer]
     # @param allow_paths [String, Array<String>] Filters the `follow:` links by
     # selecting them if their path `File.fnmatch?` one of allow_paths.
     # @param disallow_paths [String, Array<String>] Filters the `follow` links
@@ -115,40 +136,48 @@ module Wgit
     # from all of the site's pages or nil if the given url could not be
     # crawled successfully.
     def crawl_site(
-      url, follow: :default,
+      url, follow: :default, max_pages: nil,
+      allow_paths: nil, disallow_paths: nil, &block
     )
       doc = crawl_url(url, &block)
-      return nil if doc.
+      return nil if doc.empty?
 
-
-
-
-      disallow_paths: disallow_paths
-      }
-      alt_url = url.end_with?('/') ? url.chop : url + '/'
+      total_pages = 1
+      limit_reached = max_pages && total_pages >= max_pages
+      link_opts = { xpath: follow, allow_paths:, disallow_paths: }
 
-      crawled = Set.new(
+      crawled = Set.new(url.redirects_journey)
       externals = Set.new(doc.external_links)
       internals = Set.new(next_internal_links(doc, **link_opts))
 
       return externals.to_a if internals.empty?
 
       loop do
-
+        if limit_reached
+          Wgit.logger.debug("Crawled and reached the max_pages limit of: #{max_pages}")
+          break
+        end
+
+        links = subtract_links(internals, crawled)
        break if links.empty?
 
         links.each do |link|
-
+          limit_reached = max_pages && total_pages >= max_pages
+          break if limit_reached
+
           doc = crawl_url(link, follow_redirects: :host, &block)
 
-          crawled +=
-          next if doc.
+          crawled += link.redirects_journey
+          next if doc.empty?
 
-
-
+          total_pages += 1
+          internals += next_internal_links(doc, **link_opts)
+          externals += doc.external_links
         end
       end
 
+      Wgit.logger.debug("Crawled #{total_pages} documents for the site: #{url}")
+
       externals.to_a
     end
 
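Based on the new `max_pages:` parameter and the existing glob path filters, usage looks roughly like this (the URL, limit and path pattern are made-up examples):

```ruby
crawler = Wgit::Crawler.new

# Stop after roughly 50 pages and only follow links matching blog/*.
external_links = crawler.crawl_site(
  Wgit::Url.new('https://example.com'),
  max_pages: 50, allow_paths: 'blog/*'
) do |doc|
  puts doc.url unless doc.empty?
end
```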
@@ -169,7 +198,7 @@ module Wgit
     def crawl_urls(*urls, follow_redirects: true, &block)
       raise 'You must provide at least one Url' if urls.empty?
 
-      opts = { follow_redirects:
+      opts = { follow_redirects: }
       doc = nil
 
       Wgit::Utils.each(urls) { |url| doc = crawl_url(url, **opts, &block) }
@@ -189,19 +218,19 @@ module Wgit
     # @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
     # crawl was successful or not. Therefore, Document#url etc. can be used.
     # Use `doc.empty?` to determine if the page is valid.
-    # @return [Wgit::Document
-    #
+    # @return [Wgit::Document] The crawled HTML Document. Check if the crawl
+    # was successful with doc.empty? (true if unsuccessful).
     def crawl_url(url, follow_redirects: true)
       # A String url isn't allowed because it's passed by value not reference,
       # meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
       assert_type(url, Wgit::Url)
 
-      html = fetch(url, follow_redirects:
+      html = fetch(url, follow_redirects:)
       doc = Wgit::Document.new(url, html, encode: @encode)
 
       yield(doc) if block_given?
 
-      doc
+      doc
     end
 
    protected
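Per the updated `@return` doc above, `crawl_url` now always returns a `Wgit::Document`; a failed crawl produces an empty document rather than nil. A small sketch (URL illustrative; `title` assumes the default title extractor and `status` assumes the usual `Wgit::Response` reader):

```ruby
crawler = Wgit::Crawler.new
doc = crawler.crawl_url(Wgit::Url.new('https://example.com/about'))

if doc.empty?
  puts "Crawl failed with status: #{crawler.last_response&.status}"
else
  puts doc.title
end
```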
@@ -226,7 +255,7 @@ module Wgit
       response = Wgit::Response.new
       raise "Invalid url: #{url}" if url.invalid?
 
-      resolve(url, response, follow_redirects:
+      resolve(url, response, follow_redirects:)
       get_browser_response(url, response) if @parse_javascript
 
       response.body_or_nil
@@ -238,6 +267,9 @@ module Wgit
       url.crawled = true # Sets date_crawled underneath.
       url.crawl_duration = response.total_time
 
+      # Don't override previous url.redirects if response is fully resolved.
+      url.redirects = response.redirects unless response.redirects.empty?
+
       @last_response = response
     end
 
@@ -253,7 +285,7 @@ module Wgit
     # :origin, :host, :domain or :brand. See Wgit::Url#relative? opts param.
     # @raise [StandardError] If a redirect isn't allowed etc.
     def resolve(url, response, follow_redirects: true)
-      origin = url.
+      origin = url.to_origin # Record the origin before any redirects.
       follow_redirects, within = redirect?(follow_redirects)
 
       loop do
@@ -277,7 +309,7 @@ module Wgit
         if response.redirect_count >= @redirect_limit
 
         # Process the location to be crawled next.
-        location = url.to_origin.
+        location = url.to_origin.join(location) if location.relative?
         response.redirections[url.to_s] = location.to_s
         url.replace(location) # Update the url on redirect.
       end
@@ -370,7 +402,7 @@ module Wgit
     # @param url [String] The url to browse to.
     # @return [Ferrum::Browser] The browser response object.
     def browser_get(url)
-      @browser ||= Ferrum::Browser.new(
+      @browser ||= Ferrum::Browser.new(**@ferrum_opts)
       @browser.goto(url)
 
       # Wait for the page's JS to finish dynamically manipulating the DOM.
@@ -420,6 +452,38 @@ module Wgit
 
     private
 
+    # The default opts which are merged with the user's ferrum_opts: and then
+    # passed directly to the ferrum Chrome browser.
+    def default_ferrum_opts
+      {
+        timeout: @timeout,
+        process_timeout: 10,
+        headless: true
+      }
+    end
+
+    # Manually does the following: `links = internals - crawled`.
+    # This is needed due to an apparent bug in Set<Url> (when upgrading from
+    # Ruby v3.0.2 to v3.3.0) causing an infinite crawl loop in #crawl_site.
+    # Run in a shell to test: bundle exec toys test infinite_crawl_loop
+    # TODO: Check in future Ruby versions and remove this method when fixed.
+    def subtract_links(internals, crawled)
+      links = Set.new
+
+      internals.each do |internal_url|
+        already_crawled = false
+
+        crawled.each do |crawled_url|
+          already_crawled = internal_url == crawled_url
+          break if already_crawled
+        end
+
+        links.add(internal_url) unless already_crawled
+      end
+
+      links
+    end
+
     # Returns the next links used to continue crawling a site. The xpath value
     # is used to obtain the links. Any valid URL Strings will be converted into
     # absolute Wgit::Urls. Invalid URLs will be silently dropped. Any link not
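Given `default_ferrum_opts` above, any user-supplied `ferrum_opts:` are layered over `timeout:`, `process_timeout: 10` and `headless: true`. The merge behaves as follows (the values shown follow directly from the code above):

```ruby
crawler = Wgit::Crawler.new(timeout: 10, ferrum_opts: { headless: false })

crawler.ferrum_opts
# => { timeout: 10, process_timeout: 10, headless: false }
```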
@@ -431,7 +495,8 @@ module Wgit
         .compact
       end
 
-
+      doc_domain = doc.url.to_domain
+      if links.any? { |link| link.to_domain != doc_domain }
         raise 'The links to follow must be within the site domain'
       end
 
@@ -458,12 +523,12 @@ module Wgit
 
     # Validate and filter by the given URL paths.
     def process_paths(links, allow_paths, disallow_paths)
-      if allow_paths
+      if allow_paths && !allow_paths.empty?
         paths = validate_paths(allow_paths)
         filter_links(links, :select!, paths)
       end
 
-      if disallow_paths
+      if disallow_paths && !disallow_paths.empty?
         paths = validate_paths(disallow_paths)
         filter_links(links, :reject!, paths)
       end
@@ -477,7 +542,7 @@ module Wgit
       raise 'The provided paths must all be Strings' \
         unless paths.all? { |path| path.is_a?(String) }
 
-      Wgit::Utils.sanitize(paths, encode: false)
+      paths = Wgit::Utils.sanitize(paths, encode: false)
       raise 'The provided paths cannot be empty' if paths.empty?
 
       paths.map do |path|
@@ -491,7 +556,7 @@ module Wgit
     def filter_links(links, filter_method, paths)
       links.send(filter_method) do |link|
         # Turn http://example.com into / meaning index.
-        link = link.to_endpoint.index? ? '/' : link.omit_base
+        link = link.to_endpoint.index? ? '/' : link.omit_base.omit_trailing_slash
 
         match = false
         paths.each do |pattern|
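The filter above normalises each link (index pages become `/`, trailing slashes are dropped) before glob-matching it against the configured paths; as the `allow_paths`/`disallow_paths` docs note, matching is `File.fnmatch?` style, e.g.:

```ruby
File.fnmatch?('wiki/*', 'wiki/Main_Page') # => true
File.fnmatch?('wiki/*', 'blog/post-1')    # => false
```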
@@ -532,9 +597,9 @@ module Wgit
       )
     end
 
-
-
-
-
+    alias_method :crawl, :crawl_urls
+    alias_method :crawl_pages, :crawl_urls
+    alias_method :crawl_page, :crawl_url
+    alias_method :crawl_r, :crawl_site
   end
 end
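The `alias_method` calls above keep the shorthand crawl names pointing at the main methods, so for example:

```ruby
crawler = Wgit::Crawler.new
url     = Wgit::Url.new('https://example.com')

crawler.crawl_r(url)    # equivalent to crawler.crawl_site(url)
crawler.crawl_page(url) # equivalent to crawler.crawl_url(url)
```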
data/lib/wgit/database/adapters/in_memory.rb
ADDED
@@ -0,0 +1,204 @@
+require_relative "../../utils"
+require_relative "../../url"
+require_relative "../../document"
+require_relative "../../model"
+require_relative "../database_adapter"
+
+module Wgit::Database
+  # Database implementer class for in-memory (RAM) storage. This DB is mainly used
+  # for testing and experimenting with. This DB is thread safe.
+  class InMemory < DatabaseAdapter
+    # Initializes a thread safe InMemory Database instance.
+    #
+    # @param connection_string [String] Not used but needed to adhere to the
+    # DatabaseAdapter interface.
+    def initialize(connection_string = nil)
+      # Inits @urls and @docs vars.
+      initialize_store
+
+      super
+    end
+
+    # Overrides String#inspect to display collection sizes.
+    #
+    # @return [String] A short textual representation of this object.
+    def inspect
+      "#<Wgit::Database::InMemory num_urls=#{@urls.size} \
+num_docs=#{@docs.size} size=#{size}>"
+    end
+
+    # The Wgit::Url's collection stored as an in-memory Concurrent::Array.
+    def urls(&block)
+      map_urls(@urls, &block)
+    end
+
+    # The Wgit::Document's collection stored as an in-memory Concurrent::Array.
+    def docs(&block)
+      map_documents(@docs, &block)
+    end
+
+    # The raw url Hashes, not mapped into their corresponding Wgit objects.
+    def url_hashes
+      @urls
+    end
+
+    # The raw doc Hashes, not mapped into their corresponding Wgit objects.
+    def doc_hashes
+      @docs
+    end
+
+    # Returns the current size of the in-memory database.
+    # An empty database will return a size of 4 because there are 4 bytes in
+    # two empty arrays (urls and docs collections).
+    #
+    # @return [Integer] The current size of the in-memory DB.
+    def size
+      @urls.to_s.size + @docs.to_s.size
+    end
+
+    # Searches the database's Document#text for the given query. The returned
+    # Documents are sorted for relevance, starting with the most relevant. Each
+    # Document's #score value will be set accordingly.
+    #
+    # @param query [Regexp, #to_s] The regex or text value to search each
+    # document's @text for.
+    # @param case_sensitive [Boolean] Whether character case must match.
+    # @param whole_sentence [Boolean] Whether multiple words should be searched
+    # for separately.
+    # @param limit [Integer] The max number of results to return.
+    # @param skip [Integer] The number of results to skip.
+    # @yield [doc] Given each search result (Wgit::Document) returned from the
+    # DB.
+    # @return [Array<Wgit::Document>] The search results obtained from the DB.
+    def search(
+      query, case_sensitive: false, whole_sentence: true,
+      limit: 10, skip: 0, &block
+    )
+      regex = Wgit::Utils.build_search_regex(
+        query, case_sensitive:, whole_sentence:)
+
+      # Search the Wgit::Document's, not the raw Hashes.
+      results = docs.select do |doc|
+        score = 0
+        doc.search(regex, case_sensitive:, whole_sentence:) do |results_hash|
+          score = results_hash.values.sum
+        end
+        next false if score.zero?
+
+        doc.instance_variable_set :@score, score
+        true
+      end
+
+      return [] if results.empty?
+
+      results = results.sort_by { |doc| -doc.score }
+
+      results = results[skip..]
+      return [] unless results
+
+      results = results[0...limit] if limit.positive?
+      results.each(&block) if block_given?
+
+      results
+    end
+
+    # Deletes everything in the urls and documents collections.
+    #
+    # @return [Integer] The number of deleted records.
+    def empty
+      previous_size = @urls.size + @docs.size
+      initialize_store
+
+      previous_size
+    end
+
+    # Returns Url records that haven't yet been crawled.
+    #
+    # @param limit [Integer] The max number of Url's to return. 0 returns all.
+    # @param skip [Integer] Skip n amount of Url's.
+    # @yield [url] Given each Url object (Wgit::Url) returned from the DB.
+    # @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
+    def uncrawled_urls(limit: 0, skip: 0, &block)
+      uncrawled = @urls.reject { |url| url["crawled"] }
+      uncrawled = uncrawled[skip..]
+      return [] unless uncrawled
+
+      uncrawled = uncrawled[0...limit] if limit.positive?
+      map_urls(uncrawled, &block)
+    end
+
+    # Inserts or updates the object in the in-memory database.
+    #
+    # @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
+    # @return [Boolean] True if inserted, false if updated.
+    def upsert(obj)
+      collection, index, model = get_model_info(obj)
+
+      if index
+        collection[index] = model
+        false
+      else
+        collection << model
+        true
+      end
+    end
+
+    # Bulk upserts the objects in the in-memory database collection.
+    # You cannot mix collection objs types, all must be Urls or Documents.
+    #
+    # @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
+    # inserted/updated.
+    # @return [Integer] The total number of newly inserted objects.
+    def bulk_upsert(objs)
+      assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])
+
+      objs.reduce(0) do |inserted, obj|
+        inserted += 1 if upsert(obj)
+        inserted
+      end
+    end
+
+    private
+
+    # Creates a new Concurrent::Array for each collection.
+    def initialize_store
+      @urls = Concurrent::Array.new
+      @docs = Concurrent::Array.new
+    end
+
+    # Get the database's model info (collection type, index, model) for
+    # obj.
+    #
+    # Use like:
+    # ```
+    # collection, index, model = get_model_info(obj)
+    # ```
+    #
+    # Raises an error if obj isn't a Wgit::Url or Wgit::Document.
+    #
+    # @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
+    # @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
+    # @return [Array<Symbol, Hash>] The collection type, the obj's index (if in
+    # the collection, nil otherwise) and the Wgit::Model of obj.
+    def get_model_info(obj)
+      obj = obj.dup
+
+      case obj
+      when Wgit::Url
+        key = obj.to_s
+        collection = @urls
+        index = @urls.index { |url| url["url"] == key }
+        model = build_model(obj)
+      when Wgit::Document
+        key = obj.url.to_s
+        collection = @docs
+        index = @docs.index { |doc| doc["url"]&.[]("url") == key }
+        model = build_model(obj)
+      else
+        raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
+      end
+
+      [collection, index, model]
+    end
+  end
+end
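A hedged end-to-end sketch combining the new in-memory adapter with the crawler changes above (the URL is illustrative; class and method names are taken from the diff, assuming `require 'wgit'` loads the new adapter):

```ruby
require 'wgit'

db      = Wgit::Database::InMemory.new
crawler = Wgit::Crawler.new

# Index a small site into RAM, then search the stored documents.
crawler.crawl_site(Wgit::Url.new('https://example.com'), max_pages: 10) do |doc|
  db.upsert(doc) unless doc.empty?
end

db.search('example', limit: 5) do |doc|
  puts "#{doc.score} - #{doc.url}"
end
```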