wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
@@ -0,0 +1,627 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../../url'
|
4
|
+
require_relative '../../document'
|
5
|
+
require_relative '../../logger'
|
6
|
+
require_relative '../../model'
|
7
|
+
require_relative '../database_adapter'
|
8
|
+
require 'logger'
|
9
|
+
require 'mongo'
|
10
|
+
|
11
|
+
module Wgit::Database
|
12
|
+
# Database implementer class for MongoDB.
|
13
|
+
class MongoDB < DatabaseAdapter
|
14
|
+
# The default name of the urls collection.
|
15
|
+
URLS_COLLECTION = :urls
|
16
|
+
|
17
|
+
# The default name of the documents collection.
|
18
|
+
DOCUMENTS_COLLECTION = :documents
|
19
|
+
|
20
|
+
# The default name of the documents collection text search index.
|
21
|
+
TEXT_INDEX = 'text_search'
|
22
|
+
|
23
|
+
# The default name of the urls and documents collections unique index.
|
24
|
+
UNIQUE_INDEX = 'unique_url'
|
25
|
+
|
26
|
+
# The connection string for the database.
|
27
|
+
attr_reader :connection_string
|
28
|
+
|
29
|
+
# The database client object. Gets set when a connection is established.
|
30
|
+
attr_reader :client
|
31
|
+
|
32
|
+
# The raw MongoDB client result of the most recent operation.
|
33
|
+
attr_reader :last_result
|
34
|
+
|
35
|
+
# Builds a database adapter that is connected to MongoDB.
#
# The connection string is taken from the parameter when given, otherwise
# from ENV['WGIT_CONNECTION_STRING'].
#
# @param connection_string [String] The connection string needed to connect
#   to the database.
# @raise [StandardError] If no connection string is available from either
#   the parameter or the environment.
def initialize(connection_string = nil)
  connection_string = ENV['WGIT_CONNECTION_STRING'] unless connection_string
  unless connection_string
    raise "connection_string and ENV['WGIT_CONNECTION_STRING'] are nil"
  end

  @client = MongoDB.establish_connection(connection_string)
  @connection_string = connection_string

  # Bare super forwards the (possibly ENV derived) connection_string.
  super
end
|
52
|
+
|
53
|
+
# Convenience class method that simply delegates to self.new.
#
# @param connection_string [String] The connection string needed to connect
#   to the database.
# @raise [StandardError] If a connection string isn't provided, either as a
#   parameter or via the environment.
# @return [Wgit::Database::MongoDB] The connected database client.
def self.connect(connection_string = nil) = new(connection_string)
|
63
|
+
|
64
|
+
# Configures the mongo driver's logger and creates a client for the given
# connection string.
#
# @param connection_string [String] The connection string needed to connect
#   to the database.
# @raise [StandardError] If a connection cannot be established.
# @return [Mongo::Client] The connected MongoDB client.
def self.establish_connection(connection_string)
  # The driver logs through a copy of the Wgit logger, restricted to
  # errors (and more severe) only.
  mongo_logger = Wgit.logger.clone
  Mongo::Logger.logger = mongo_logger
  mongo_logger.progname = 'mongo'
  mongo_logger.level = Logger::ERROR

  # The client is created (and returned) here.
  Mongo::Client.new(connection_string)
end
|
79
|
+
|
80
|
+
### DDL ###
|
81
|
+
|
82
|
+
# Creates the urls collection followed by the documents collection.
#
# @return [nil] Always returns nil.
def create_collections
  [URLS_COLLECTION, DOCUMENTS_COLLECTION].each do |name|
    @client[name].create
  end

  nil
end
|
91
|
+
|
92
|
+
# Creates a unique index (named UNIQUE_INDEX) on each collection: on the
# 'url' field for urls and on the 'url.url' field for documents.
#
# @return [nil] Always returns nil.
def create_unique_indexes
  indexed_fields = {
    URLS_COLLECTION => { url: 1 },
    DOCUMENTS_COLLECTION => { 'url.url' => 1 }
  }

  indexed_fields.each do |collection, keys|
    @client[collection].indexes.create_one(
      keys, name: UNIQUE_INDEX, unique: true
    )
  end

  nil
end
|
106
|
+
|
107
|
+
# Sets the documents collection search fields via a text index. This method
# is called from Wgit::Model.set_search_fields and shouldn't be
# called directly.
#
# Any existing TEXT_INDEX is dropped before the new one is created, so the
# method can be called repeatedly. Rebuilding the index is labor intensive
# on large collections so change the fields rarely and wisely.
#
# @param fields [Hash<Symbol, Integer>] The field names and their
#   corresponding search weights.
# @raise [StandardError] If fields is not a Hash.
def search_fields=(fields)
  assert_type(fields, Hash)

  doc_indexes = @client[DOCUMENTS_COLLECTION].indexes
  doc_indexes.drop_one(TEXT_INDEX) if doc_indexes.get(TEXT_INDEX)

  # Every field is indexed as 'text'; the given values become the weights.
  index_keys = fields.transform_values { 'text' }
  index_opts = { name: TEXT_INDEX, weights: fields, background: true }

  doc_indexes.create_one(index_keys, index_opts)
end
|
129
|
+
|
130
|
+
# Reads the 'weights' of the documents collection TEXT_INDEX.
#
# @return [Hash, nil] The fields and their weights; nil when the text
#   index can't be found.
def search_fields
  text_index = @client[DOCUMENTS_COLLECTION].indexes.get(TEXT_INDEX)
  text_index.nil? ? nil : text_index['weights']
end
|
137
|
+
|
138
|
+
### DML ###
|
139
|
+
|
140
|
+
### Create Data ###
|
141
|
+
|
142
|
+
# Insert one or more Url or Document objects into the DB.
#
# When several objects are given they must all belong to the same
# collection, i.e. Urls and Documents cannot be mixed in a single call.
#
# @param data [Wgit::Url, Wgit::Document, Enumerable<Wgit::Url,
#   Wgit::Document>] The records to insert/create.
# @raise [StandardError] If data isn't valid, is an empty collection or
#   mixes Urls and Documents.
# @return [Integer] The number of inserted records, as returned by #create.
def insert(data)
  # A single record (anything that can't be mapped over).
  unless data.respond_to?(:map)
    collection, _, model = get_model_info(data)
    return create(collection, model)
  end

  infos = data.map { |obj| get_model_info(obj) }
  # Without this guard there'd be no collection to write to.
  raise 'data is empty' if infos.empty?

  # All records are written with one insert, so they must share a collection.
  collections = infos.map(&:first).uniq
  unless collections.one?
    raise "data must not mix Urls and Documents, found: #{collections}"
  end

  create(collections.first, infos.map(&:last))
end
|
163
|
+
|
164
|
+
# Replaces the record matching obj's url or inserts it when no record
# matches. The raw driver result is kept in @last_result.
#
# @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
# @return [Boolean] True if inserted, false if updated.
def upsert(obj)
  collection, query, model = get_model_info(obj)
  replacement = model.merge(Wgit::Model.common_update_data)
  result = @client[collection].replace_one(query, replacement, upsert: true)

  # Nothing matched the query, so the record must have been inserted.
  nothing_matched = result.matched_count.zero?
  nothing_matched
ensure
  @last_result = result
end
|
177
|
+
|
178
|
+
# Bulk upserts the objects in the database collection.
# You cannot mix collection objs types, all must be Urls or Documents.
#
# Each obj becomes one upserting '$set' update, filtered by the obj's url.
# The raw driver result is kept in @last_result.
#
# @param objs [Array<Wgit::Url>, Array<Wgit::Document>] The objs to be
#   inserted/updated.
# @raise [StandardError] If objs is empty.
# @return [Integer] The total number of written objects, i.e. the newly
#   inserted (upserted) count plus the modified count.
def bulk_upsert(objs)
  assert_common_arr_types(objs, [Wgit::Url, Wgit::Document])
  raise 'objs is empty' if objs.empty?

  # All objs share a type, so every iteration yields the same collection.
  collection = nil
  operations = objs.map do |obj|
    collection, query, model = get_model_info(obj)
    data_hash = model.merge(Wgit::Model.common_update_data)

    {
      update_many: {
        filter: query,
        update: { '$set' => data_hash },
        upsert: true
      }
    }
  end

  result = @client[collection].bulk_write(operations)
  result.upserted_count + result.modified_count
ensure
  @last_result = result
end
|
208
|
+
|
209
|
+
### Retrieve Data ###
|
210
|
+
|
211
|
+
# Retrieves the Document records from the DB. Use #search to filter based
# on the Wgit::Model.search_fields of the documents collection.
#
# The records are sorted by date_added ascending, so the first Document
# returned is the one that was added first.
#
# @param limit [Integer] The max number of returned records. 0 returns all.
# @param skip [Integer] Skip n records.
# @yield [doc] Given each Document object (Wgit::Document) returned from
#   the DB.
# @return [Array<Wgit::Document>] The Documents obtained from the DB.
def docs(limit: 0, skip: 0, &block)
  oldest_first = { date_added: 1 }
  records = retrieve(
    DOCUMENTS_COLLECTION, {}, sort: oldest_first, limit: limit, skip: skip
  )

  # The result view has no #empty?, hence the count comparison.
  records.count < 1 ? [] : map_documents(records, &block)
end
|
229
|
+
|
230
|
+
# Retrieves the Url records from the DB, optionally filtered by their
# crawled flag.
#
# The records are sorted by date_added ascending, so the first Url returned
# is the one that was added first.
#
# @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
# @param limit [Integer] The max number of Url's to return. 0 returns all.
# @param skip [Integer] Skip n amount of Url's.
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
# @return [Array<Wgit::Url>] The Urls obtained from the DB.
def urls(crawled: nil, limit: 0, skip: 0, &block)
  filter = {}
  filter[:crawled] = crawled unless crawled.nil?

  records = retrieve(
    URLS_COLLECTION, filter,
    sort: { date_added: 1 }, limit: limit, skip: skip
  )

  # The result view has no #empty?, hence the count comparison.
  records.count < 1 ? [] : map_urls(records, &block)
end
|
249
|
+
|
250
|
+
# Shortcut for #urls with crawled: true.
#
# @param limit [Integer] The max number of Url's to return. 0 returns all.
# @param skip [Integer] Skip n amount of Url's.
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
# @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
def crawled_urls(limit: 0, skip: 0, &block)
  urls(crawled: true, limit: limit, skip: skip, &block)
end
|
259
|
+
|
260
|
+
# Shortcut for #urls with crawled: false.
#
# @param limit [Integer] The max number of Url's to return. 0 returns all.
# @param skip [Integer] Skip n amount of Url's.
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
def uncrawled_urls(limit: 0, skip: 0, &block)
  urls(crawled: false, limit: limit, skip: skip, &block)
end
|
269
|
+
|
270
|
+
# Runs a MongoDB text search over the documents collection, which uses the
# text index set up from `Wgit::Model.search_fields`.
#
# The results are sorted by their "textScore" meta value, which is also
# projected into each record as 'score' so that it's available on each
# Document result; accessed via Wgit::Document#score.
#
# @param query [String] The text query to search with; it's converted to
#   a String and stripped first.
# @param case_sensitive [Boolean] Whether character case must match.
# @param whole_sentence [Boolean] When true, the query is wrapped in double
#   quotes before being sent to MongoDB.
# @param limit [Integer] The max number of results to return.
# @param skip [Integer] The number of results to skip.
# @yield [doc] Given each search result (Wgit::Document) returned from the
#   DB.
# @return [Array<Wgit::Document>] The search results obtained from the DB.
def search(
  query, case_sensitive: false, whole_sentence: true,
  limit: 10, skip: 0, &block
)
  term = query.to_s.strip
  term = "\"#{term}\"" if whole_sentence

  # The same hash serves as both the sort and the projection.
  text_score = { score: { :$meta => 'textScore' } }
  filter = {
    :$text => { :$search => term, :$caseSensitive => case_sensitive }
  }

  records = retrieve(
    DOCUMENTS_COLLECTION, filter,
    sort: text_score, projection: text_score, limit: limit, skip: skip
  )
  map_documents(records, &block)
end
|
309
|
+
|
310
|
+
# Searches the database's Documents for the given query and then searches
# each result in turn using `doc.search`. Instead of an Array of Documents,
# this method returns a Hash of the docs url => search_results creating a
# search engine like result set for quick access to text matches.
#
# A doc found by the DB whose own `doc.search` finds nothing is logged as
# a warning and left out of the returned Hash.
#
# @param query [String] The text query to search with.
# @param case_sensitive [Boolean] Whether character case must match.
# @param whole_sentence [Boolean] Passed on to both #search and
#   `doc.search`.
# @param limit [Integer] The max number of results to return.
# @param skip [Integer] The number of results to skip.
# @param sentence_limit [Integer] The max length of each search result
#   sentence.
# @param top_result_only [Boolean] Whether to return all of the documents
#   search results (Array<String>) or just the top (most relevant) result
#   (String).
# @yield [doc] Given each Document (Wgit::Document) returned from the DB,
#   before its text is searched.
# @return [Hash<String, String | Array<String>>] The search results obtained
#   from the DB having mapped the docs url => search_results. The format of
#   search_results depends on the value of `top_result_only`.
def search!(
  query, case_sensitive: false, whole_sentence: true,
  limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
)
  found_docs = search(
    query, case_sensitive:, whole_sentence:, limit:, skip:
  )

  found_docs.filter_map do |doc|
    yield(doc) if block_given?

    matches = doc.search(
      query, case_sensitive:, whole_sentence:, sentence_limit:
    )

    if matches.empty?
      # Adjacent literals keep source indentation out of the message.
      Wgit.logger.warn(
        'MongoDB and Document #search calls have ' \
        'differing results'
      )
      next
    end

    [doc.url, top_result_only ? matches.first : matches]
  end.to_h
end
|
357
|
+
|
358
|
+
# Runs the dbStats command and returns the first document of the reply.
#
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
def stats
  reply = @client.command(dbStats: 0)
  reply.documents.first
end
|
364
|
+
|
365
|
+
# Reads the :dataSize entry of #stats.
#
# @return [Integer] The current size of the DB.
def size = stats[:dataSize]
|
371
|
+
|
372
|
+
# Counts the records of the urls collection.
#
# @return [Integer] The current number of URL records.
def num_urls = @client[URLS_COLLECTION].count
|
378
|
+
|
379
|
+
# Counts the records of the documents collection.
#
# @return [Integer] The current number of Document records.
def num_docs = @client[DOCUMENTS_COLLECTION].count
|
385
|
+
|
386
|
+
# Adds up #num_urls and #num_docs.
#
# @return [Integer] The current number of URL and Document records.
def num_records
  [num_urls, num_docs].sum
end
|
392
|
+
|
393
|
+
# Checks the urls collection for a record whose 'url' field (which is
# unique) equals the given url.
#
# @param url [Wgit::Url] The Url to search the DB for.
# @return [Boolean] True if url exists, otherwise false.
def url?(url)
  assert_type(url, String) # This includes Wgit::Url's.

  matches = retrieve(URLS_COLLECTION, { url: url }, limit: 1)
  matches.any?
end
|
403
|
+
|
404
|
+
# Checks the documents collection for a record whose 'url.url' field
# (which is unique) equals the given doc's url.
#
# @param doc [Wgit::Document] The Document to search the DB for.
# @return [Boolean] True if doc exists, otherwise false.
def doc?(doc)
  assert_type(doc, Wgit::Document)

  matches = retrieve(DOCUMENTS_COLLECTION, { 'url.url' => doc.url }, limit: 1)
  matches.any?
end
|
414
|
+
|
415
|
+
# Checks if a record exists with the given obj's url. Strings (which
# includes Wgit::Url's) are passed to #url?, anything else to #doc?.
#
# @param obj [Wgit::Url, Wgit::Document] Object containing the url to
#   search for.
# @return [Boolean] True if a record exists with the url, false otherwise.
def exists?(obj)
  return url?(obj) if obj.is_a?(String)

  doc?(obj)
end
|
423
|
+
|
424
|
+
# Looks up the record matching obj's url and, when found, wraps it in a
# new instance of obj's class.
# Pass either a Wgit::Url or Wgit::Document instance.
#
# @param obj [Wgit::Url, Wgit::Document] The record to search the DB for.
# @return [Wgit::Url, Wgit::Document, nil] The record with the matching
#   'url' field or nil if no results can be found.
def get(obj)
  collection, query = get_model_info(obj)
  record = retrieve(collection, query, limit: 1).first

  record ? obj.class.new(record) : nil
end
|
438
|
+
|
439
|
+
### Update Data ###
|
440
|
+
|
441
|
+
# Updates the DB record matching obj's url by '$set'ing obj's model, merged
# with Wgit::Model.common_update_data.
#
# @param obj [Wgit::Url, Wgit::Document] The obj/record to update.
# @raise [StandardError] If the obj is not valid.
# @return [Integer] The number of updated records/objects.
def update(obj)
  collection, query, model = get_model_info(obj)
  changes = { '$set' => model.merge(Wgit::Model.common_update_data) }

  mutate(collection, query, changes)
end
|
452
|
+
|
453
|
+
### Delete Data ###
|
454
|
+
|
455
|
+
# Deletes the record matching obj's url from its collection. The raw driver
# result is kept in @last_result.
# Pass either a Wgit::Url or Wgit::Document instance.
#
# @param obj [Wgit::Url, Wgit::Document] The record to search the DB for
#   and delete.
# @return [Integer] The number of records deleted - should always be
#   0 or 1 because urls are unique.
def delete(obj)
  collection, filter = get_model_info(obj)
  result = @client[collection].delete_one(filter)

  deleted = result.n
  deleted
ensure
  @last_result = result
end
|
469
|
+
|
470
|
+
# Deletes every record of the urls collection. The raw driver result is
# kept in @last_result.
#
# @return [Integer] The number of deleted records.
def empty_urls
  match_all = {}
  result = @client[URLS_COLLECTION].delete_many(match_all)

  deleted = result.n
  deleted
ensure
  @last_result = result
end
|
479
|
+
|
480
|
+
# Deletes every record of the documents collection. The raw driver result
# is kept in @last_result.
#
# @return [Integer] The number of deleted records.
def empty_docs
  match_all = {}
  result = @client[DOCUMENTS_COLLECTION].delete_many(match_all)

  deleted = result.n
  deleted
ensure
  @last_result = result
end
|
489
|
+
|
490
|
+
# Empties the urls collection and then the documents collection.
#
# @return [Integer] The number of deleted records.
def empty
  [empty_urls, empty_docs].sum
end
|
496
|
+
|
497
|
+
private
|
498
|
+
|
499
|
+
# Works out where and how obj is stored: its collection, the query that
# finds its record by url, and its model (built via build_model).
#
# Use like:
# ```
# collection, query, model = get_model_info(obj)
# ```
#
# The work is done on a dup of obj and no database calls are made.
#
# @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
# @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
# @return [Array<Symbol, Hash>] The collection type, query to get
#   the record/obj from the database (if it exists) and the model of obj.
def get_model_info(obj)
  record = obj.dup

  collection, query =
    case record
    when Wgit::Url
      [URLS_COLLECTION, { url: record.to_s }]
    when Wgit::Document
      [DOCUMENTS_COLLECTION, { 'url.url' => record.url.to_s }]
    else
      raise "obj must be a Wgit::Url or Wgit::Document, not: #{record.class}"
    end

  [collection, query, build_model(record)]
end
|
532
|
+
|
533
|
+
# Writes one record (Hash) or several records (Array of Hashes) to the
# given collection, after merging Wgit::Model.common_insert_data into each
# record. The raw driver result is kept in @last_result.
#
# Note that data is changed in place: a Hash via merge!, an Array via map!.
#
# @param collection [Symbol] Either :urls or :documents.
# @param data [Hash, Array<Hash>] The data to insert.
# @raise [StandardError] If data type is unsupported or the write fails.
# @return [Integer] The number of inserted records.
def create(collection, data)
  assert_types(data, [Hash, Array])

  if data.is_a?(Hash) # Single record.
    data.merge!(Wgit::Model.common_insert_data)
    result = @client[collection.to_sym].insert_one(data)
    raise 'DB write (insert) failed' unless write_succeeded?(result)

    result.n
  elsif data.is_a?(Array) # Multiple records.
    assert_arr_type(data, Hash)
    data.map! { |record| record.merge(Wgit::Model.common_insert_data) }
    result = @client[collection.to_sym].insert_many(data)
    all_written = write_succeeded?(result, num_writes: data.length)
    raise 'DB write(s) (insert) failed' unless all_written

    result.inserted_count
  else
    raise 'data must be a Hash or an Array of Hashes'
  end
ensure
  @last_result = result
end
|
564
|
+
|
565
|
+
# Decides, based on the class of the driver result, if a DB write
# succeeded.
#
# @param result [Mongo::Operation::Insert::Result,
#   Mongo::BulkWrite::Result, Mongo::Operation::Update::Result] The write
#   result.
# @param num_writes [Integer] The number of records written to.
# @raise [StandardError] If the result type isn't supported.
# @return [Boolean] True if the write was successful, false otherwise.
def write_succeeded?(result, num_writes: 1)
  # Single writes succeed when the first reply document has no :err.
  no_error = -> { result.documents.first[:err].nil? }

  case result
  when Mongo::Operation::Insert::Result # Single create result.
    no_error.call
  when Mongo::BulkWrite::Result # Multiple create result.
    result.inserted_count == num_writes
  when Mongo::Operation::Update::Result # Single/multiple update result.
    if num_writes == 1
      no_error.call
    else
      result.n == num_writes
    end
  else # Class no longer used, have you upgraded the 'mongo' gem?
    raise "Result class not currently supported: #{result.class}"
  end
end
|
584
|
+
|
585
|
+
# Builds the view used to read Url or Document records from the DB. The
# view is also kept in @last_result.
#
# @param collection [Symbol] Either :urls or :documents.
# @param query [Hash] The query used for the retrieval.
# @param sort [Hash] The sort to use.
# @param projection [Hash] The projection to use.
# @param limit [Integer] The limit to use.
# @param skip [Integer] The skip to use.
# @raise [StandardError] If query type isn't valid.
# @return [Mongo::Collection::View] The retrieval viewset.
def retrieve(collection, query,
             sort: {}, projection: {},
             limit: 0, skip: 0)
  assert_type(query, Hash)

  view = @client[collection.to_sym].find(query)
  view = view.projection(projection).skip(skip).limit(limit).sort(sort)

  @last_result = view
end
|
602
|
+
|
603
|
+
# Applies update to the first record of the collection matching query. The
# raw driver result is kept in @last_result.
#
# The calling method is expected to have already merged
# Wgit::Model.common_update_data into the update.
#
# @param collection [Symbol] Either :urls or :documents.
# @param query [Hash] The query used for the retrieval before updating.
# @param update [Hash] The updated/new object.
# @raise [StandardError] If the update fails.
# @return [Integer] The number of updated records/objects.
def mutate(collection, query, update)
  assert_arr_type([query, update], Hash)

  result = @client[collection.to_sym].update_one(query, update)
  succeeded = write_succeeded?(result)
  raise 'DB write(s) (update) failed' unless succeeded

  result.n
ensure
  @last_result = result
end
|
623
|
+
|
624
|
+
alias_method :num_objects, :num_records
|
625
|
+
alias_method :empty!, :empty
|
626
|
+
end
|
627
|
+
end
|