wgit 0.10.8 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +72 -1
- data/CODE_OF_CONDUCT.md +1 -1
- data/CONTRIBUTING.md +2 -2
- data/README.md +24 -20
- data/bin/wgit +75 -19
- data/lib/wgit/assertable.rb +33 -6
- data/lib/wgit/core_ext.rb +1 -1
- data/lib/wgit/crawler.rb +102 -37
- data/lib/wgit/database/adapters/in_memory.rb +204 -0
- data/lib/wgit/database/adapters/mongo_db.rb +627 -0
- data/lib/wgit/database/database.rb +18 -651
- data/lib/wgit/database/database_adapter.rb +147 -0
- data/lib/wgit/document.rb +222 -98
- data/lib/wgit/document_extractors.rb +16 -10
- data/lib/wgit/dsl.rb +74 -81
- data/lib/wgit/html_to_text.rb +277 -0
- data/lib/wgit/indexer.rb +184 -71
- data/lib/wgit/logger.rb +2 -2
- data/lib/wgit/model.rb +164 -0
- data/lib/wgit/response.rb +25 -13
- data/lib/wgit/robots_parser.rb +193 -0
- data/lib/wgit/url.rb +150 -90
- data/lib/wgit/utils.rb +200 -37
- data/lib/wgit/version.rb +1 -1
- data/lib/wgit.rb +18 -13
- metadata +56 -43
- data/lib/wgit/database/model.rb +0 -60
@@ -1,667 +1,34 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative '../document'
|
5
|
-
require_relative '../logger'
|
6
|
-
require_relative '../assertable'
|
7
|
-
require_relative 'model'
|
8
|
-
require 'logger'
|
9
|
-
require 'mongo'
|
3
|
+
require_relative "adapters/mongo_db"
|
10
4
|
|
11
5
|
module Wgit
|
12
|
-
#
|
13
|
-
# Document collections.
|
14
|
-
|
15
|
-
|
6
|
+
# Module providing a Database connection and CRUD operations for the Url and
|
7
|
+
# Document collections that form the Wgit persistence layer.
|
8
|
+
module Database
|
9
|
+
# The default Database adapter class used by Wgit.
|
10
|
+
DEFAULT_ADAPTER_CLASS = Wgit::Database::MongoDB
|
16
11
|
|
17
|
-
# The
|
18
|
-
|
12
|
+
# The Database adapter class to be used by Wgit. Set this based on the
|
13
|
+
# Database you want to use. The adapter doesn't exist yet? Write your own.
|
14
|
+
@adapter_class = DEFAULT_ADAPTER_CLASS
|
19
15
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
# The default name of the urls and documents collections unique index.
|
27
|
-
UNIQUE_INDEX = 'unique_url'
|
28
|
-
|
29
|
-
# The documents collection default text search index. Use
|
30
|
-
# `db.text_index = Wgit::Database::DEFAULT_TEXT_INDEX` to revert changes.
|
31
|
-
DEFAULT_TEXT_INDEX = {
|
32
|
-
title: 2,
|
33
|
-
description: 2,
|
34
|
-
keywords: 2,
|
35
|
-
text: 1
|
36
|
-
}.freeze
|
37
|
-
|
38
|
-
# The connection string for the database.
|
39
|
-
attr_reader :connection_string
|
40
|
-
|
41
|
-
# The database client object. Gets set when a connection is established.
|
42
|
-
attr_reader :client
|
43
|
-
|
44
|
-
# The documents collection text index, used to search the DB.
|
45
|
-
# A custom setter method is also provided for changing the search logic.
|
46
|
-
attr_reader :text_index
|
47
|
-
|
48
|
-
# The raw MongoDB result of the most recent operation.
|
49
|
-
attr_reader :last_result
|
50
|
-
|
51
|
-
# Initializes a connected database client using the provided
|
52
|
-
# connection_string or ENV['WGIT_CONNECTION_STRING'].
|
53
|
-
#
|
54
|
-
# @param connection_string [String] The connection string needed to connect
|
55
|
-
# to the database.
|
56
|
-
# @raise [StandardError] If a connection string isn't provided, either as a
|
57
|
-
# parameter or via the environment.
|
58
|
-
def initialize(connection_string = nil)
|
59
|
-
connection_string ||= ENV['WGIT_CONNECTION_STRING']
|
60
|
-
raise "connection_string and ENV['WGIT_CONNECTION_STRING'] are nil" \
|
61
|
-
unless connection_string
|
62
|
-
|
63
|
-
@client = Database.establish_connection(connection_string)
|
64
|
-
@connection_string = connection_string
|
65
|
-
@text_index = DEFAULT_TEXT_INDEX
|
16
|
+
class << self
|
17
|
+
# The Database adapter class to use with Wgit. The adapter you supply
|
18
|
+
# should be a subclass of Wgit::Database::DatabaseAdapter and should
|
19
|
+
# implement the methods within it, in order to work with Wgit.
|
20
|
+
attr_accessor :adapter_class
|
66
21
|
end
|
67
22
|
|
68
|
-
#
|
23
|
+
# Initializes a DatabaseAdapter instance. Is an alias for:
|
24
|
+
# `Wgit::Database.adapter_class.new(connection_string)`
|
69
25
|
#
|
70
26
|
# @param connection_string [String] The connection string needed to connect
|
71
27
|
# to the database.
|
72
28
|
# @raise [StandardError] If a connection string isn't provided, either as a
|
73
29
|
# parameter or via the environment.
|
74
|
-
|
75
|
-
|
76
|
-
new(connection_string)
|
77
|
-
end
|
78
|
-
|
79
|
-
# Initializes a connected database client using the connection string.
|
80
|
-
#
|
81
|
-
# @param connection_string [String] The connection string needed to connect
|
82
|
-
# to the database.
|
83
|
-
# @raise [StandardError] If a connection cannot be established.
|
84
|
-
# @return [Mongo::Client] The connected MongoDB client.
|
85
|
-
def self.establish_connection(connection_string)
|
86
|
-
# Only log for error (and more severe) scenarios.
|
87
|
-
Mongo::Logger.logger = Wgit.logger.clone
|
88
|
-
Mongo::Logger.logger.progname = 'mongo'
|
89
|
-
Mongo::Logger.logger.level = Logger::ERROR
|
90
|
-
|
91
|
-
# Connects to the database here.
|
92
|
-
Mongo::Client.new(connection_string)
|
93
|
-
end
|
94
|
-
|
95
|
-
### DDL ###
|
96
|
-
|
97
|
-
# Creates the 'urls' and 'documents' collections.
|
98
|
-
#
|
99
|
-
# @return [nil] Always returns nil.
|
100
|
-
def create_collections
|
101
|
-
@client[URLS_COLLECTION].create
|
102
|
-
@client[DOCUMENTS_COLLECTION].create
|
103
|
-
|
104
|
-
nil
|
105
|
-
end
|
106
|
-
|
107
|
-
# Creates the urls and documents unique 'url' indexes.
|
108
|
-
#
|
109
|
-
# @return [nil] Always returns nil.
|
110
|
-
def create_unique_indexes
|
111
|
-
@client[URLS_COLLECTION].indexes.create_one(
|
112
|
-
{ url: 1 }, name: UNIQUE_INDEX, unique: true
|
113
|
-
)
|
114
|
-
|
115
|
-
@client[DOCUMENTS_COLLECTION].indexes.create_one(
|
116
|
-
{ 'url.url' => 1 }, name: UNIQUE_INDEX, unique: true
|
117
|
-
)
|
118
|
-
|
119
|
-
nil
|
120
|
-
end
|
121
|
-
|
122
|
-
# Set the documents collection text search index aka the fields to #search.
|
123
|
-
# This is labor intensive on large collections so change little and wisely.
|
124
|
-
# This method is idempotent in that it will remove the index if it already
|
125
|
-
# exists before it creates the new index.
|
126
|
-
#
|
127
|
-
# @param fields [Array<Symbol>, Hash<Symbol, Integer>] The field names or
|
128
|
-
# the field names and their corresponding search weights.
|
129
|
-
# @return [Array<Symbol>, Hash] The passed in value of fields. Use
|
130
|
-
# `#text_index` to get the new index's fields and weights.
|
131
|
-
# @raise [StandardError] If fields is of an incorrect type or an error
|
132
|
-
# occurs with the underlying DB client.
|
133
|
-
def text_index=(fields)
|
134
|
-
# We want to end up with a Hash of fields (Symbols) and their
|
135
|
-
# weights (Integers).
|
136
|
-
case fields
|
137
|
-
when Array # of Strings/Symbols.
|
138
|
-
fields = fields.map { |field| [field.to_sym, 1] }
|
139
|
-
when Hash # of Strings/Symbols and Integers.
|
140
|
-
fields = fields.map { |field, weight| [field.to_sym, weight.to_i] }
|
141
|
-
else
|
142
|
-
raise "fields must be an Array or Hash, not a #{fields.class}"
|
143
|
-
end
|
144
|
-
|
145
|
-
fields = fields.to_h
|
146
|
-
indexes = @client[DOCUMENTS_COLLECTION].indexes
|
147
|
-
|
148
|
-
indexes.drop_one(TEXT_INDEX) if indexes.get(TEXT_INDEX)
|
149
|
-
indexes.create_one(
|
150
|
-
fields.map { |field, _| [field, 'text'] }.to_h,
|
151
|
-
{ name: TEXT_INDEX, weights: fields, background: true }
|
152
|
-
)
|
153
|
-
|
154
|
-
@text_index = fields
|
155
|
-
end
|
156
|
-
|
157
|
-
### Create Data ###
|
158
|
-
|
159
|
-
# Insert one or more Url or Document objects into the DB.
|
160
|
-
#
|
161
|
-
# @param data [Wgit::Url, Wgit::Document, Enumerable<Wgit::Url,
|
162
|
-
# Wgit::Document>] The records to insert/create.
|
163
|
-
# @raise [StandardError] If data isn't valid.
|
164
|
-
def insert(data)
|
165
|
-
data = data.dup # Avoid modifying by reference.
|
166
|
-
collection = nil
|
167
|
-
|
168
|
-
if data.respond_to?(:map!)
|
169
|
-
data.map! do |obj|
|
170
|
-
collection, _, model = get_type_info(obj)
|
171
|
-
model
|
172
|
-
end
|
173
|
-
else
|
174
|
-
collection, _, model = get_type_info(data)
|
175
|
-
data = model
|
176
|
-
end
|
177
|
-
|
178
|
-
create(collection, data)
|
179
|
-
end
|
180
|
-
|
181
|
-
# Inserts or updates the object in the database.
|
182
|
-
#
|
183
|
-
# @param obj [Wgit::Url, Wgit::Document] The obj/record to insert/update.
|
184
|
-
# @return [Boolean] True if inserted, false if updated.
|
185
|
-
def upsert(obj)
|
186
|
-
collection, query, model = get_type_info(obj.dup)
|
187
|
-
data_hash = model.merge(Wgit::Model.common_update_data)
|
188
|
-
result = @client[collection].replace_one(query, data_hash, upsert: true)
|
189
|
-
|
190
|
-
result.matched_count.zero?
|
191
|
-
ensure
|
192
|
-
@last_result = result
|
193
|
-
end
|
194
|
-
|
195
|
-
### Retrieve Data ###
|
196
|
-
|
197
|
-
# Returns all Document records from the DB. Use #search to filter based on
|
198
|
-
# the text_index of the collection.
|
199
|
-
#
|
200
|
-
# All Documents are sorted by date_added ascending, in other words the
|
201
|
-
# first doc returned is the first one that was inserted into the DB.
|
202
|
-
#
|
203
|
-
# @param limit [Integer] The max number of returned records. 0 returns all.
|
204
|
-
# @param skip [Integer] Skip n records.
|
205
|
-
# @yield [doc] Given each Document object (Wgit::Document) returned from
|
206
|
-
# the DB.
|
207
|
-
# @return [Array<Wgit::Document>] The Documents obtained from the DB.
|
208
|
-
def docs(limit: 0, skip: 0)
|
209
|
-
results = retrieve(DOCUMENTS_COLLECTION, {},
|
210
|
-
sort: { date_added: 1 }, limit: limit, skip: skip)
|
211
|
-
return [] if results.count < 1 # results#empty? doesn't exist.
|
212
|
-
|
213
|
-
# results.respond_to? :map! is false so we use map and overwrite the var.
|
214
|
-
results = results.map { |doc_hash| Wgit::Document.new(doc_hash) }
|
215
|
-
results.each { |doc| yield(doc) } if block_given?
|
216
|
-
|
217
|
-
results
|
218
|
-
end
|
219
|
-
|
220
|
-
# Returns all Url records from the DB.
|
221
|
-
#
|
222
|
-
# All Urls are sorted by date_added ascending, in other words the first url
|
223
|
-
# returned is the first one that was inserted into the DB.
|
224
|
-
#
|
225
|
-
# @param crawled [Boolean] Filter by Url#crawled value. nil returns all.
|
226
|
-
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
227
|
-
# @param skip [Integer] Skip n amount of Url's.
|
228
|
-
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
|
229
|
-
# @return [Array<Wgit::Url>] The Urls obtained from the DB.
|
230
|
-
def urls(crawled: nil, limit: 0, skip: 0)
|
231
|
-
query = crawled.nil? ? {} : { crawled: crawled }
|
232
|
-
sort = { date_added: 1 }
|
233
|
-
|
234
|
-
results = retrieve(URLS_COLLECTION, query,
|
235
|
-
sort: sort, limit: limit, skip: skip)
|
236
|
-
return [] if results.count < 1 # results#empty? doesn't exist.
|
237
|
-
|
238
|
-
# results.respond_to? :map! is false so we use map and overwrite the var.
|
239
|
-
results = results.map { |url_doc| Wgit::Url.new(url_doc) }
|
240
|
-
results.each { |url| yield(url) } if block_given?
|
241
|
-
|
242
|
-
results
|
243
|
-
end
|
244
|
-
|
245
|
-
# Returns Url records that have been crawled.
|
246
|
-
#
|
247
|
-
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
248
|
-
# @param skip [Integer] Skip n amount of Url's.
|
249
|
-
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
|
250
|
-
# @return [Array<Wgit::Url>] The crawled Urls obtained from the DB.
|
251
|
-
def crawled_urls(limit: 0, skip: 0, &block)
|
252
|
-
urls(crawled: true, limit: limit, skip: skip, &block)
|
253
|
-
end
|
254
|
-
|
255
|
-
# Returns Url records that haven't yet been crawled.
|
256
|
-
#
|
257
|
-
# @param limit [Integer] The max number of Url's to return. 0 returns all.
|
258
|
-
# @param skip [Integer] Skip n amount of Url's.
|
259
|
-
# @yield [url] Given each Url object (Wgit::Url) returned from the DB.
|
260
|
-
# @return [Array<Wgit::Url>] The uncrawled Urls obtained from the DB.
|
261
|
-
def uncrawled_urls(limit: 0, skip: 0, &block)
|
262
|
-
urls(crawled: false, limit: limit, skip: skip, &block)
|
263
|
-
end
|
264
|
-
|
265
|
-
# Searches the database's Documents for the given query.
|
266
|
-
#
|
267
|
-
# The searched fields are decided by the text index setup on the
|
268
|
-
# documents collection. Currently we search against the following fields:
|
269
|
-
# "title", "description", "keywords" and "text" by default.
|
270
|
-
#
|
271
|
-
# The MongoDB search algorithm ranks/sorts the results in order (highest
|
272
|
-
# first) based on each document's "textScore" (which records the number of
|
273
|
-
# query hits). The "textScore" is then stored in each Document result
|
274
|
-
# object for use elsewhere if needed; accessed via Wgit::Document#score.
|
275
|
-
#
|
276
|
-
# @param query [String] The text query to search with.
|
277
|
-
# @param case_sensitive [Boolean] Whether character case must match.
|
278
|
-
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
279
|
-
# for separately.
|
280
|
-
# @param limit [Integer] The max number of results to return.
|
281
|
-
# @param skip [Integer] The number of results to skip.
|
282
|
-
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
283
|
-
# DB.
|
284
|
-
# @return [Array<Wgit::Document>] The search results obtained from the DB.
|
285
|
-
def search(
|
286
|
-
query, case_sensitive: false, whole_sentence: true, limit: 10, skip: 0
|
287
|
-
)
|
288
|
-
query = query.to_s.strip
|
289
|
-
query.replace('"' + query + '"') if whole_sentence
|
290
|
-
|
291
|
-
# Sort based on the most search hits (aka "textScore").
|
292
|
-
# We use the sort_proj hash as both a sort and a projection below.
|
293
|
-
sort_proj = { score: { :$meta => 'textScore' } }
|
294
|
-
query = { :$text => {
|
295
|
-
:$search => query,
|
296
|
-
:$caseSensitive => case_sensitive
|
297
|
-
} }
|
298
|
-
|
299
|
-
results = retrieve(DOCUMENTS_COLLECTION, query,
|
300
|
-
sort: sort_proj, projection: sort_proj,
|
301
|
-
limit: limit, skip: skip)
|
302
|
-
|
303
|
-
results.map do |mongo_doc|
|
304
|
-
doc = Wgit::Document.new(mongo_doc)
|
305
|
-
yield(doc) if block_given?
|
306
|
-
doc
|
307
|
-
end
|
308
|
-
end
|
309
|
-
|
310
|
-
# Searches the database's Documents for the given query and then searches
|
311
|
-
# each result in turn using `doc.search!`. This method is therefore the
|
312
|
-
# equivalent of calling `Wgit::Database#search` and then
|
313
|
-
# `Wgit::Document#search!` in turn. See their documentation for more info.
|
314
|
-
#
|
315
|
-
# @param query [String] The text query to search with.
|
316
|
-
# @param case_sensitive [Boolean] Whether character case must match.
|
317
|
-
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
318
|
-
# for separately.
|
319
|
-
# @param limit [Integer] The max number of results to return.
|
320
|
-
# @param skip [Integer] The number of results to skip.
|
321
|
-
# @param sentence_limit [Integer] The max length of each search result
|
322
|
-
# sentence.
|
323
|
-
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
324
|
-
# DB having called `doc.search!(query)`.
|
325
|
-
# @return [Array<Wgit::Document>] The search results obtained from the DB
|
326
|
-
# having called `doc.search!(query)`.
|
327
|
-
def search!(
|
328
|
-
query, case_sensitive: false, whole_sentence: true,
|
329
|
-
limit: 10, skip: 0, sentence_limit: 80
|
330
|
-
)
|
331
|
-
results = search(
|
332
|
-
query,
|
333
|
-
case_sensitive: case_sensitive,
|
334
|
-
whole_sentence: whole_sentence,
|
335
|
-
limit: limit,
|
336
|
-
skip: skip
|
337
|
-
)
|
338
|
-
|
339
|
-
results.each do |doc|
|
340
|
-
doc.search!(
|
341
|
-
query,
|
342
|
-
case_sensitive: case_sensitive,
|
343
|
-
whole_sentence: whole_sentence,
|
344
|
-
sentence_limit: sentence_limit
|
345
|
-
)
|
346
|
-
yield(doc) if block_given?
|
347
|
-
end
|
348
|
-
|
349
|
-
results
|
30
|
+
def self.new(connection_string = nil)
|
31
|
+
Wgit::Database.adapter_class.new(connection_string)
|
350
32
|
end
|
351
|
-
|
352
|
-
# Searches the database's Documents for the given query and then searches
|
353
|
-
# each result in turn using `doc.search`. Instead of an Array of Documents,
|
354
|
-
# this method returns a Hash of the docs url => search_results creating a
|
355
|
-
# search engine like result set for quick access to text matches.
|
356
|
-
#
|
357
|
-
# @param query [String] The text query to search with.
|
358
|
-
# @param case_sensitive [Boolean] Whether character case must match.
|
359
|
-
# @param whole_sentence [Boolean] Whether multiple words should be searched
|
360
|
-
# for separately.
|
361
|
-
# @param limit [Integer] The max number of results to return.
|
362
|
-
# @param skip [Integer] The number of results to skip.
|
363
|
-
# @param sentence_limit [Integer] The max length of each search result
|
364
|
-
# sentence.
|
365
|
-
# @param top_result_only [Boolean] Whether to return all of the documents
|
366
|
-
# search results or just the top (most relevant) result.
|
367
|
-
# @yield [doc] Given each search result (Wgit::Document) returned from the
|
368
|
-
# DB.
|
369
|
-
# @return [Hash<String, String | Array<String>>] The search results obtained
|
370
|
-
# from the DB having mapped the docs url => search_results. The format of
|
371
|
-
# search_results depends on the value of `top_result_only`.
|
372
|
-
def search_text(
|
373
|
-
query, case_sensitive: false, whole_sentence: true,
|
374
|
-
limit: 10, skip: 0, sentence_limit: 80, top_result_only: false
|
375
|
-
)
|
376
|
-
results = search(
|
377
|
-
query,
|
378
|
-
case_sensitive: case_sensitive,
|
379
|
-
whole_sentence: whole_sentence,
|
380
|
-
limit: limit,
|
381
|
-
skip: skip
|
382
|
-
)
|
383
|
-
|
384
|
-
results
|
385
|
-
.map do |doc|
|
386
|
-
yield(doc) if block_given?
|
387
|
-
|
388
|
-
results = doc.search(
|
389
|
-
query,
|
390
|
-
case_sensitive: case_sensitive,
|
391
|
-
whole_sentence: whole_sentence,
|
392
|
-
sentence_limit: sentence_limit
|
393
|
-
)
|
394
|
-
|
395
|
-
# Only return result if its text has a match - compact is called below.
|
396
|
-
next nil if results.empty?
|
397
|
-
|
398
|
-
[doc.url, (top_result_only ? results.first : results)]
|
399
|
-
end
|
400
|
-
.compact
|
401
|
-
.to_h
|
402
|
-
end
|
403
|
-
|
404
|
-
# Returns statistics about the database.
|
405
|
-
#
|
406
|
-
# @return [BSON::Document#[]#fetch] Similar to a Hash instance.
|
407
|
-
def stats
|
408
|
-
@client.command(dbStats: 0).documents[0]
|
409
|
-
end
|
410
|
-
|
411
|
-
# Returns the current size of the database.
|
412
|
-
#
|
413
|
-
# @return [Integer] The current size of the DB.
|
414
|
-
def size
|
415
|
-
stats[:dataSize]
|
416
|
-
end
|
417
|
-
|
418
|
-
# Returns the total number of URL records in the DB.
|
419
|
-
#
|
420
|
-
# @return [Integer] The current number of URL records.
|
421
|
-
def num_urls
|
422
|
-
@client[URLS_COLLECTION].count
|
423
|
-
end
|
424
|
-
|
425
|
-
# Returns the total number of Document records in the DB.
|
426
|
-
#
|
427
|
-
# @return [Integer] The current number of Document records.
|
428
|
-
def num_docs
|
429
|
-
@client[DOCUMENTS_COLLECTION].count
|
430
|
-
end
|
431
|
-
|
432
|
-
# Returns the total number of records (urls + docs) in the DB.
|
433
|
-
#
|
434
|
-
# @return [Integer] The current number of URL and Document records.
|
435
|
-
def num_records
|
436
|
-
num_urls + num_docs
|
437
|
-
end
|
438
|
-
|
439
|
-
# Returns whether or not a record with the given 'url' field (which is
|
440
|
-
# unique) exists in the database's 'urls' collection.
|
441
|
-
#
|
442
|
-
# @param url [Wgit::Url] The Url to search the DB for.
|
443
|
-
# @return [Boolean] True if url exists, otherwise false.
|
444
|
-
def url?(url)
|
445
|
-
assert_type(url, String) # This includes Wgit::Url's.
|
446
|
-
query = { url: url }
|
447
|
-
retrieve(URLS_COLLECTION, query, limit: 1).any?
|
448
|
-
end
|
449
|
-
|
450
|
-
# Returns whether or not a record with the given doc 'url.url' field
|
451
|
-
# (which is unique) exists in the database's 'documents' collection.
|
452
|
-
#
|
453
|
-
# @param doc [Wgit::Document] The Document to search the DB for.
|
454
|
-
# @return [Boolean] True if doc exists, otherwise false.
|
455
|
-
def doc?(doc)
|
456
|
-
assert_type(doc, Wgit::Document)
|
457
|
-
query = { 'url.url' => doc.url }
|
458
|
-
retrieve(DOCUMENTS_COLLECTION, query, limit: 1).any?
|
459
|
-
end
|
460
|
-
|
461
|
-
# Returns if a record exists with the given obj's url.
|
462
|
-
#
|
463
|
-
# @param obj [Wgit::Url, Wgit::Document] Object containing the url to
|
464
|
-
# search for.
|
465
|
-
# @return [Boolean] True if a record exists with the url, false otherwise.
|
466
|
-
def exists?(obj)
|
467
|
-
obj.is_a?(String) ? url?(obj) : doc?(obj)
|
468
|
-
end
|
469
|
-
|
470
|
-
# Returns a record from the database with the matching 'url' field; or nil.
|
471
|
-
# Pass either a Wgit::Url or Wgit::Document instance.
|
472
|
-
#
|
473
|
-
# @param obj [Wgit::Url, Wgit::Document] The record to search the DB for.
|
474
|
-
# @return [Wgit::Url, Wgit::Document, nil] The record with the matching
|
475
|
-
# 'url' field or nil if no results can be found.
|
476
|
-
def get(obj)
|
477
|
-
collection, query = get_type_info(obj)
|
478
|
-
|
479
|
-
record = retrieve(collection, query, limit: 1).first
|
480
|
-
return nil unless record
|
481
|
-
|
482
|
-
obj.class.new(record)
|
483
|
-
end
|
484
|
-
|
485
|
-
### Update Data ###
|
486
|
-
|
487
|
-
# Update a Url or Document object in the DB.
|
488
|
-
#
|
489
|
-
# @param obj [Wgit::Url, Wgit::Document] The obj/record to update.
|
490
|
-
# @raise [StandardError] If the obj is not valid.
|
491
|
-
# @return [Integer] The number of updated records/objects.
|
492
|
-
def update(obj)
|
493
|
-
collection, query, model = get_type_info(obj.dup)
|
494
|
-
data_hash = model.merge(Wgit::Model.common_update_data)
|
495
|
-
|
496
|
-
mutate(collection, query, { '$set' => data_hash })
|
497
|
-
end
|
498
|
-
|
499
|
-
### Delete Data ###
|
500
|
-
|
501
|
-
# Deletes a record from the database with the matching 'url' field.
|
502
|
-
# Pass either a Wgit::Url or Wgit::Document instance.
|
503
|
-
#
|
504
|
-
# @param obj [Wgit::Url, Wgit::Document] The record to search the DB for
|
505
|
-
# and delete.
|
506
|
-
# @return [Integer] The number of records deleted - should always be
|
507
|
-
# 0 or 1 because urls are unique.
|
508
|
-
def delete(obj)
|
509
|
-
collection, query = get_type_info(obj)
|
510
|
-
result = @client[collection].delete_one(query)
|
511
|
-
result.n
|
512
|
-
ensure
|
513
|
-
@last_result = result
|
514
|
-
end
|
515
|
-
|
516
|
-
# Deletes everything in the urls collection.
|
517
|
-
#
|
518
|
-
# @return [Integer] The number of deleted records.
|
519
|
-
def clear_urls
|
520
|
-
result = @client[URLS_COLLECTION].delete_many({})
|
521
|
-
result.n
|
522
|
-
ensure
|
523
|
-
@last_result = result
|
524
|
-
end
|
525
|
-
|
526
|
-
# Deletes everything in the documents collection.
|
527
|
-
#
|
528
|
-
# @return [Integer] The number of deleted records.
|
529
|
-
def clear_docs
|
530
|
-
result = @client[DOCUMENTS_COLLECTION].delete_many({})
|
531
|
-
result.n
|
532
|
-
ensure
|
533
|
-
@last_result = result
|
534
|
-
end
|
535
|
-
|
536
|
-
# Deletes everything in the urls and documents collections. This will nuke
|
537
|
-
# the entire database so yeah... be careful.
|
538
|
-
#
|
539
|
-
# @return [Integer] The number of deleted records.
|
540
|
-
def clear_db
|
541
|
-
clear_urls + clear_docs
|
542
|
-
end
|
543
|
-
|
544
|
-
private
|
545
|
-
|
546
|
-
# Get the database's type info (collection type, query hash, model) for
|
547
|
-
# obj.
|
548
|
-
#
|
549
|
-
# Raises an error if obj isn't a Wgit::Url or Wgit::Document.
|
550
|
-
# Note, that no database calls are made during this method call.
|
551
|
-
#
|
552
|
-
# @param obj [Wgit::Url, Wgit::Document] The obj to get semantics for.
|
553
|
-
# @raise [StandardError] If obj isn't a Wgit::Url or Wgit::Document.
|
554
|
-
# @return [Array<Symbol, Hash>] The collection type, query to get
|
555
|
-
# the record/obj from the database (if it exists) and the model of obj.
|
556
|
-
def get_type_info(obj)
|
557
|
-
case obj
|
558
|
-
when Wgit::Url
|
559
|
-
collection = URLS_COLLECTION
|
560
|
-
query = { url: obj.to_s }
|
561
|
-
model = Wgit::Model.url(obj)
|
562
|
-
when Wgit::Document
|
563
|
-
collection = DOCUMENTS_COLLECTION
|
564
|
-
query = { 'url.url' => obj.url.to_s }
|
565
|
-
model = Wgit::Model.document(obj)
|
566
|
-
else
|
567
|
-
raise "obj must be a Wgit::Url or Wgit::Document, not: #{obj.class}"
|
568
|
-
end
|
569
|
-
|
570
|
-
[collection, query, model]
|
571
|
-
end
|
572
|
-
|
573
|
-
# Create/insert one or more Url or Document records into the DB.
|
574
|
-
#
|
575
|
-
# @param collection [Symbol] Either :urls or :documents.
|
576
|
-
# @param data [Hash, Array<Hash>] The data to insert.
|
577
|
-
# @raise [StandardError] If data type is unsupported or the write fails.
|
578
|
-
# @return [Integer] The number of inserted records.
|
579
|
-
def create(collection, data)
|
580
|
-
assert_types(data, [Hash, Array])
|
581
|
-
|
582
|
-
case data
|
583
|
-
when Hash # Single record.
|
584
|
-
data.merge!(Wgit::Model.common_insert_data)
|
585
|
-
result = @client[collection.to_sym].insert_one(data)
|
586
|
-
raise 'DB write (insert) failed' unless write_succeeded?(result)
|
587
|
-
|
588
|
-
result.n
|
589
|
-
when Array # Multiple records.
|
590
|
-
assert_arr_type(data, Hash)
|
591
|
-
data.map! { |hash| hash.merge(Wgit::Model.common_insert_data) }
|
592
|
-
result = @client[collection.to_sym].insert_many(data)
|
593
|
-
unless write_succeeded?(result, num_writes: data.length)
|
594
|
-
raise 'DB write(s) (insert) failed'
|
595
|
-
end
|
596
|
-
|
597
|
-
result.inserted_count
|
598
|
-
else
|
599
|
-
raise 'data must be a Hash or an Array of Hashes'
|
600
|
-
end
|
601
|
-
ensure
|
602
|
-
@last_result = result
|
603
|
-
end
|
604
|
-
|
605
|
-
# Return if the write to the DB succeeded or not.
|
606
|
-
#
|
607
|
-
# @param result [Mongo::Collection::View] The write result.
|
608
|
-
# @param num_writes [Integer] The number of records written to.
|
609
|
-
# @raise [StandardError] If the result type isn't supported.
|
610
|
-
# @return [Boolean] True if the write was successful, false otherwise.
|
611
|
-
def write_succeeded?(result, num_writes: 1)
|
612
|
-
case result
|
613
|
-
when Mongo::Operation::Insert::Result # Single create result.
|
614
|
-
result.documents.first[:err].nil?
|
615
|
-
when Mongo::BulkWrite::Result # Multiple create result.
|
616
|
-
result.inserted_count == num_writes
|
617
|
-
when Mongo::Operation::Update::Result # Single/multiple update result.
|
618
|
-
singleton = (num_writes == 1)
|
619
|
-
singleton ? result.documents.first[:err].nil? : result.n == num_writes
|
620
|
-
else # Class no longer used, have you upgraded the 'mongo' gem?
|
621
|
-
raise "Result class not currently supported: #{result.class}"
|
622
|
-
end
|
623
|
-
end
|
624
|
-
|
625
|
-
# Retrieve Url or Document records from the DB.
|
626
|
-
#
|
627
|
-
# @param collection [Symbol] Either :urls or :documents.
|
628
|
-
# @param query [Hash] The query used for the retrieval.
|
629
|
-
# @param sort [Hash] The sort to use.
|
630
|
-
# @param projection [Hash] The projection to use.
|
631
|
-
# @param limit [Integer] The limit to use.
|
632
|
-
# @param skip [Integer] The skip to use.
|
633
|
-
# @raise [StandardError] If query type isn't valid.
|
634
|
-
# @return [Mongo::Collection::View] The retrieval viewset.
|
635
|
-
def retrieve(collection, query,
|
636
|
-
sort: {}, projection: {},
|
637
|
-
limit: 0, skip: 0)
|
638
|
-
assert_type(query, Hash)
|
639
|
-
@last_result = @client[collection.to_sym].find(query).projection(projection)
|
640
|
-
.skip(skip).limit(limit).sort(sort)
|
641
|
-
end
|
642
|
-
|
643
|
-
# Mutate/update one or more Url or Document records in the DB.
|
644
|
-
#
|
645
|
-
# This method expects Model.common_update_data to have been merged in
|
646
|
-
# already by the calling method.
|
647
|
-
#
|
648
|
-
# @param collection [Symbol] Either :urls or :documents.
|
649
|
-
# @param query [Hash] The query used for the retrieval before updating.
|
650
|
-
# @param update [Hash] The updated/new object.
|
651
|
-
# @raise [StandardError] If the update fails.
|
652
|
-
# @return [Integer] The number of updated records/objects.
|
653
|
-
def mutate(collection, query, update)
|
654
|
-
assert_arr_type([query, update], Hash)
|
655
|
-
|
656
|
-
result = @client[collection.to_sym].update_one(query, update)
|
657
|
-
raise 'DB write(s) (update) failed' unless write_succeeded?(result)
|
658
|
-
|
659
|
-
result.n
|
660
|
-
ensure
|
661
|
-
@last_result = result
|
662
|
-
end
|
663
|
-
|
664
|
-
alias num_objects num_records
|
665
|
-
alias clear_db! clear_db
|
666
33
|
end
|
667
34
|
end
|