store-digest 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ require 'store/digest/version'
2
+
3
# Abstract mixin for storage drivers. Concrete drivers include this
# module and override {#setup}.
module Store::Digest::Driver
  # The LMDB driver is the only implementation so far; load it lazily.
  autoload :LMDB, 'store/digest/driver/lmdb'

  protected

  # Driver initialization hook. This base version always fails, so a
  # concrete driver has to provide its own.
  #
  # @param options [Hash] driver-specific keyword options (ignored here)
  # @raise [NotImplementedError] always
  def setup(**options)
    raise(NotImplementedError, 'gotta roll your own, holmes')
  end
end
@@ -0,0 +1,15 @@
1
+ require 'store/digest/driver'
2
+ require 'store/digest/blob/filesystem'
3
+ require 'store/digest/meta/lmdb'
4
+
5
# Driver that combines the filesystem blob store with the LMDB
# metadata store.
module Store::Digest::Driver::LMDB
  include Store::Digest::Driver
  include Store::Digest::Blob::FileSystem
  include Store::Digest::Meta::LMDB

  protected

  # Delegates initialization, unchanged, to the included modules.
  #
  # @param options [Hash] keyword options forwarded up the ancestor chain
  def setup(**options)
    super
  end
end
@@ -0,0 +1,7 @@
1
module Store
  class Digest
    # Abstract module for metadata operations; it defines nothing itself.
    module Meta; end
  end
end
@@ -0,0 +1,621 @@
1
+ require 'store/digest/meta'
2
+ require 'store/digest/trait'
3
+
4
+ require 'lmdb'
5
+ require 'uri/ni'
6
+
7
+ module Store::Digest::Meta::LMDB
8
+ include Store::Digest::Meta
9
+ include Store::Digest::Trait::RootDir
10
+
11
+ private
12
+
13
# Digest algorithm used as the primary key unless another is configured.
PRIMARY = :"sha-256"

# Supported digest algorithms mapped to their raw length in bytes.
DIGESTS = {
  md5:       16,
  "sha-1":   20,
  "sha-256": 32,
  "sha-384": 48,
  "sha-512": 64,
}.freeze

# Fields of an object record, in the order they are packed.
RECORD = %i[
  size ctime mtime ptime dtime flags type language charset encoding
].freeze

# Pack template for a whole object record, one directive per RECORD
# field: a 64-bit big-endian size, four 32-bit big-endian timestamps,
# one flag byte, then four NUL-terminated strings.
FORMAT = 'Q>NNNNCZ*Z*Z*Z*'.freeze

# The integer-valued record fields, each mapped to its coercion method.
INTS = %i[size ctime mtime ptime dtime flags].each_with_object({}) do |field, map|
  map[field] = :to_i
end.freeze

# Pack template for each individual control or object field.
PACK = {
  # control records
  objects:  'Q>',
  deleted:  'Q>',
  bytes:    'Q>',
  # object records
  size:     'Q>',
  ctime:    'N', # - also used in control
  mtime:    'N', # - ditto
  ptime:    'N',
  dtime:    'N',
  flags:    'C',
  type:     'Z*',
  language: 'Z*',
  charset:  'Z*',
  encoding: 'Z*',
}.each_value(&:freeze).freeze
44
+
45
# Unpack a stored record into a metadata hash.
#
# The record starts with the raw non-primary digests, in the order
# given by `algorithms`, followed by the FORMAT-packed RECORD fields.
# The primary digest is not in the record; it is passed in as `bin`.
#
# @param bin [String] raw primary digest
# @param rec [String] packed record (not modified)
# @return [Hash] RECORD fields plus `:digests`; zero timestamps and
#   empty strings come back as `nil`
def inflate(bin, rec)
  buf = rec.dup # slice! below is destructive; spare the caller's string

  digests = {}
  algorithms.each do |algo|
    uri = URI::NI.build(scheme: 'ni', path: "/#{algo}")
    uri.digest = algo == primary ? bin : buf.slice!(0, DIGESTS[algo])
    digests[algo] = uri
  end

  # size ctime mtime ptime dtime flags type language charset encoding
  meta = RECORD.zip(buf.unpack(FORMAT)).to_h
  meta[:digests] = digests

  # a zero timestamp means "not set"
  %i[ctime ptime mtime dtime].each do |field|
    stamp = meta[field]
    meta[field] = stamp == 0 ? nil : Time.at(stamp)
  end

  # likewise an empty string
  %i[type language charset encoding].each do |field|
    meta[field] = nil if meta[field].empty?
  end

  meta
end
66
+
67
# Pack an object's metadata into the binary record format: the raw
# non-primary digests first, then the FORMAT-packed RECORD fields.
#
# @param obj [Hash, #to_h] the metadata; must carry `:digests`
# @return [String] the packed record
def deflate(obj)
  meta = obj.is_a?(Hash) ? obj : obj.to_h

  secondary = algorithms - [primary]
  prefix = secondary.map { |algo| meta[:digests][algo].digest }.join

  # integer fields go through to_i (so nil becomes 0); the rest
  # through to_s (so nil becomes the empty string)
  fields = RECORD.map do |field|
    coerce = INTS.fetch(field, :to_s)
    meta[field].send(coerce)
  end

  prefix + fields.pack(FORMAT)
end
73
+
74
+ # NOTE these are all internal methods meant to be used inside other
75
+ # transactions so they do not run in transactions themselves
76
+
77
# Add `val` to a numeric control record and store the result.
#
# An existing value may be 4 or 8 bytes wide; the result is always
# written back as a 64-bit big-endian integer. A missing record
# counts as zero. Like its siblings, this does not open a
# transaction of its own.
#
# @param key [#to_s] name of the control record
# @param val [Integer] amount to add (may be negative)
# @return [Integer] the new value
# @raise [RuntimeError] if the stored value is neither 4 nor 8 bytes
def control_add(key, val)
  name = key.to_s
  raw = @dbs[:control][name]

  current = 0
  if raw
    template = { 4 => 'N', 8 => 'Q>' }.fetch(raw.length) do
      raise RuntimeError, "#{key} must be 4 or 8 bytes long"
    end
    current = raw.unpack1(template)
  end

  total = current + val
  @dbs[:control][name] = [total].pack('Q>')
  total
end
96
+
97
# Read a numeric control record.
#
# The valid keys are `ctime`, `mtime`, `objects`, `deleted` and
# `bytes`; the value is unpacked with the template PACK holds for
# that key. Does not open a transaction of its own.
#
# @param key [#to_sym] name of the control record
# @return [Integer, nil] the stored value, or `nil` if it is absent
# @raise [ArgumentError] if `key` is not a known control key
def control_get(key)
  key = key.to_sym
  # NB this has to be a symbol array (%i): with a plain %[...] string
  # literal, include? raises TypeError for every symbol it is given
  raise ArgumentError, "Invalid control key #{key}" unless
    %i[ctime mtime objects deleted bytes].include? key

  raw = @dbs[:control][key.to_s]
  raw.unpack1 PACK[key] if raw
end
105
+
106
# Convert an index key to its binary form.
#
# @param key [nil, Time, Integer, String] the value to pack
# @return [String, nil] a 32-bit big-endian integer for a Time, a
#   64-bit big-endian integer for an Integer, a binary copy for a
#   String, or `nil` for `nil`
# @raise [ArgumentError] for any other type
def index_pack(key)
  return if key.nil?

  case key
  when Time    then [key.to_i].pack('N')
  when Integer then [key].pack('Q>')
  when String  then key.b # no \0: key length is stored in the record
  else raise ArgumentError, "Invalid type #{key.class}"
  end
end
115
+
116
# Add a key/value pair to an index unless that exact pair is already
# there. A `nil` key is a no-op.
#
# @param index [Symbol] name of the index database
# @param key [nil, Time, Integer, String] the index key
# @param bin [String] the value to file under the key
# @return [Object, nil] whatever `put` returns, or `nil` if nothing
#   was written
def index_add(index, key, bin)
  packed = index_pack(key)
  return unless packed

  db = @dbs[index]
  # check first or it will just keep adding duplicate records
  db.put(packed, bin) unless db.has?(packed, bin)
end
121
+
122
# Remove a key/value pair from an index. A `nil` key is a no-op.
#
# @param index [#to_sym] name of the index database
# @param key [nil, Time, Integer, String] the index key
# @param bin [String] the value filed under the key
# @return [Object, nil] whatever `delete?` returns, or `nil` for a
#   `nil` key
def index_rm(index, key, bin)
  packed = index_pack(key)
  return unless packed

  # soft delete baleets only when there is something to baleet
  @dbs[index.to_sym].delete?(packed, bin)
end
127
+
128
# Walk an index, yielding each `[key, value]` pair found.
#
# With only `min` and `range` false this is an exact-match lookup:
# nothing is yielded unless the first record the cursor lands on has
# exactly the key `min`. With `max` given, or `range` true, the walk
# starts at the record the cursor's `set_range(min)` positions on
# (or at the first record when `min` is nil) and continues for as
# long as `next_range(max || min)` keeps returning records.
#
# Does not open a transaction of its own.
#
# @param index [#to_sym] name of the index database
# @param min [nil, Time, Integer, String] lower bound, or the exact key
# @param max [nil, Time, Integer, String] upper bound
# @param range [false, true] treat a lone `min` as a lower bound
#   rather than as an exact key
# @yieldparam key [String] the packed index key
# @yieldparam value [String] the value stored under that key
# @return [Enumerator, nil] an enumerator if no block is given;
#   `nil` otherwise, and also when both bounds are nil
def index_get(index, min, max = nil, range: false, &block)
  # min and max will be binary values and the cursor will return a range
  min = index_pack(min)
  max = index_pack(max)
  return unless min || max

  # the bounds are already packed here; index_pack leaves strings as
  # they are so re-entering is safe. `range` has to be forwarded too,
  # or the enumerator quietly degrades to an exact-match lookup.
  return enum_for :index_get, index, min, max, range: range unless
    block_given?

  body = -> c do
    # lmdb cursors are awkward because 'set' advances the cursor, so
    # the first record has to be handled outside the loop
    if rec = (min ? c.set_range(min) : c.first)
      return unless range or max or min == rec.first
      block.call(*rec)
      block.call(*rec) while rec = c.next_range(max || min)
    end
  end

  @dbs[index.to_sym].cursor(&body)
  nil
end
151
+
152
+ protected
153
+
154
# Open the LMDB environment and create, or validate against, the
# control records and index databases.
#
# On first use the algorithm list, primary algorithm, timestamps and
# zeroed counters are written to the `control` database. On later
# runs the supplied algorithm options must match what is stored.
#
# @param options [Hash] keyword options; also passed on via `super`
# @option options [Integer] :mapsize (2**27) LMDB map size
# @option options [Array<Symbol>] :algorithms (DIGESTS.keys) digest
#   algorithms to keep
# @option options [Symbol] :primary (PRIMARY) the primary algorithm
# @raise [ArgumentError] if an option is malformed or conflicts with
#   what the database was created with
def setup **options
  # dir/umask
  # NOTE(review): `dir` and `umask` are not defined in this module;
  # presumably `super` (Trait::RootDir) establishes them -- confirm
  super

  # now initialize our part
  mapsize = options[:mapsize] || 2**27
  raise ArgumentError, 'Mapsize must be a positive integer' unless
    mapsize.is_a? Integer and mapsize > 0

  # file mode is rw for everybody, minus whatever the umask takes away
  lmdbopts = { mode: 0666 & ~umask, mapsize: mapsize }
  @lmdb = ::LMDB.new dir, lmdbopts

  algos = options[:algorithms] || DIGESTS.keys
  raise ArgumentError, "Invalid algorithm specification #{algos}" unless
    algos.is_a? Array and (algos - DIGESTS.keys).empty?

  popt = options[:primary] || PRIMARY
  raise ArgumentError, "Invalid primary algorithm #{popt}" unless
    popt.is_a? Symbol and DIGESTS[popt]

  @lmdb.transaction do
    # the control database has to be open before `algorithms` and
    # `primary` are called below, since both read from it
    @dbs = { control: @lmdb.database('control', create: true) }

    # `algorithms` returns nil until the list has been stored, which
    # is how a fresh database is told apart from an existing one
    if a = algorithms
      raise ArgumentError,
        "Supplied algorithms #{algos.sort} do not match instantiated #{a}" if
        algos.sort != a
    else
      a = algos.sort
      @dbs[:control]['algorithms'] = a.join ?,
    end

    # same again for the primary algorithm
    if pri = primary
      raise ArgumentError,
        "Supplied algorithm #{popt} does not match instantiated #{pri}" if
        popt != pri
    else
      pri = popt
      @dbs[:control]['primary'] = popt.to_s
    end

    # stamp creation/modification times, but only if they are absent
    now = Time.now
    %w[ctime mtime].each do |t|
      unless @dbs[:control].has? t
        @dbs[:control][t] = [now.to_i].pack ?N
      end
    end

    # clever if i do say so myself
    # (the reader methods `objects`, `deleted` and `bytes` return nil
    # while their record is missing; that is when it gets zeroed)
    %w[objects deleted bytes].each do |x|
      @dbs[:control][x] = [0].pack 'Q>' unless send(x.to_sym)
    end

    # one dupsort database per record field to act as an index, and
    # one plain database per digest algorithm
    # XXX we might actually wanna dupsort the non-primary digests too
    dbs = RECORD.map do |k|
      [k, [:dupsort]]
    end.to_h.merge(a.map { |k| [k, []] }.to_h)

    # open (creating if need be) each database with its flags turned
    # into keyword options, then freeze the finished table
    @dbs.merge!(dbs.map do |name, flags|
      [name, @lmdb.database(name.to_s,
        (flags + [:create]).map { |f| [f, true] }.to_h)]
    end.to_h).freeze
  end

  @lmdb.sync
end
220
+
221
# Returns a metadata hash or `nil` if no changes have been made. A
# common scenario is that the caller will attempt to store an object
# that is already present, with the only distinction being `:ctime`
# (which is always ignored) and/or `:mtime`. Setting the `:preserve`
# keyword parameter to a true value will cause any new value for
# `:mtime` to be ignored as well. In that case, an attempt to store
# an otherwise identical record overtop of an existing one will
# return `nil`.
#
# Besides the record itself this maintains the secondary digest
# lookups, the `objects`/`deleted`/`bytes` counters, the per-field
# indexes and the store-wide `mtime`, all in a single transaction.
#
# @param obj [Store::Digest::Object] the object to store
# @param preserve [false, true] whether to preserve the mtime
# @return [nil, Hash] maybe the metadata content of the object
# @raise [ArgumentError] if `obj` lacks a digest for any of the
#   configured algorithms
def set_meta obj, preserve: false
  raise ArgumentError,
    'Object does not have a complete set of digests' unless
    (algorithms - obj.algorithms).empty?

  body = -> do
    # noop if object is present and not deleted and no details have changed
    bin = obj[primary].digest
    newh = obj.to_h
    now = Time.now

    # +1 if the incoming object is live, -1 if it is marked deleted;
    # for an existing record the merge below resets this to 0 when
    # the deletion state is not actually changing
    change = newh[:dtime] ? -1 : 1 # net change in records
    oldrec = @dbs[primary][bin]
    oldh = nil
    newh = if oldrec
      oldh = inflate bin, oldrec
      # the block only runs for keys present on both sides
      oldh.merge(newh) do |k, ov, nv|
        case k
        when :ctime then ov # never overwrite ctime
        when :mtime # only overwrite the mtime if specified
          preserve ? (ov || nv || now) : (nv || ov || now)
        when :ptime then nv || ov || now # XXX derive ptime?
        when :dtime
          # net change is zero if both or neither are set
          change = 0 if (nv && ov) || (!nv && !ov)
          nv
        else nv
        end
      end
    else
      # brand new record: default any missing timestamps to now
      %i[ctime mtime ptime].each { |k| newh[k] ||= now }
      newh
    end
    newrec = deflate newh

    # we have to *break* out of blocks, not return!
    # (ah but we can return from a lambda)
    return if newrec == oldrec
    # a common scenario is a write where nothing differs but the
    # mtime; with `preserve` set, such a write ends here as a no-op

    # these only need to be done if they haven't been done before
    (algorithms - [primary]).each do |algo|
      @dbs[algo][obj[algo].digest] = bin
    end unless oldrec

    # this only needs to be done if there are changes
    @dbs[primary][bin] = newrec

    # if old dtime is nil and new dtime is non-nil then we are deleting
    # if old dtime is non-nil and new dtime is nil then we are restoring

    if !oldrec
      # new record: increment object count (by 1), increment byte
      # count (by size)
      control_add :objects, 1
      if change > 0
        control_add :bytes, newh[:size]
      elsif change < 0
        # note objects *and* deleted counts get incremented;
        # allowing for the possibility that a fresh object can be
        # added to the store "deleted".
        control_add :deleted, 1
      end
    elsif change > 0
      # restored record: decrement deleted count (by 1), increment
      # byte count (by size)
      control_add :deleted, -1
      control_add :bytes, newh[:size]
    elsif change < 0
      # "deleted" record: increment deleted count (by 1), decrement
      # byte count (by size)
      control_add :deleted, 1
      control_add :bytes, -newh[:size]
    end
    # otherwise do nothing

    # note that actually *removing* a record is a separate process.

    # okay now we update the indexes
    RECORD.each do |k|
      # drop the stale index entry only if the field's value changed
      index_rm k, oldh[k], bin if oldh and oldh[k] and oldh[k] != newh[k]
      index_add k, newh[k], bin # will noop on nil
    end

    # and finally update the mtime
    @dbs[:control]['mtime'] = [now.to_i].pack ?N

    newh
  end

  @lmdb.transaction do
    body.call
  end
end
328
+
329
# Look up the stored metadata for an object.
#
# The primary digest is used when the object has one. Otherwise the
# object's longest digest (ties broken by name) is resolved to the
# primary digest through that algorithm's lookup database.
#
# @param obj [#[], #scanned?, #algorithms] the object to look up
# @return [Hash, nil] the inflated record, or `nil` if not on file
# @raise [ArgumentError] if the object has neither a primary digest
#   nor been scanned
def get_meta(obj)
  @lmdb.transaction do
    # pick the algorithm to search by
    algo = primary
    unless obj[primary]
      raise ArgumentError, 'Object must have digests' unless
        obj.scanned?
      ranked = obj.algorithms.sort do |a, b|
        by_length = DIGESTS[b] <=> DIGESTS[a]
        by_length == 0 ? a <=> b : by_length
      end
      algo = ranked.first
    end

    bin = obj[algo].digest

    # translate a secondary digest into the primary one
    bin = @dbs[algo][bin] unless algo == primary

    # a missing master record should never happen, but yields nil
    rec = bin && @dbs[primary][bin]

    # hand back a plain hash of all the elements
    inflate(bin, rec) if rec
  end
end
360
+
361
# Remove an object's metadata: its index entries, its digest records
# (the primary included) and its share of the counters.
#
# @param obj [Object] anything `get_meta` accepts
# @return [Hash, nil] the removed metadata, with `:dtime` set to the
#   time of removal if it was not already set; `nil` if the object
#   was not on file
def remove_meta(obj)
  @lmdb.transaction do
    meta = get_meta(obj)
    next unless meta

    key = meta[:digests][primary].digest
    stamp = Time.now

    RECORD.each { |field| index_rm field, meta[field], key }
    meta[:digests].each { |algo, uri| @dbs[algo].delete uri.digest }

    # remove counts: an already-"deleted" object gave up its bytes
    # when it was marked, so only the deleted tally shrinks
    control_add :objects, -1
    if meta[:dtime]
      control_add :deleted, -1
    else
      control_add :bytes, -meta[:size]
      meta[:dtime] = stamp
    end

    # and finally update the mtime
    @dbs[:control]['mtime'] = [stamp.to_i].pack('N')

    meta
  end
end
389
+
390
# Mark an object's metadata as deleted without removing the record.
#
# Stamps `:dtime` with the current time, moves the object's bytes out
# of the `bytes` counter and into the `deleted` tally, and refreshes
# the indexes and the store-wide `mtime`.
#
# @param obj [#to_h] the object to mark; anything `get_meta` accepts
# @return [Hash, nil] the updated metadata, or `nil` if the object is
#   not on file or is already marked deleted
def mark_meta_deleted(obj)
  body = -> do
    # the object has to be in here to delete it
    oldh = get_meta(obj) or return
    # if the object is already "deleted" we do nothing
    return if oldh[:dtime]

    bin = oldh[:digests][primary].digest
    now = Time.now

    newh = oldh.merge(obj.to_h) do |k, ov, nv|
      case k
      when :digests then ov # - old values are guaranteed complete
      when :size then ov    # - we don't trust the new value
      when :type then ov    # - this gets set by default
      else nv || ov
      end
    end
    # what we came here to do. This is set outside the merge because
    # the merge block only runs for keys present on both sides: were
    # `obj.to_h` to lack :dtime, the record would stay un-deleted
    # while the counters below changed anyway.
    newh[:dtime] = now

    @dbs[primary][bin] = deflate(newh)
    control_add :deleted, 1
    control_add :bytes, -newh[:size]

    # okay now we update the indexes
    RECORD.each do |k|
      index_rm k, oldh[k], bin if oldh[k] and oldh[k] != newh[k]
      index_add k, newh[k], bin # will noop on nil
    end

    # and finally update the mtime
    @dbs[:control]['mtime'] = [now.to_i].pack ?N

    newh
  end

  @lmdb.transaction do
    body.call
  end
end
430
+
431
# Collect store-wide statistics.
#
# @return [Hash] `:ctime` and `:mtime` as Time objects; `:objects`,
#   `:deleted` and `:bytes` as integers; and `:types`, `:languages`,
#   `:charsets` and `:encodings`, each a hash mapping an index key to
#   the number of records filed under it
def meta_get_stats
  @lmdb.transaction do
    control = @dbs[:control]

    stats = {}
    %i[ctime mtime objects deleted bytes].each do |name|
      stats[name] = control[name.to_s].unpack1(PACK[name])
    end

    # fix the times
    %i[ctime mtime].each { |name| stats[name] = Time.at(stats[name]) }

    # get counts on all the countables
    %i[type language charset encoding].each do |dim|
      index = @dbs[dim]
      stats[:"#{dim}s"] =
        index.keys.map { |key| [key, index.cardinality(key)] }.to_h
    end

    # would love to do min/max size/dates/etc but that is going to
    # take some lower-level cursor finessing

    stats
  end
end
452
+
453
+ public
454
+
455
# Run the given block inside an LMDB transaction. The block is called
# with no arguments.
#
# @return [Object] whatever the underlying transaction call returns
def transaction(&block)
  @lmdb.transaction { block.call }
end
460
+
461
# Return the set of algorithms initialized in the database. The list
# is read from the control record once and memoized after that.
# @return [Array<Symbol>, nil] the algorithms, or `nil` if none have
#   been recorded yet
def algorithms
  @algorithms ||= @lmdb.transaction do
    raw = @dbs[:control]['algorithms']
    raw.strip.split(/\s*,+\s*/).map(&:to_sym) if raw
  end
end
471
+
472
# Return the primary digest algorithm. The value is read from the
# control record once and memoized after that.
# @return [Symbol, nil] the primary algorithm, or `nil` if none has
#   been recorded yet
def primary
  @primary ||= @lmdb.transaction do
    raw = @dbs[:control]['primary']
    raw.strip.to_sym if raw
  end
end
481
+
482
# Return the number of objects in the database.
# @return [Integer, nil] the count, or `nil` if it is not recorded
def objects
  @lmdb.transaction do
    raw = @dbs[:control]['objects']
    # 64-bit unsigned network-endian integer
    raw.unpack1('Q>') if raw
  end
end
491
+
492
# Return the number of objects whose payloads are deleted but are
# still on record.
# @return [Integer, nil] the count, or `nil` if it is not recorded
def deleted
  @lmdb.transaction do
    raw = @dbs[:control]['deleted']
    raw.unpack1('Q>') if raw
  end
end
502
+
503
# Return the number of bytes stored in the database (notwithstanding
# the database itself).
# @return [Integer, nil] the count, or `nil` if it is not recorded
def bytes
  @lmdb.transaction do
    raw = @dbs[:control]['bytes']
    raw.unpack1('Q>') if raw
  end
end
513
+
514
# The filter parameters `list` understands; each names an index
# database.
PARAMS = %i[type charset encoding language
            size ctime mtime ptime dtime].freeze

# Return a list of objects matching the given criteria. The result
# set will be the intersection of all supplied parameters. `:type`,
# `:charset`, `:encoding`, and `:language` are treated like discrete
# sets, while the rest of the parameters are treated like ranges
# (two-element arrays). Single values will be coerced into arrays;
# single range values will be interpreted as an inclusive lower
# bound. To bound only at the top, use a two-element array with its
# first value `nil`, like so: `size: [nil, 31337]`. The sorting
# criteria are the symbols of the other parameters.
#
# NOTE: `sort` is accepted but not yet acted upon; objects come back
# in the order they were retrieved.
#
# @param type [nil, String, #to_a]
# @param charset [nil, String, #to_a]
# @param encoding [nil, String, #to_a]
# @param language [nil, String, #to_a]
# @param size [nil, Integer, #to_a] byte size range
# @param ctime [nil, Time, DateTime, #to_a] creation time range
# @param mtime [nil, Time, DateTime, #to_a] modification time range
# @param ptime [nil, Time, DateTime, #to_a] metadata property change range
# @param dtime [nil, Time, DateTime, #to_a] deletion time range
# @param sort [nil, Symbol, #to_a] sorting criteria (currently unused)
# @return [Array] the list
def list type: nil, charset: nil, encoding: nil, language: nil,
    size: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil, sort: nil
  # coerce all the inputs into arrays, dropping the ones left empty
  b = binding
  params = {}
  PARAMS.each do |key|
    val = b.local_variable_get key
    val = case val
          when nil then []
          when Time then [val]
          # only consult DateTime if the date library is loaded, so a
          # plain string filter cannot trip over a missing constant
          when -> (v) { defined?(::DateTime) && ::DateTime === v }
            [val.to_time]
          when -> (v) { v.respond_to? :to_a } then val.to_a
          else [val]
          end
    params[key] = val unless val.empty?
  end

  # find the smallest denominator: scan the supplied parameter whose
  # index holds the fewest entries, then filter by the others
  index = params.keys.min_by { |k| @dbs[k].size }

  out = {}

  # resolve a raw primary digest to its object, once per digest
  fetch = lambda do |bin|
    u = URI("ni:///#{primary};")
    u.digest = bin
    out[u] ||= get u
  end

  @lmdb.transaction do
    if index
      if INTS[index]
        # numeric and time parameters are (min, max) ranges
        index_get(index, *params[index], range: true) { |_, v| fetch.call v }
      else
        # the rest are discrete sets of exact values
        params[index].each do |val|
          index_get(index, val) { |_, v| fetch.call v }
        end
      end

      # now whittle the candidates down by the remaining parameters
      rest = params.keys - [index]
      unless rest.empty?
        out.select! do |_, obj|
          rest.all? do |param|
            if val = obj.send(param)
              if INTS[param]
                min, max = params[param]
                if min && max
                  val >= min && val <= max
                elsif min
                  val >= min
                elsif max
                  val <= max
                end
              else
                params[param].include? val
              end
            else
              false
            end
          end
        end
      end
    else
      # if we aren't filtering at all we can just obtain everything
      @dbs[primary].cursor do |c|
        while rec = c.next
          fetch.call rec.first
        end
      end
    end
  end

  # sorting is not implemented yet (see the note above)
  out.values
end
620
+
621
+ end