store-digest 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
+ require 'store/digest/version'
2
+
3
# Abstract mixin for storage drivers. Concrete drivers include this
# module and provide their own +setup+.
module Store::Digest::Driver
  # The only concrete driver so far; loaded lazily on first reference.
  autoload :LMDB, 'store/digest/driver/lmdb'

  protected

  # Driver initialization hook. This base version always fails, so a
  # concrete driver has to override it.
  #
  # @param options [Hash] driver-specific keyword options (unused here)
  # @raise [NotImplementedError] always
  def setup(**options)
    raise NotImplementedError, 'gotta roll your own, holmes'
  end
end
@@ -0,0 +1,15 @@
1
+ require 'store/digest/driver'
2
+ require 'store/digest/blob/filesystem'
3
+ require 'store/digest/meta/lmdb'
4
+
5
# Driver that combines the filesystem blob store with the LMDB
# metadata store.
module Store::Digest::Driver::LMDB
  include Store::Digest::Driver
  include Store::Digest::Blob::FileSystem
  include Store::Digest::Meta::LMDB

  protected

  # Delegates initialization to the included modules' +setup+ chain,
  # forwarding the keyword options untouched.
  #
  # @param options [Hash] keyword options passed up the chain
  def setup(**options)
    super
  end
end
@@ -0,0 +1,7 @@
1
module Store
  class Digest
    # Abstract namespace for metadata operations; concrete metadata
    # back-ends are defined as modules underneath it.
    module Meta
    end
  end
end
@@ -0,0 +1,621 @@
1
+ require 'store/digest/meta'
2
+ require 'store/digest/trait'
3
+
4
+ require 'lmdb'
5
+ require 'uri/ni'
6
+
7
+ module Store::Digest::Meta::LMDB
8
+ include Store::Digest::Meta
9
+ include Store::Digest::Trait::RootDir
10
+
11
+ private
12
+
13
# Primary digest algorithm used when the caller does not pick one.
PRIMARY = :"sha-256"

# Supported digest algorithms mapped to their raw digest length in bytes.
DIGESTS = {
  :md5       => 16,
  :"sha-1"   => 20,
  :"sha-256" => 32,
  :"sha-384" => 48,
  :"sha-512" => 64,
}.freeze

# Pack/unpack template for the fixed part of an object record; the
# fields appear in the order given by RECORD.
FORMAT = 'Q>NNNNCZ*Z*Z*Z*'.freeze

# Field names of an object record, in FORMAT order.
RECORD = [
  :size, :ctime, :mtime, :ptime, :dtime, :flags,
  :type, :language, :charset, :encoding
].freeze

# Record fields serialized as integers, mapped to the coercion method
# applied to them; every other field is coerced with #to_s.
INTS = %i[size ctime mtime ptime dtime flags].each_with_object({}) do |k, h|
  h[k] = :to_i
end.freeze

# Pack template for each individual control or record field.
PACK = {
  # control records
  objects:  'Q>',
  deleted:  'Q>',
  bytes:    'Q>',
  # object records
  size:     'Q>',
  ctime:    'N',  # also used as a control record
  mtime:    'N',  # ditto
  ptime:    'N',
  dtime:    'N',
  flags:    'C',
  type:     'Z*',
  language: 'Z*',
  charset:  'Z*',
  encoding: 'Z*',
}.each_value(&:freeze).freeze
44
+
45
# Turn a stored record back into a metadata hash.
#
# The record starts with the raw non-primary digests (in the order of
# #algorithms), followed by the fields of RECORD packed with FORMAT.
#
# @param bin [String] raw primary digest (the record's key)
# @param rec [String] raw record value; it is not modified
# @return [Hash] RECORD fields plus :digests (algorithm => URI::NI);
#   zero timestamps and empty strings come back as nil
def inflate bin, rec
  rest    = rec.dup # slice! below is destructive
  digests = {}
  algorithms.each do |algo|
    ni = URI::NI.build(scheme: 'ni', path: "/#{algo}")
    ni.digest = algo == primary ? bin : rest.slice!(0, DIGESTS[algo])
    digests[algo] = ni
  end

  # size ctime mtime ptime dtime flags type language charset encoding
  values = rest.unpack(FORMAT)
  out = {}
  RECORD.each_with_index { |field, i| out[field] = values[i] }
  out[:digests] = digests

  # a zero timestamp means "not set"
  %i[ctime ptime mtime dtime].each do |field|
    secs = out[field]
    out[field] = secs == 0 ? nil : Time.at(secs)
  end

  # so does an empty string
  %i[type language charset encoding].each do |field|
    out[field] = nil if out[field].empty?
  end

  out
end
66
+
67
# Serialize a metadata hash (or anything responding to #to_h) into the
# stored record format: the raw non-primary digests followed by the
# RECORD fields packed with FORMAT.
#
# @param obj [Hash, #to_h] metadata including a :digests hash
# @return [String] the raw record value
def deflate obj
  meta = obj.is_a?(Hash) ? obj : obj.to_h

  # the primary digest is the key, so only the others go in the value
  prefix = (algorithms - [primary]).map { |algo| meta[:digests][algo].digest }.join

  # nil becomes 0 or "" depending on the field type
  fields = RECORD.map do |field|
    value = meta[field]
    INTS.key?(field) ? value.to_i : value.to_s
  end

  prefix + fields.pack(FORMAT)
end
73
+
74
+ # NOTE these are all internal methods meant to be used inside other
75
+ # transactions so they do not run in transactions themselves
76
+
77
# Add +val+ to a numeric control record and store the result.
#
# An existing value may be 4 bytes (32-bit big-endian) or 8 bytes
# (64-bit big-endian); a missing value counts as zero. The result is
# always written back as 8 bytes.
#
# @param key [#to_s] name of the control record
# @param val [Integer] amount to add (may be negative)
# @return [Integer] the new value
# @raise [RuntimeError] if the stored value is neither 4 nor 8 bytes
def control_add key, val
  name = key.to_s
  raw  = @dbs[:control][name]

  current =
    if raw
      case raw.length
      when 4 then raw.unpack1('N')
      when 8 then raw.unpack1('Q>')
      else raise RuntimeError, "#{key} must be 4 or 8 bytes long"
      end
    else
      0
    end

  total = current + val
  @dbs[:control][name] = [total].pack('Q>')
  total
end
96
+
97
# Read a numeric control record.
#
# @param key [#to_sym] one of ctime, mtime, objects, deleted, bytes
# @return [Integer, nil] the unpacked value, or nil if it is not set
# @raise [ArgumentError] if +key+ is not a known control key
def control_get key
  key = key.to_sym
  # %i[] makes a symbol array; the former %[] was a plain string, so
  # the membership test raised TypeError for every key
  raise ArgumentError, "Invalid control key #{key}" unless
    %i[ctime mtime objects deleted bytes].include? key

  raw = @dbs[:control][key.to_s]
  raw.unpack1 PACK[key] if raw
end
105
+
106
# Convert an index key to its binary form.
#
# @param key [nil, Time, Integer, String] the key to convert
# @return [String, nil] 32-bit big-endian seconds for a Time, 64-bit
#   big-endian for an Integer, a binary copy for a String, nil for nil
# @raise [ArgumentError] for any other type
def index_pack key
  return if key.nil?
  return [key.to_i].pack('N') if key.is_a? Time
  return [key].pack('Q>') if key.is_a? Integer
  # no \0: key length is stored in the record
  return key.b if key.is_a? String

  raise ArgumentError, "Invalid type #{key.class}"
end
115
+
116
# Add a (key, primary digest) pair to an index database unless that
# exact pair is already there. Does nothing for a nil key.
#
# @param index [Symbol] name of the index database in @dbs
# @param key [nil, Time, Integer, String] the index key
# @param bin [String] raw primary digest to store under the key
def index_add index, key, bin
  packed = index_pack(key)
  return unless packed

  # check first, otherwise duplicate records just keep piling up
  db = @dbs[index]
  db.put(packed, bin) unless db.has?(packed, bin)
end
121
+
122
# Remove a (key, primary digest) pair from an index database. Does
# nothing for a nil key.
#
# @param index [#to_sym] name of the index database in @dbs
# @param key [nil, Time, Integer, String] the index key
# @param bin [String] raw primary digest stored under the key
# @return whatever the database's +delete?+ returns, or nil
def index_rm index, key, bin
  packed = index_pack(key)
  return unless packed

  # delete? only removes the pair when it is actually there
  @dbs[index.to_sym].delete?(packed, bin)
end
127
+
128
# Walk an index database and yield each matching (key, value) pair.
#
# * +min+ only, +range+ false: exact match on +min+.
# * +min+ only, +range+ true: +min+ is an inclusive lower bound and
#   the walk continues to the end of the index.
# * +max+ given: keys up to and including +max+, starting at +min+
#   (or at the first record when +min+ is nil).
#
# Must be called inside a transaction; it does not open its own.
#
# @param index [#to_sym] name of the index database in @dbs
# @param min [nil, Time, Integer, String] lower bound / exact key
# @param max [nil, Time, Integer, String] inclusive upper bound
# @param range [false, true] treat a lone +min+ as a lower bound
# @yieldparam key [String] the raw index key
# @yieldparam value [String] the raw primary digest
# @return [Enumerator, nil] an enumerator when no block is given;
#   otherwise nil (also nil when both bounds are nil)
def index_get index, min, max = nil, range: false, &block
  # bounds are compared in their binary form
  lo = index_pack(min)
  hi = index_pack(max)
  return unless lo || hi

  # the packed bounds pass through index_pack unchanged on re-entry;
  # range: has to be forwarded or the enumerator forgets it
  return enum_for :index_get, index, lo, hi, range: range unless block_given?

  @dbs[index.to_sym].cursor do |c|
    # 'set' already advances the cursor, so the first record has to be
    # handled separately from the rest of the walk
    rec = lo ? c.set_range(lo) : c.first
    next unless rec
    # without a range, only an exact hit on the first key counts
    next unless range or hi or lo == rec.first
    # the first record may already lie beyond the upper bound
    next if hi && rec.first > hi

    block.call(*rec)
    if range && hi.nil?
      # open-ended lower bound: keep going until the index runs out
      block.call(*rec) while rec = c.next
    else
      block.call(*rec) while rec = c.next_range(hi || lo)
    end
  end

  nil
end
151
+
152
+ protected
153
+
154
# Open (and if necessary initialize) the LMDB environment.
#
# Runs the ancestors' +setup+ first, then opens the environment in
# +dir+, checks or records the algorithm set and primary algorithm,
# seeds the control records, and opens one database per record field
# and per digest algorithm.
#
# @param options [Hash] keyword options; this module reads :mapsize
#   (default 2**27), :algorithms (default all of DIGESTS) and :primary
#   (default PRIMARY)
# @raise [ArgumentError] on invalid options, or when they disagree
#   with what an existing database was created with
def setup **options
  # dir/umask
  super

  mapsize = options[:mapsize] || 2**27
  unless mapsize.is_a?(Integer) && mapsize > 0
    raise ArgumentError, 'Mapsize must be a positive integer'
  end

  env_opts = { mode: 0o666 & ~umask, mapsize: mapsize }
  @lmdb = ::LMDB.new(dir, env_opts)

  wanted = options[:algorithms] || DIGESTS.keys
  unless wanted.is_a?(Array) && (wanted - DIGESTS.keys).empty?
    raise ArgumentError, "Invalid algorithm specification #{wanted}"
  end

  chosen = options[:primary] || PRIMARY
  unless chosen.is_a?(Symbol) && DIGESTS[chosen]
    raise ArgumentError, "Invalid primary algorithm #{chosen}"
  end

  @lmdb.transaction do
    control = @lmdb.database('control', create: true)
    @dbs = { control: control }

    # an existing database dictates the algorithm set
    active = algorithms
    if active
      if wanted.sort != active
        raise ArgumentError,
          "Supplied algorithms #{wanted.sort} do not match instantiated #{active}"
      end
    else
      active = wanted.sort
      control['algorithms'] = active.join(',')
    end

    # ... and the primary algorithm
    current = primary
    if current
      if chosen != current
        raise ArgumentError,
          "Supplied algorithm #{chosen} does not match instantiated #{current}"
      end
    else
      control['primary'] = chosen.to_s
    end

    # timestamps are only seeded once
    stamp = Time.now
    %w[ctime mtime].each do |t|
      control[t] = [stamp.to_i].pack('N') unless control.has?(t)
    end

    # the public readers of the same name return nil when unset
    %w[objects deleted bytes].each do |counter|
      control[counter] = [0].pack('Q>') unless send(counter.to_sym)
    end

    # XXX we might actually wanna dupsort the non-primary digests too
    flags = {}
    RECORD.each { |name| flags[name] = [:dupsort] }
    active.each { |name| flags[name] = [] }

    opened = flags.each_with_object({}) do |(name, list), acc|
      db_opts = {}
      (list + [:create]).each { |flag| db_opts[flag] = true }
      acc[name] = @lmdb.database(name.to_s, db_opts)
    end
    @dbs.merge!(opened).freeze
  end

  @lmdb.sync
end
220
+
221
+ # Returns a metadata hash or `nil` if no changes have been made. A
222
+ # common scenario is that the caller will attempt to store an object
223
+ # that is already present, with the only distinction being `:ctime`
224
+ # (which is always ignored) and/or `:mtime`. Setting the `:preserve`
225
+ # keyword parameter to a true value will cause any new value for
226
+ # `:mtime` to be ignored as well. In that case, an attempt to store
227
+ # an otherwise identical record overtop of an existing one will
228
+ # return `nil`.
229
+ #
230
+ # @param obj [Store::Digest::Object] the object to store
231
+ # @param preserve [false, true] whether to preserve the mtime
232
+ # @return [nil, Hash] maybe the metadata content of the object
233
+ def set_meta obj, preserve: false
234
+ raise ArgumentError,
235
+ 'Object does not have a complete set of digests' unless
236
+ (algorithms - obj.algorithms).empty?
237
+
238
+ body = -> do
239
+ # noop if object is present and not deleted and no details have changed
240
+ bin = obj[primary].digest
241
+ newh = obj.to_h
242
+ now = Time.now
243
+
244
+ change = newh[:dtime] ? -1 : 1 # net change in records
245
+ oldrec = @dbs[primary][bin]
246
+ oldh = nil
247
+ newh = if oldrec
248
+ oldh = inflate bin, oldrec
249
+ oldh.merge(newh) do |k, ov, nv|
250
+ case k
251
+ when :ctime then ov # never overwrite ctime
252
+ when :mtime # only overwrite the mtime if specified
253
+ preserve ? (ov || nv || now) : (nv || ov || now)
254
+ when :ptime then nv || ov || now # XXX derive ptime?
255
+ when :dtime
256
+ # net change is zero if both or neither are set
257
+ change = 0 if (nv && ov) || (!nv && !ov)
258
+ nv
259
+ else nv
260
+ end
261
+ end
262
+ else
263
+ %i[ctime mtime ptime].each { |k| newh[k] ||= now }
264
+ newh
265
+ end
266
+ newrec = deflate newh
267
+
268
+ # we have to *break* out of blocks, not return!
269
+ # (ah but we can return from a lambda)
270
+ return if newrec == oldrec
271
+ # anyway a common scenario is a write where nothing is different
272
+ # but the mtime, so thepurpose
273
+
274
+ # these only need to be done if they haven't been done before
275
+ (algorithms - [primary]).each do |algo|
276
+ @dbs[algo][obj[algo].digest] = bin
277
+ end unless oldrec
278
+
279
+ # this only needs to be done if there are changes
280
+ @dbs[primary][bin] = newrec
281
+
282
+ # if old dtime is nil and new dtime is non-nil then we are deleting
283
+ # if old dtime is non-nil and new dtime is nil then we are restoring
284
+
285
+ if !oldrec
286
+ # new record: increment object count (by 1), increment byte
287
+ # count (by size)
288
+ control_add :objects, 1
289
+ if change > 0
290
+ control_add :bytes, newh[:size]
291
+ elsif change < 0
292
+ # note objects *and* deleted counts get incremented;
293
+ # allowing for the possibility that a fresh object can be
294
+ # added to the store "deleted".
295
+ control_add :deleted, 1
296
+ end
297
+ elsif change > 0
298
+ # restored record: decrement deleted count (by 1), increment
299
+ # byte count (by size)
300
+ control_add :deleted, -1
301
+ control_add :bytes, newh[:size]
302
+ elsif change < 0
303
+ # "deleted" record: increment deleted count (by 1), decrement
304
+ # byte count (by size)
305
+ control_add :deleted, 1
306
+ control_add :bytes, -newh[:size]
307
+ end
308
+ # otherwise do nothing
309
+
310
+ # note that actually *removing* a record is a separate process.
311
+
312
+ # okay now we update the indexes
313
+ RECORD.each do |k|
314
+ index_rm k, oldh[k], bin if oldh and oldh[k] and oldh[k] != newh[k]
315
+ index_add k, newh[k], bin # will noop on nil
316
+ end
317
+
318
+ # and finally update the mtime
319
+ @dbs[:control]['mtime'] = [now.to_i].pack ?N
320
+
321
+ newh
322
+ end
323
+
324
+ @lmdb.transaction do
325
+ body.call
326
+ end
327
+ end
328
+
329
# Look up the stored metadata for an object.
#
# Uses the primary digest when the object has one; otherwise falls
# back to the object's longest digest (ties broken by name) and
# resolves the primary digest through that algorithm's database.
#
# @param obj [#[], #algorithms, #scanned?] the object to look up
# @return [Hash, nil] the inflated metadata, or nil if not on record
# @raise [ArgumentError] if there is no primary digest and the object
#   has not been scanned
def get_meta obj
  @lmdb.transaction do
    algo =
      if obj[primary]
        primary
      else
        raise ArgumentError, 'Object must have digests' unless obj.scanned?
        # longest digest first, then alphabetical
        obj.algorithms.min do |a, b|
          by_length = DIGESTS[b] <=> DIGESTS[a]
          by_length == 0 ? a <=> b : by_length
        end
      end
    key = obj[algo].digest

    # a secondary digest only gets us the primary digest
    unless algo == primary
      key = @dbs[algo][key]
      next unless key
    end

    # actually raise maybe? because this should never happen
    raw = @dbs[primary][key]
    next unless raw

    inflate key, raw
  end
end
360
+
361
# Remove an object's metadata entirely: its index entries, its digest
# records (primary included) and its share of the counters.
#
# @param obj [Store::Digest::Object] the object to remove
# @return [Hash, nil] the metadata that was removed, with :dtime set
#   to the removal time if it was not already set; nil if the object
#   is not on record
def remove_meta obj
  @lmdb.transaction do
    meta = get_meta(obj)
    next unless meta

    pkey  = meta[:digests][primary].digest
    stamp = Time.now

    RECORD.each { |field| index_rm field, meta[field], pkey }
    meta[:digests].each { |algo, uri| @dbs[algo].delete uri.digest }

    # adjust the counters
    control_add :objects, -1
    if meta[:dtime]
      # already "deleted": its bytes were subtracted back then
      control_add :deleted, -1
    else
      control_add :bytes, -meta[:size]
      meta[:dtime] = stamp
    end

    # and finally update the store's mtime
    @dbs[:control]['mtime'] = [stamp.to_i].pack('N')

    meta
  end
end
389
+
390
# Flag an object's metadata as "deleted" while keeping the record.
#
# The stored digests, size and type always win over what the object
# supplies; other fields take the object's value when it has one.
#
# @param obj [Store::Digest::Object] the object to flag
# @return [Hash, nil] the updated metadata; nil if the object is not
#   on record or is already flagged
def mark_meta_deleted obj
  @lmdb.transaction do
    # the object has to be on record to delete it
    before = get_meta(obj)
    next unless before
    # nothing to do if it is already "deleted"
    next if before[:dtime]

    pkey  = before[:digests][primary].digest
    stamp = Time.now

    after = before.merge(obj.to_h) do |field, was, is|
      case field
      when :digests, :size, :type
        # stored digests are guaranteed complete, the new size is not
        # trusted, and the type gets set by default
        was
      when :dtime then stamp # what we came here to do
      else is || was
      end
    end

    @dbs[primary][pkey] = deflate(after)
    control_add :deleted, 1
    control_add :bytes, -after[:size]

    # bring the per-field indexes in line
    RECORD.each do |field|
      was = before[field]
      index_rm field, was, pkey if was and was != after[field]
      index_add field, after[field], pkey # will noop on nil
    end

    # and finally update the store's mtime
    @dbs[:control]['mtime'] = [stamp.to_i].pack('N')

    after
  end
end
430
+
431
# Collect store-wide statistics.
#
# @return [Hash] the control records (:ctime and :mtime as Time,
#   :objects, :deleted and :bytes as Integer) plus, under :types,
#   :languages, :charsets and :encodings, a hash mapping each distinct
#   index key to the number of records stored under it
def meta_get_stats
  @lmdb.transaction do
    control = @dbs[:control]
    stats   = {}

    %i[ctime mtime objects deleted bytes].each do |name|
      stats[name] = control[name.to_s].unpack1(PACK[name])
    end

    # the timestamps are stored as epoch seconds
    %i[ctime mtime].each { |name| stats[name] = Time.at(stats[name]) }

    # tally every distinct value of the countable fields
    %i[type language charset encoding].each do |dim|
      db     = @dbs[dim]
      counts = {}
      db.keys.each { |key| counts[key] = db.cardinality(key) }
      stats[:"#{dim}s"] = counts
    end

    # would love to do min/max size/dates/etc but that is going to
    # take some lower-level cursor finessing

    stats
  end
end
452
+
453
+ public
454
+
455
# Run the given block inside a transaction on the LMDB environment.
# The block is called without arguments.
#
# @yield the work to perform inside the transaction
# @return whatever the environment's +transaction+ returns
def transaction &block
  @lmdb.transaction { block.call }
end
460
+
461
# Return the set of algorithms initialized in the database. The
# result is cached once a value has been read.
# @return [Array<Symbol>, nil] the algorithms, or nil if none are
#   recorded yet
def algorithms
  return @algorithms if @algorithms

  @algorithms = @lmdb.transaction do
    raw = @dbs[:control]['algorithms']
    # stored as a comma-separated list
    raw.strip.split(/\s*,+\s*/).map(&:to_sym) if raw
  end
end
471
+
472
# Return the primary digest algorithm. The result is cached once a
# value has been read.
# @return [Symbol, nil] the primary algorithm, or nil if none is
#   recorded yet
def primary
  return @primary if @primary

  @primary = @lmdb.transaction do
    raw = @dbs[:control]['primary']
    raw.strip.to_sym if raw
  end
end
481
+
482
# Return the number of objects in the database.
# @return [Integer, nil] the count, or nil if the counter is not set
def objects
  @lmdb.transaction do
    raw = @dbs[:control]['objects']
    # 64-bit unsigned network-endian integer
    raw.unpack1('Q>') if raw
  end
end
491
+
492
# Return the number of objects whose payloads are deleted but which
# are still on record.
# @return [Integer, nil] the count, or nil if the counter is not set
def deleted
  @lmdb.transaction do
    raw = @dbs[:control]['deleted']
    raw.unpack1('Q>') if raw
  end
end
502
+
503
# Return the number of bytes stored in the database (notwithstanding
# the database itself).
# @return [Integer, nil] the byte count, or nil if the counter is not
#   set
def bytes
  @lmdb.transaction do
    raw = @dbs[:control]['bytes']
    raw.unpack1('Q>') if raw
  end
end
513
+
514
# The filter parameters understood by #list.
PARAMS = %i[type charset encoding language
            size ctime mtime ptime dtime].freeze

# Return a list of objects matching the given criteria. The result
# set will be the intersection of all supplied parameters. `:type`,
# `:charset`, `:encoding`, and `:language` are treated like discrete
# sets, while the rest of the parameters are treated like ranges
# (two-element arrays). Single values will be coerced into arrays;
# single range values will be interpreted as an inclusive lower
# bound. To bound only at the top, use a two-element array with its
# first value `nil`, like so: `size: [nil, 31337]`.
#
# The smallest of the indexes involved is scanned, and the remaining
# criteria are applied to the objects found that way. With no criteria
# at all, every object in the store is returned.
#
# @note `sort` is accepted but not applied yet; the result comes back
#   in the order the objects were found.
#
# @param type [nil, String, #to_a]
# @param charset [nil, String, #to_a]
# @param encoding [nil, String, #to_a]
# @param language [nil, String, #to_a]
# @param size [nil, Integer, #to_a] byte size range
# @param ctime [nil, Time, DateTime, #to_a] creation time range
# @param mtime [nil, Time, DateTime, #to_a] modification time range
# @param ptime [nil, Time, DateTime, #to_a] metadata property change range
# @param dtime [nil, Time, DateTime, #to_a] deletion time range
# @param sort [nil, Symbol, #to_a] sorting criteria (currently unused)
# @return [Array] the list
def list type: nil, charset: nil, encoding: nil, language: nil,
         size: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil, sort: nil
  # coerce all the inputs into arrays, dropping the empty ones
  scope  = binding
  params = PARAMS.each_with_object({}) do |key, acc|
    val = scope.local_variable_get key
    val = case val
          when nil then []
          when Time then [val]
          when DateTime then [val.to_time]
          when ->(v) { v.respond_to? :to_a } then val.to_a
          else [val]
          end
    acc[key] = val unless val.empty?
  end

  # scan whichever of the requested indexes is smallest
  index = params.keys.min_by { |k| @dbs[k].size }
  out   = {}

  # resolve a raw primary digest to an object, once per digest
  fetch = lambda do |digest|
    u = URI("ni:///#{primary};")
    u.digest = digest
    out[u] ||= get u
  end

  # does an object's value satisfy the criterion for this parameter?
  matches = lambda do |param, val|
    next false unless val
    next params[param].include?(val) unless INTS[param]

    lo, hi = params[param]
    if lo && hi then val >= lo && val <= hi
    elsif lo then val >= lo
    elsif hi then val <= hi
    else false
    end
  end

  @lmdb.transaction do
    if index
      if INTS[index]
        index_get(index, *params[index], range: true) { |_, v| fetch.call v }
      else
        params[index].each do |val|
          index_get(index, val) { |_, v| fetch.call v }
        end
      end

      # whittle the candidates down by the remaining criteria
      rest = params.keys - [index]
      unless rest.empty?
        out.select! do |_, obj|
          rest.all? { |param| matches.call param, obj.send(param) }
        end
      end
    else
      # if we aren't filtering at all we can just obtain everything
      @dbs[primary].cursor do |c|
        while rec = c.next
          fetch.call rec.first
        end
      end
    end
  end

  out.values
end
620
+
621
+ end