store-digest 0.3.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/store/digest.rb CHANGED
@@ -1,56 +1,262 @@
1
1
  require 'store/digest/version'
2
2
  require 'store/digest/driver'
3
- require 'store/digest/object'
4
-
3
+ require 'store/digest/entry'
4
+
5
+ # This is a general-purpose content-addressable store that interfaces
6
+ # via [RFC6920](https://datatracker.ietf.org/doc/html/rfc6920) addresses.
7
+ #
8
+ # Since a content-addressable store traffics in immutable blobs of
9
+ # bytes, the main interface is remarkably terse:
10
+ #
11
+ # * {#add} a blob-like object or existing {Store::Digest::Entry},
12
+ # * {#get} an entry from the store (if it exists), if you know one of
13
+ # its hash URIs,
14
+ # * or, {#remove} it.
15
+ #
16
+ # {Store::Digest} scans and stores multiple digest algorithms at once,
17
+ # since clients may only have a hash for a blob in a particular
18
+ # algorithm, and individual algorithms may get compromised from time
19
+ # to time. The set of algorithms is configurable, and fixed for each
20
+ # store instance when it is created.
21
+ #
22
+ # The currency of {Store::Digest}, then, is the {URI::NI} and the
23
+ # {Store::Digest::Entry}. There is also {Store::Digest::ReadWrapper},
24
+ # a small helper class capable of coercing non-IO-like objects
25
+ # (particularly those which one might find in a {Rack} message body)
26
+ # into something that behaves enough like an {IO} blob that it can be
27
+ # scanned. {Store::Digest::Entry} objects also masquerade as blobs
28
+ # with additional metadata.
29
+ #
5
30
  class Store::Digest
6
31
  private
7
32
 
8
- def coerce_object obj, type: nil, charset: nil,
9
- language: nil, encoding: nil, mtime: nil, strict: true
10
- obj = case obj
11
- when Store::Digest::Object
12
- obj.dup
13
- when URI::NI
14
- # just return the uri
15
- Store::Digest::Object.new digests: obj,
16
- type: type, charset: charset, language: language,
17
- encoding: encoding, mtime: mtime
18
- when IO, String, StringIO,
19
- -> x { %i[seek pos read].all? { |m| x.respond_to? m } }
20
- # assume this is going to be scanned later
21
- Store::Digest::Object.new obj,
22
- type: type, charset: charset, language: language,
23
- encoding: encoding, mtime: mtime
24
- when Pathname
25
- # actually open pathnames that are handed directly into S::D
26
- Store::Digest::Object.new obj.expand_path.open('rb'),
27
- type: type, charset: charset, language: language,
28
- encoding: encoding, mtime: mtime
29
- else
30
- raise ArgumentError,
31
- "Can't coerce a #{obj.class} to Store::Digest::Object"
32
- end
33
+ # Squeeze a digest URI (or several) out of the input, if possible.
34
+ #
35
+ # @param obj [URI::NI, Array<URI::NI>, Hash{Symbol=>URI::NI},
36
+ # Store::Digest::Entry] the thing to get URIs from
37
+ # @param select [false, true] whether to pick the "best" URI from a
38
+ # set or hash thereof
39
+ #
40
+ # @raise [ArgumentError] if the URIs can't be coerced
41
+ #
42
+ # @return [URI::NI, Array<URI::NI>]
43
+ #
44
+ def coerce_uri obj, select: true
45
+ if obj.is_a? Store::Digest::Entry
46
+ digests = obj.digests
47
+
48
+ # this shouldn't happen but you never know
49
+ raise ArgumentError, 'Digest list is empty' if digests.empty?
50
+ else
51
+ obj = obj[:digests] if obj.is_a? Hash and obj.key? :digests
52
+ # this can also raise if it fails to coerce
53
+ digests = Store::Digest::Entry.coerce_digests obj, normative: true
54
+ end
55
+
56
+ # we should have a hash at this point
57
+ return digests.values unless select
58
+
59
+ # if we have this then return it
60
+ return digests[primary] if digests.key? primary
61
+
62
+ # grab this
63
+ lengths = URI::NI.lengths
64
+
65
+ # just pick the longest one i guess
66
+ digests.slice(*lengths.keys).values.sort do |a, b|
67
+ lengths[b.algorithm] <=> lengths[a.algorithm]
68
+ end.first
69
+ end
70
+
71
+ # From a metadata hash, determine if the entry is cache.
72
+ #
73
+ # @param meta [Hash] the metadata hash from the store
74
+ #
75
+ # @return [false, true]
76
+ #
77
+ def cache? meta
78
+ (meta[:flags] & Store::Digest::Entry::IS_CACHE).nonzero?
79
+ end
33
80
 
34
- # overwrite the user-mutable metadata
35
- b = binding
36
- %i[type charset language encoding mtime].each do |field|
37
- begin
38
- if x = b.local_variable_get(field)
39
- obj.send "#{field}=", x
81
+ # From a metadata hash, determine if the entry should be deleted.
82
+ #
83
+ # @param meta [Hash] the metadata hash from the store
84
+ #
85
+ # @return [false, true]
86
+ #
87
+ def deleted? meta
88
+ return false unless dtime = meta[:dtime]
89
+ cache?(meta) && dtime <= Time.now
90
+ end
91
+
92
+ # From an RFC6920 URI, get a raw hash
93
+ #
94
+ # @param uri [URI::NI] a digest URI
95
+ # @param tombstone [false, true] whether to return deleted metadata
96
+ # records
97
+ # @param remove [false, true, :forget] whether to remove (and
98
+ # forget) the record
99
+ #
100
+ # @return [Hash] the raw entry data
101
+ #
102
+ def get_raw uri, tombstone: false, remove: false
103
+ uri = coerce_uri uri
104
+
105
+ if remove
106
+ # this is how we pun
107
+ mm = remove == :forget ? :remove_meta : :mark_meta_deleted
108
+ bm = :remove_blob
109
+ else
110
+ mm = :get_meta
111
+ bm = :get_blob
112
+ end
113
+
114
+ transaction readonly: !remove do
115
+ # warn "#{remove} #{mm} #{bm}"
116
+ if meta = send(mm, uri)
117
+ if blob = send(bm, meta[:digests][primary].digest)
118
+ meta.merge content: blob
119
+ elsif tombstone
120
+ meta
40
121
  end
41
- rescue RuntimeError => e
42
- raise e if strict
43
122
  end
44
123
  end
124
+ end
125
+
126
+ # The difference between this and {#add} is that this takes a raw
127
+ # blob, eagerly scans it, and returns a `Hash`, whereas {#add}
128
+ # returns a {Store::Digest::Entry} object which can optionally scan
129
+ # lazily.
130
+ #
131
+ def add_raw content, **params
132
+ # slice out the subset
133
+ params = params.slice :type, :charset, :language, :encoding, :mtime, :cache
134
+ # this will automatically coerce nil to application/octet-stream
135
+ params[:type] = MimeMagic[params[:type]]
136
+ # add a modification time if missing
137
+ now = Time.now(in: ?Z)
138
+ mtime = params[:mtime] ||= now
139
+ flags = params[:flags] ||= Store::Digest::Entry::Flags.from(0)
140
+ if cache = params[:cache]
141
+ flags.cache = true
142
+ params[:dtime] ||= now + cache_ttl
143
+ end
144
+
145
+ transaction do
146
+
147
+ # warn @lmdb.active_txn.inspect
148
+
149
+ # managed temporary file handle
150
+ tmp = temp_blob
151
+
152
+ # get the basic scannable values (digests, size, type)
153
+ scanned = Entry.scan_raw(
154
+ content, algorithms: algorithms,
155
+ blocksize: blocksize, type: true) { |buf| tmp << buf }
156
+
157
+ # remove the scanned type if it is less specific than supplied
158
+ # scanned.delete(:type) if params[:type] &&
159
+ # params[:type].descendant_of?(scanned[:type])
160
+ scanned.delete(:type) if params[:type] &&
161
+ !scanned[:type].descendant_of?(params[:type])
162
+
163
+ # now merge the scanned params into the supplied ones
164
+ params.merge! scanned
45
165
 
46
- obj
166
+ # warn "asserted: #{params[:type]} -> scanned: #{scanned[:type]} #{scanned[:type].descendant_of?(params[:type])}"
167
+
168
+ # warn params.inspect
169
+
170
+ # replace the content with the settled blob
171
+ content = settle_blob params[:digests][primary].digest, tmp, mtime: mtime
172
+
173
+ # `set_meta` returns nil if unchanged
174
+ meta = set_meta params
175
+
176
+ # warn meta.inspect
177
+
178
+ # return the hash with the content
179
+ meta.merge(content: content)
180
+ end
181
+ end
182
+
183
+ # okay so:
184
+ #
185
+ # * the scanning nominally comes from the entry (class method)
186
+ # * hashes
187
+ # * size (bytes)
188
+ # * content-type (sampled)
189
+ # * the temp blob comes from the store
190
+ # * so does the settled blob (which could also be the temp blob)
191
+ # * everything else comes from the user (whether from params or entry)
192
+ #
193
+ # issues:
194
+ #
195
+ # * the store doesn't trust the entry to do the scanning so it has
196
+ # to do its own scan
197
+ # * (therefore make the actual scanning a class method)
198
+ # * however an entry that has an internal reference to the store
199
+ # should delegate scanning to it
200
+ # * the entry could just run `Store#add` that returns a fresh
201
+ # entry and shuck it for its contents and then throw it away
202
+ # * although Store::Digest::Entry deliberately obscures its
203
+ # contents so no that's no good
204
+
205
+ # * we don't want a turducken of entry objects; we want the raw file
206
+ # handle (or rather the lambda that returns a handle) and a wad of
207
+ # metadata
208
+
209
+ # * so i think `#add_raw` is the right idea but the question is what
210
+ # is its interface
211
+ # * the blob to be scanned
212
+ # * all known metadata
213
+ # * it should return the blob to use (or blob-returning
214
+ # lambda/closure/whatever) and whatever metadata comes out of
215
+ # scanning (hashes, size, content type)
216
+ # * content type and encoding may be different
217
+ # * ctime and ptime may be different from expected
218
+ # * dtime may be different
219
+ # * flags may be different (eg cache flag cleared)
220
+ # * actually fuck it just give back the equivalent of `Entry#to_h`
221
+ # (which has a `content:` key)
222
+ #
223
+ def add_raw2
224
+ transaction do
225
+ tmp = tmp_blob
226
+
227
+ Entry.scan_raw2(content, tmp, algorithms: algorithms, type: true) do
228
+
229
+ content = settle_blob digests[primary].digest, tmp, mtime: mtime
230
+
231
+ # `set_meta` returns nil if unchanged
232
+ meta = set_meta(params, preserve: preserve) || params
233
+
234
+ hash
235
+ end
236
+ end
47
237
  end
48
238
 
49
239
  public
50
240
 
51
- # Initialize a storage
52
- def initialize **options
53
- driver = options.delete(:driver) || Store::Digest::Driver::LMDB
241
+ # Initialize a content-addressable store.
242
+ #
243
+ # @note See individual drivers for driver-specific options.
244
+ #
245
+ # @see Store::Digest::Driver::LMDB
246
+ #
247
+ # @param driver [Module, Symbol, #to_sym] the driver to use
248
+ # @param blocksize [Integer] the default block size for scanning blobs
249
+ # @param mtimes [:preserve, :older, :newer] modification time overwrite policy
250
+ #
251
+ # @return [void]
252
+ #
253
+ def initialize driver: Store::Digest::Driver::LMDB,
254
+ blocksize: 2**16, mtimes: :preserve, ttl: 60 * 60 * 24, **options
255
+ driver ||= Store::Digest::Driver::LMDB
256
+
257
+ @blocksize = blocksize
258
+ @mtimes = mtimes || :preserve
259
+ @cache_ttl = ttl
54
260
 
55
261
  unless driver.is_a? Module
56
262
  # coerce to symbol
@@ -65,12 +271,17 @@ class Store::Digest
65
271
  "Driver #{driver} is not a Store::Digest::Driver" unless
66
272
  driver.ancestors.include? Store::Digest::Driver
67
273
 
274
+ # bolt the driver onto the instance
68
275
  extend driver
69
276
 
70
- #
277
+ # aaaand bootstrap it
71
278
  setup(**options)
279
+
280
+ # warn @lmdb.info
72
281
  end
73
282
 
283
+ attr_reader :blocksize, :mtimes, :cache_ttl
284
+
74
285
  # XXX this is not right; leave it for now
75
286
  # def to_s
76
287
  # '<%s:0x%016x objects=%d deleted=%d bytes=%d>' %
@@ -79,114 +290,106 @@ class Store::Digest
79
290
 
80
291
  # alias_method :inspect, :to_s
81
292
 
82
- # Add an object to the store. Takes pretty much anything that makes
293
+ # Add an object to the store. Will accept pretty much anything that makes
83
294
  # sense to throw at it.
84
295
  #
85
- # @note Prefabricated {Store::Digest::Object} instances will be
86
- # rescanned.
296
+ # @note Already-scanned {Store::Digest::Entry} instances will have
297
+ # to be rescanned, since the store can't trust the digests. Use
298
+ # {#add} or {Store::Digest::Entry#add_to} on an unscanned entry to
299
+ # scan only once.
87
300
  #
88
301
  # @note `:preserve` will cause a noop if object metadata is identical
89
302
  # save for `:ctime` and `:mtime` (`:ctime` is always ignored).
90
303
  #
91
- # @param obj [IO,File,Pathname,String,Store::Digest::Object] the object
304
+ # @param obj [IO,File,Pathname,String,Store::Digest::Entry] the object
92
305
  # @param type [String] the content type
93
306
  # @param charset [String] the character set, if applicable
94
307
  # @param language [String] the language, if applicable
95
308
  # @param encoding [String] the encoding (eg compression) if applicable
96
309
  # @param mtime [Time] the modification time, if not "now"
97
- # @param strict [true, false] strict checking on metadata input
98
- # @param preserve [false, true] preserve existing modification time
310
+ # @param cache [false, true, Numeric, Time] whether the object should be
311
+ # treated as cache, and/or when to evict it
312
+ # @param scan [false, true] eagerly scan the contents
99
313
  #
100
- # @return [Store::Digest::Object] The (potentially pre-existing) entry
314
+ # @return [Store::Digest::Entry] The (potentially pre-existing) entry
101
315
  #
102
- def add obj, type: nil, charset: nil, language: nil, encoding: nil,
103
- mtime: nil, strict: true, preserve: false
104
- return unless obj
105
-
106
- transaction do # |txn|
107
- obj = coerce_object obj, type: type, charset: charset,
108
- language: language, encoding: encoding, mtime: mtime, strict: strict
109
- raise ArgumentError, 'We need something to store!' unless obj.content?
110
-
111
- # this method is helicoptered in
112
- tmp = temp_blob
113
-
114
- # XXX this is stupid; figure out a better way to do this
316
+ def add obj, digests: nil, mtime: nil, type: nil, charset: nil,
317
+ encoding: nil, language: nil, cache: false, scan: false
115
318
 
116
- # get our digests
117
- obj.scan(digests: algorithms, blocksize: 2**20, strict: strict,
118
- type: type, charset: charset, language: language,
119
- encoding: encoding, mtime: mtime) do |buf|
120
- tmp << buf
121
- end
122
-
123
- # if we are scanning an object it is necessarily not deleted
124
- obj.dtime = nil
319
+ # warn "hmmmm #{obj.inspect}"
125
320
 
126
- # set_meta will return nil if there is no difference in what is set
127
- if h = set_meta(obj, preserve: preserve)
128
- # warn h.inspect
129
- # replace the object
321
+ # XXX this circumvents the integrity check
322
+ return obj.add(self) if obj.is_a? Store::Digest::Entry
130
323
 
131
- content = obj.content
324
+ raise ArgumentError, 'entry can\'t be nil' if obj.nil?
132
325
 
133
- # do this to prevent too many open files
134
- if content.is_a? File
135
- path = Pathname(content.path).expand_path
136
- content = -> { path.open('rb') }
137
- end
326
+ # turducken-ass call graph lol
327
+ Store::Digest::Entry.new obj, store: self, digests: digests, mtime: mtime,
328
+ type: type, charset: charset, encoding: encoding, language: language,
329
+ cache: cache, scan: scan
330
+ end
138
331
 
139
- obj = Store::Digest::Object.new content, fresh: true, **h
332
+ # Returns true if the entry is in the store.
333
+ #
334
+ # @param entry [URI::NI, Store::Digest::Entry] the hash address of
335
+ # an entry, or an entry object itself
336
+ # @param tombstone [false, true] whether to return "tombstone"
337
+ # metadata records of deleted entries
338
+ #
339
+ # @return [false, true] whether the entry (or its tombstone) is
340
+ # present in the store
341
+ #
342
+ def has? entry, tombstone: false
343
+ # coerce just because
344
+ tombstone = !!tombstone
140
345
 
141
- # now settle the blob into storage
142
- settle_blob obj[primary].digest, tmp, mtime: obj.mtime
346
+ transaction readonly: true do
347
+ # obviously false if there's no record
348
+ if h = get_meta(entry)
349
+ # a metadata record is considered a tombstone if it has a dtime
350
+ # at all if it's an ordinary entry, and in the past if it's cache
351
+ tombstone || !deleted?(h)
143
352
  else
144
- tmp.close
145
- tmp.unlink
146
-
147
- # warn "got here lolol"
148
-
149
- # eh just do this
150
- obj = get obj
151
- obj.fresh = false # object is not fresh since we already have it
353
+ false
152
354
  end
153
-
154
- obj
155
355
  end
156
356
  end
157
357
 
158
- # Retrieve an object from the store.
358
+ # Retrieve an entry from the store.
159
359
  #
160
- # @param obj [URI, Store::Digest::Object]
360
+ # @note I'm not sure why you would want to `#get` an entry that you
361
+ # already had, but you can.
161
362
  #
162
- # @return [Store::Digest::Object, nil]
163
- def get obj
164
- transaction readonly: true do
165
- obj = coerce_object obj
166
- if h = get_meta(obj) # bail if this does not exist
167
- b = get_blob h[:digests][primary].digest # may be nil
168
- Store::Digest::Object.new b, **h
169
- end
363
+ # @param obj [URI::NI, Array<URI::NI>, Hash{Symbol=>URI::NI},
364
+ # Store::Digest::Entry] some means of resolving an entry
365
+ #
366
+ # @return [Store::Digest::Entry, nil]
367
+ #
368
+ def get obj, tombstone: false
369
+ uri = coerce_uri obj
370
+
371
+ if hash = get_raw(uri, tombstone: tombstone)
372
+ Store::Digest::Entry.new(store: self) { hash }
170
373
  end
171
374
  end
172
375
 
173
376
  # Remove an object from the store, optionally "forgetting" it ever existed.
174
- # @param obj
175
- def remove obj, forget: false
176
- obj = coerce_object obj
177
- unless obj.scanned?
178
- raise ArgumentError,
179
- 'Cannot scan object because there is no content' unless obj.content?
180
- obj.scan digests: algorithms, blocksize: 2**20
181
- end
377
+ #
378
+ # @param obj [URI::NI, Store::Digest::Entry] the hash address of
379
+ # an entry, or an entry object itself
380
+ # @param tombstone [false, true] whether to return "tombstone"
381
+ # metadata records of deleted entries
382
+ # @param forget [false, true] whether to delete the metadata or just
383
+ # mark it as deleted
384
+ #
385
+ # @return [Store::Digest::Entry, nil]
386
+ #
387
+ def remove obj, tombstone: false, forget: false
388
+ uri = coerce_uri obj
389
+ rm = forget ? :forget : true
182
390
 
183
- # remove or mark metadata entry as deleted and remove blob
184
- transaction do
185
- if meta = forget ? remove_meta(obj) : mark_meta_deleted(obj)
186
- if blob = remove_blob(meta[:digests][primary].digest)
187
- Store::Digest::Object.new blob, **meta
188
- end
189
- end
391
+ if hash = get_raw(uri, tombstone: tombstone, remove: rm)
392
+ Store::Digest::Entry.new { hash }
190
393
  end
191
394
  end
192
395
 
@@ -197,11 +400,26 @@ class Store::Digest
197
400
  remove obj, forget: true
198
401
  end
199
402
 
403
+ def close
404
+ close_internal
405
+ end
406
+
407
+ # Determine if the store is cache-aware.
408
+ #
409
+ # @return [false, true]
410
+ #
411
+ def can_cache?
412
+ respond_to? :cache_ttl
413
+ end
414
+
200
415
  # Return statistics on the store
201
416
  def stats
202
417
  Stats.new(**meta_get_stats)
203
418
  end
204
419
 
420
+ # This class represents a set of rudimentary statistics for the
421
+ # contents of the store.
422
+ #
205
423
  class Stats
206
424
  private
207
425
 
data/store-digest.gemspec CHANGED
@@ -27,15 +27,14 @@ Gem::Specification.new do |spec|
27
27
  spec.required_ruby_version = '>= 3.0'
28
28
 
29
29
  # dev/test dependencies
30
- spec.add_development_dependency 'bundler', '>= 2.1'
31
- spec.add_development_dependency 'rake', '>= 13.0'
32
- spec.add_development_dependency 'rspec', '>= 3.9'
30
+ spec.add_development_dependency 'bundler', '~> 2', '>= 2.6'
31
+ spec.add_development_dependency 'rake', '~> 13'
32
+ spec.add_development_dependency 'rspec', '~> 3', '>= 3.9'
33
33
 
34
34
  # stuff we use
35
35
  spec.add_runtime_dependency 'base64', '~> 0.3' # stop it complaining
36
36
  spec.add_runtime_dependency 'base32', '~> 0.3', '>= 0.3.2'
37
- spec.add_runtime_dependency 'lmdb', '~> 0.7', '>= 0.7.1' # my hacks
38
- # spec.add_runtime_dependency 'mimemagic', '>= 0.4.3', '< 0.5'
39
- spec.add_runtime_dependency 'mimemagic', '>= 0.4.3'
40
- spec.add_runtime_dependency 'uri-ni', '>= 0.1.4'
37
+ spec.add_runtime_dependency 'lmdb', '~> 0.8', '>= 0.8.1' # my hacks
38
+ spec.add_runtime_dependency 'mimemagic-dorian', '~> 0.5', '>= 0.5.7' # reluctantly my hacks
39
+ spec.add_runtime_dependency 'uri-ni', '~> 0.2', '>= 0.2.6' # mine
41
40
  end