store-digest 0.3.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1214 @@
1
+ require 'store/digest/readwrapper'
2
+ require 'store/digest/error'
3
+
4
+ require 'uri'
5
+ require 'uri/ni'
6
+ require 'mimemagic-dorian'
7
+
8
+ # This class represents an entry in the content-addressable store.
9
+ #
10
+ # An entry can be initialized with:
11
+ #
12
+ # * a `String` (or anything that can `#to_s`)
13
+ # * an `Array` of strings (or anything that can `#each`)
14
+ # * a `Pathname` (as long as it refers to a file that can be opened for reading)
15
+ # * an `IO` object (as long as it's finite, such as a `File`, but
16
+ # it's your problem to ensure that it is)
17
+ # * anything that can `#read` (same deal on the finitude)
18
+ # * and two kinds of `#call`s:
19
+ # * zero arity, which is expected to return something that quacks like
20
+ # a file handle,
21
+ # * nonzero arity, where the first argument is expected to be
22
+ # something that [behaves like a write
23
+ # handle](https://github.com/rack/rack/blob/main/SPEC.rdoc#streaming-body).
24
+ #
25
+ # This behaviour is so {Store::Digest::Entry} instances can be dropped
26
+ # into `Rack` request and response bodies and replace/consume whatever
27
+ # was in there before. As such, this class implements {#each},
28
+ # {#gets}, {#read}, {#rewind}, and {#close} to emulate an `Enumerable`
29
+ # and/or `IO` handle.
30
+ #
31
+ # Content is scanned lazily (i.e., not until you invoke any of the
32
+ # accessors or the {#scan}/{#scan!} or {#add_to} methods) unless you
33
+ # pass `scan: true` to the constructor. These objects are not
34
+ # associated with a store by default. You must {#initialize} with a
35
+ # reference to `store:`, {#add_to} a store later, or use
36
+ # {Store::Digest#add}, which returns one of these objects.
37
+ #
38
+ # If you initialize one of these objects with one or more hashes, it
39
+ # is assumed that it has already been scanned and the hashes are
40
+ # representative. If, however, you force a {#scan!}, it _will_ raise
41
+ # an error if the supplied hashes don't match.
42
+ #
43
+ class Store::Digest::Entry
44
+
45
+ # This is a struct for the bank of flags, with a couple of extra
46
+ # methods for parsing
47
+ #
48
+ Flags = Struct.new('Flags', :type_checked, :type_valid, :charset_checked,
49
+ :charset_valid, :encoding_checked, :encoding_valid,
50
+ :syntax_checked, :syntax_valid, :cache) do
51
+
52
+ class << self
53
+ # Initialize a struct of flags from arbitrary input
54
+ #
55
+ # @param arg [Store::Digest::Entry::Flags, Integer, #to_h, #to_a]
56
+ #
57
+ # @return [Store::Digest::Entry::Flags]
58
+ #
59
+ def from arg
60
+ # get the length since we use it in a few places
61
+ len = self.members.size
62
+
63
+ if arg.is_a? Integer
64
+ tmp = arg.digits(2).first(len)
65
+ elsif arg.is_a? self
66
+ # noop
67
+ return arg
68
+ elsif arg.is_a? Hash
69
+ tmp = arg.slice(*self.members).transform_values do |v|
70
+ !!(v && v != 0)
71
+ end
72
+ return self.[](**tmp)
73
+ elsif arg.respond_to? :to_a
74
+ tmp = arg.to_a.first(len)
75
+ else
76
+ raise ArgumentError, 'Input must be an integer or array'
77
+ end
78
+
79
+ # append these
80
+ tmp += [false] * (len - tmp.size) if tmp.size < len
81
+
82
+ # make sure these are true/false
83
+ tmp.map! { |b| !!(b && b != 0) }
84
+
85
+ # we do this because `new` doesn't do this
86
+ self.[](*tmp)
87
+ end
88
+
89
+ # Turn an arbitrary {Array} back into an {Integer}.
90
+ #
91
+ # @param array [Array]
92
+ #
93
+ # @return [Integer]
94
+ #
95
+ def to_i array
96
+ array.to_a.reverse.reduce(0) { |acc, b| (acc << 1) | (b ? 1 : 0) }
97
+ end
98
+ end
99
+
100
+ def &(int)
101
+ to_i & int.to_i
102
+ end
103
+
104
+ def |(int)
105
+ to_i | int.to_i
106
+ end
107
+
108
+ # wish there was a cleaner way to do derive individual instance
109
+ # methods from class methods
110
+ begin
111
+ cm = singleton_method :to_i
112
+ define_method(:to_i) { cm.call to_a }
113
+ end
114
+ end
115
+
116
+ # flag constants
117
+ TYPE_CHECKED = 1 << 0
118
+ TYPE_VALID = 1 << 1
119
+ CHARSET_CHECKED = 1 << 2
120
+ CHARSET_VALID = 1 << 3
121
+ ENCODING_CHECKED = 1 << 4
122
+ ENCODING_VALID = 1 << 5
123
+ SYNTAX_CHECKED = 1 << 6
124
+ SYNTAX_VALID = 1 << 7
125
+ IS_CACHE = 1 << 8
126
+
127
+ private
128
+
129
+ SAMPLE = 2**13 # must be big enough to detect ooxml
130
+ BLOCKSIZE = 2**16
131
+
132
+ CHARSETS = [
133
+ %w[utf8 utf-8],
134
+ %w[iso8859-1 iso-8859-1],
135
+ ].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze
136
+
137
+ ENCODINGS = [
138
+ %w[x-compress compress],
139
+ %w[x-gzip gzip],
140
+ ].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze
141
+
142
+ TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'
143
+
144
+ # { key: [pattern, normalizer] } - assumes stripped and downcased
145
+ TOKENS = {
146
+ type: [/^(#{TOKEN}(?:\/#{TOKEN})?)$/on, -> c { MimeMagic[c] }],
147
+ charset: [/^(#{TOKEN})$/on,
148
+ -> c { c = c.tr(?_, ?-).downcase; CHARSETS.fetch c, c } ],
149
+ encoding: [/^(#{TOKEN})$/on,
150
+ -> c { c = c.tr(?_, ?-).downcase; ENCODINGS.fetch c, c } ],
151
+ language: [/^([a-z]{2,3}(?:[-_][0-9a-z]+)*)$/,
152
+ -> c { c.downcase.tr(?_, ?-).gsub(/-*$/, '') } ],
153
+ }
154
+
155
+ LABELS = {
156
+ size: 'Size (Bytes)',
157
+ ctime: 'Added to Store',
158
+ mtime: 'Last Modified',
159
+ ptime: 'Properties Modified',
160
+ dtime: 'Deleted (Expires)',
161
+ type: 'Content Type',
162
+ language: '(Natural) Language',
163
+ charset: 'Character Set',
164
+ encoding: 'Content Encoding',
165
+ }.freeze
166
+
167
+ MANDATORY = %i[size ctime mtime ptime]
168
+ OPTIONAL = %i[dtime type language charset encoding]
169
+ FLAG = %i[content-type charset content-encoding syntax].freeze
170
+ STATE = %i[unverified invalid recheck valid].freeze
171
+
172
+ def coerce_nn_int i
173
+ case i
174
+ when nil then 0
175
+ when Numeric
176
+ raise ArgumentError, 'size must be non-negative' if i < 0
177
+ i.to_i
178
+ else
179
+ raise TypeError, 'size must be nil or Numeric'
180
+ end
181
+ end
182
+
183
+ #
184
+ def coerce_time t, k = nil
185
+ case t
186
+ when nil then nil
187
+ when Time then t
188
+ when -> dt { dt.respond_to? :to_time }
189
+ t.to_time
190
+ when Integer
191
+ raise ArgumentError,
192
+ "#{k} given as Integer must be non-negative" if t < 0
193
+ Time.at t
194
+ else
195
+ raise TypeError, "Invalid type for #{k}: #{t.class}"
196
+ end
197
+ end
198
+
199
+ def coerce_token t, k
200
+ t = t.to_s.strip.downcase
201
+ pat, norm = TOKENS[k]
202
+ raise "#{k} #{t} does not match #{pat}" unless m = pat.match(t)
203
+ norm.call m.captures.first
204
+ end
205
+
206
+ def coerce_digests digests, empty: false, normative: nil
207
+ # we just sneak in the instance's algorithms
208
+ self.class.coerce_digests digests, algorithms: algorithms,
209
+ empty: empty, normative: normative
210
+ end
211
+
212
+ CACHE_TTL = 86400
213
+
214
+ def compute_cache cache
215
+ return unless cache
216
+ if cache.is_a? Numeric
217
+ # cache dtime should be relative to metadata parameter change time
218
+ @ptime + cache
219
+ elsif cache.is_a? Time
220
+ cache
221
+ elsif cache.respond_to? :to_time
222
+ cache.to_time
223
+ else
224
+ (@store ? @store.cache_ttl : CACHE_TTL)
225
+ end
226
+ end
227
+
228
+ # Returns metadata without calling the accessors and triggering a
229
+ # scan.
230
+ #
231
+ # @return [Hash] the current set of metadata
232
+ #
233
+ def meta_hash content: false, store: false
234
+ keys = %i[digests size ctime mtime ptime dtime
235
+ flags type charset encoding language]
236
+ keys.unshift :store if store && @store
237
+ keys.unshift :content if content && @content
238
+
239
+ keys.each_with_object({}) do |k, h|
240
+ v = "@#{k}"
241
+ h[k] = instance_variable_get(v) if instance_variable_defined?(v)
242
+ end
243
+ end
244
+
245
+ # Merge a metadata hash into the object.
246
+ #
247
+ # @param hash [Hash{Symbol=>Object}]
248
+ #
249
+ # @raise [Store::Digest::Error::Integrity]
250
+ #
251
+ # @return [void]
252
+ #
253
+ def merge_meta hash, content: false
254
+ # do itt
255
+ @content = hash[:content] if content and hash[:content]
256
+
257
+ # check the byte size
258
+ if hash[:size]
259
+ s = coerce_nn_int hash[:size]
260
+ raise Store::Digest::Error::Integrity,
261
+ "Scanned size #{s} does not match asserted #{@size}" if
262
+ @size and s != @size
263
+ @size = s
264
+ end
265
+
266
+ # check the digests
267
+ if hash[:digests]
268
+ digests = coerce_digests(hash[:digests], normative: true)
269
+ (@digests.keys & digests.keys).each do |k|
270
+ scanned = digests[k]
271
+ asserted = @digests[k]
272
+ raise Store::Digest::Error::CryptographicIntegrity,
273
+ "Scanned digest #{scanned} does not match asserted #{asserted}" if
274
+ scanned != asserted
275
+ end
276
+
277
+ # make sure wee also do the algorithms for parity
278
+ @digests = digests.transform_values(&:freeze).freeze
279
+ @algorithms = digests.keys.to_set.freeze
280
+ @scanned = true
281
+ end
282
+
283
+ # only update the type if it's more specific than the asserted one
284
+ if hash[:type]
285
+ t = coerce_token hash[:type], :type
286
+
287
+ # warn "#{@type.inspect} -> #{t.inspect}"
288
+
289
+ @type = (t.canonical || t) unless @type and !t.descendant_of?(@type)
290
+ # @type = (t.canonical || t) if !@type || t.descendant_of?(@type)
291
+ end
292
+
293
+ %i[charset encoding language].each do |key|
294
+ val = coerce_token(hash[key], key).freeze if hash[key]
295
+ # note the distinction
296
+ instance_variable_set("@#{key}", val) if hash.key? key
297
+ end
298
+
299
+ # mtime is special
300
+ if hash[:mtime]
301
+ # XXX TODO preserve older newer
302
+ @mtime = coerce_time hash[:mtime], :mtime
303
+ end
304
+
305
+ %i[ctime ptime dtime].each do |key|
306
+ val = coerce_time(hash[key], key).freeze if hash[key]
307
+ # again note the distinction
308
+ instance_variable_set("@#{key}", val) if hash.key? key
309
+ end
310
+
311
+ # finally we do the flags
312
+ @flags = Flags.from(hash[:flags]) if hash[:flags]
313
+
314
+ nil
315
+ end
316
+
317
+ # this is to switch the content over (replace a callable with its result)
318
+ #
319
+ def dereference?
320
+ @content = @content.call if @content.respond_to? :call
321
+ end
322
+
323
+ def seekable? io
324
+ return false unless io.respond_to? :seek
325
+ begin
326
+ # this should be a noop
327
+ io.seek 0, IO::SEEK_CUR
328
+ true
329
+ rescue Errno::ESPIPE, Errno::EINVAL
330
+ false
331
+ end
332
+ end
333
+
334
+ public
335
+
336
+ # Create a new object, naively recording whatever it is handed.
337
+ #
338
+ # @note use {.scan} or {#scan} to populate the digests.
339
+ #
340
+ # @param content [IO, String, Proc, File, Pathname, ...] some content
341
+ # @param store [Store::Digest] the associated store, if present
342
+ # @param digests [Hash] the digests ascribed to the content
343
+ # @param type [String] assert the object's MIME type
344
+ # @param charset [String] the character set, if applicable
345
+ # @param language [String] the (RFC5646) language tag, if applicable
346
+ # @param encoding [String] the content-encoding (e.g. compression)
347
+ # @param mtime [Time] assert object modification time
348
+ # @param flags [Integer, Flags] validation state flags
349
+ # @param strict [true, false] raise an error on bad input
350
+ #
351
+ # @return [Store::Digest::Entry] the object in question
352
+ #
353
+ def initialize content = nil, store: nil, digests: nil, mtime: nil,
354
+ type: nil, charset: nil, encoding: nil, language: nil, flags: 0,
355
+ cache: false, strict: false, scan: false, &block
356
+
357
+ # set the associated store, if one is passed in
358
+ if store
359
+ raise 'Store must be an instance of Store::Digest' unless
360
+ store.is_a? Store::Digest
361
+ @store = store
362
+ end
363
+
364
+ now = Time.now
365
+
366
+ # this sets the empty digest hash and the scanning state to false
367
+ self.content = content if content
368
+
369
+ # we do this little ballet because `content=` may set mtime and type
370
+ @mtime = mtime || @mtime || now
371
+ type ||= @type || MimeMagic[nil]
372
+
373
+ # the following can be strings or symbols:
374
+ b = binding
375
+ TOKENS.keys.each do |k|
376
+ if x = b.local_variable_get(k)
377
+ x = if strict
378
+ coerce_token(x, k)
379
+ else
380
+ coerce_token(x, k) rescue nil
381
+ end
382
+ instance_variable_set "@#{k}", x.freeze if x
383
+ end
384
+ end
385
+
386
+ # warn "wtf #{@type.inspect}"
387
+
388
+ # we let the empty through
389
+ digests = coerce_digests digests, empty: true
390
+ if digests.is_a? Hash
391
+ @digests = digests
392
+ @algorithms = digests.empty? ? algorithms : digests.keys.to_set
393
+ @scanned = !digests.empty?
394
+ elsif !digests.empty?
395
+ @algorithms = digests.to_set
396
+ end
397
+
398
+ # we use this for `#get`
399
+ if block
400
+ hash = block.call @content
401
+
402
+ raise TypeError,
403
+ "Block return value must be Hash, not #{hash.class}" unless
404
+ hash.is_a? Hash
405
+ #
406
+ @scanned = true if hash[:digests]
407
+ merge_meta hash, content: true
408
+ elsif @content.nil?
409
+ raise ArgumentError,
410
+ 'Must initialize with either content, or a block, or both'
411
+ end
412
+
413
+ # just make sure the times are in
414
+ @ctime ||= now
415
+ @mtime ||= mtime || @ctime
416
+ @ptime ||= @ctime
417
+
418
+ # set the flags
419
+ @flags ||= Flags.from(flags || 0)
420
+ if cache
421
+ raise NotImplementedError, 'Associated store does not support caching' if
422
+ @store and !@store.can_cache?
423
+ @flags.cache = !!cache
424
+ @dtime = compute_cache cache
425
+ end
426
+
427
+ # scan preemptively if so directed
428
+ scan! if scan
429
+ end
430
+
431
+ attr_reader :store, :type, :charset, :language, :encoding,
432
+ :ctime, :mtime, :ptime, :dtime, :flags
433
+
434
+ TOKENS.keys.each do |key|
435
+ define_method("#{key}=") { |val| coerce_token val, key }
436
+ end
437
+
438
+ # This will take an array or hash or individual symbol or string or
439
+ # {URI::NI} object and try to coerce it into something it can use.
440
+ #
441
+ # * Individual strings/symbols/{URI::NI} objects will get wrapped in
442
+ # an array.
443
+ # * Strings will be scanned for conformance to RFC6920 and
444
+ # transformed into {URI::NI} objects if they match, otherwise they
445
+ # will be turned into symbols and matched against the repertoire
446
+ # of hash algorithms.
447
+ # * If a {URI::NI} object isn't valid (e.g., not the full length,
448
+ # algorithm not supported), this will raise an error; likewise if
449
+ # the symbol is not in the repertoire of algorithms.
450
+ # * Arrays must contain all the same kind of thing (strings,
451
+ # symbols, {URI::NI} objects)
452
+ # * Hash keys must coerce to symbols (via `#to_s`, `#to_sym`) that
453
+ # match the repertoire of algorithms.
454
+ # * Hash values must either be a string representing the decimal,
455
+ # base64, or hexadecimal digest of a length corresponding to the
456
+ # algorithm in the key, or a string representing an RFC6920 URI,
457
+ # or a {URI::NI}.
458
+ # * (Base64 strings may be padded or not, and use the standard
459
+ # non-URL-safe representation, or not)
460
+ # * Strings will then subsequently be transformed into {URI::NI}
461
+ # objects.
462
+ # * Hash values that are (either already or coerced into) {URI::NI}
463
+ # objects must be valid and their algorithms must match the hash
464
+ # key with which they are associated.
465
+ #
466
+ # The input (and thus the output) has two "moods":
467
+ #
468
+ # 1. _Anticipative_: "These are the digest algorithms we want to see
469
+ # hashes for."
470
+ # 2. _Normative_: "These are the hashes we already have for the
471
+ # input, and it should match them when scanned."
472
+ #
473
+ # In general inputs that coerce to arrays (except arrays whose
474
+ # contents coerce to {URI::NI} objects, which in turn will coerce to
475
+ # hashes) are considered anticipative, whereas inputs that coerce to
476
+ # hashes are considered normative. The return value will depend on
477
+ # the adjudicated intent: `Array` for anticipative, `Hash` for
478
+ # normative. The caller should inspect the return value to see which
479
+ # it is, because the difference is whether a subsequent scan of the
480
+ # content is intended to verify it (normative) or not (anticipative).
481
+ #
482
+ # @param digests [#to_sym, #to_s, URI::NI,
483
+ # #to_a<#to_sym,#to_s,URI::NI>, #to_h{#to_sym=>#to_s},
484
+ # #to_h{#to_sym=>URI::NI}] the thing to be coerced into digests
485
+ # @param empty [false, true] whether the set is allowed to be empty
486
+ # @param normative [nil, false, true] whether to assert the
487
+ # normative mood (`true`), the anticipative mood (`false`), or
488
+ # leave it to the caller (`nil`)
489
+ #
490
+ # @return [Array<Symbol>,Hash{Symbol=>URI::NI}]
491
+ #
492
+ def self.coerce_digests digests, algorithms: nil, empty: false, normative: nil
493
+ algorithms ||= URI::NI.algorithms
494
+
495
+ # handle nil
496
+ digests = [] if digests.nil?
497
+
498
+ # first we coerce into an array; note hashes respond to `#to_a`
499
+ digests = [digests] unless digests.respond_to? :to_a
500
+
501
+ raise ArgumentError,
502
+ 'Digest list can\'t be empty' if !empty and digests.empty?
503
+
504
+ if digests.is_a? Hash
505
+ # digests = digests[:digests] if digests.key? :digests
506
+ out = digests.map do |k, v|
507
+ # keys must go to symbols; symbols must be valid
508
+ k = k.to_s.downcase.to_sym unless k.is_a? Symbol
509
+ raise ArgumentError,
510
+ "#{k} is not a supported algorithm in this configuration" unless
511
+ algorithms.include? k
512
+
513
+ # this should raise on any invalid values
514
+ v = URI::NI.ingest k, v
515
+
516
+ # then we assert that the result itself is valid
517
+ raise ArgumentError, "Hash URI #{v} is invalid" unless v.valid?
518
+
519
+ [k, v]
520
+ end.to_h
521
+
522
+ # warn out
523
+
524
+ # note we are explicitly looking to see if normative is false
525
+ # rather than nil
526
+ return normative == false ? out.keys : out
527
+ end
528
+
529
+ # otherwise it should be an array so we'll make it into a set
530
+ digests = digests.to_a.map do |thing|
531
+ case thing
532
+ when Symbol then thing
533
+ when URI then URI::NI.ingest thing
534
+ else
535
+ # whatever it is, it should now be a string
536
+ thing = thing.to_s
537
+ if %r{^(?i:ni|https?)://}.match?(thing) and uri = URI::NI.ingest(thing)
538
+ uri
539
+ else
540
+ # turn it into a symbol
541
+ thing.strip.downcase.to_sym
542
+ end
543
+ end
544
+ end.uniq
545
+
546
+ # warn digests.inspect
547
+
548
+ if digests.all? { |d| d.is_a? URI::NI }
549
+ # we are expressly asking for anticipative if normative is literally false
550
+ return digests.map(&:algorithm) if normative == false
551
+
552
+ # otherwise if these are all digest URIs then this is normative;
553
+ # return as a hash
554
+ return digests.map do |d|
555
+ raise ArgumentError,
556
+ "#{d} is not a supported algorithm" unless
557
+ algorithms.include? d.algorithm
558
+
559
+ [d.algorithm.to_sym, d]
560
+ end.to_h
561
+ elsif digests.all? { |d| d.is_a? Symbol }
562
+ raise ArgumentError, 'Normative expressly normative' if normative
563
+
564
+ return digests
565
+ end
566
+
567
+ # if we get here, it's an error
568
+ raise ArgumentError,
569
+ 'Input must coerce to either all URIs or all Symbols'
570
+ end
571
+
572
+ # Scan a blob and return the digests and byte count.
573
+ #
574
+ # @note The `content` is assumed to be at position zero.
575
+ #
576
+ # @param content [#read] the object to be scanned
577
+ # @param algorithms [Array<Symbol,#to_sym>] the algorithms
578
+ # @param blocksize [Integer] the block size to use
579
+ # @param type [false, true] scan content for media type
580
+ #
581
+ # @yieldparam [String] a chunk of input
582
+ #
583
+ # @raise [ArgumentError] the content can't be coerced to
584
+ # something that quacks like `#read`
585
+ # @raise [ArgumentError] the algorithms supplied aren't supported
586
+ #
587
+ # @return [Array(Hash{Symbol=>URI::NI}, Integer)] a pair containing
588
+ # a hash of the digests and the size in bytes of the blob.
589
+ #
590
+ def self.scan_raw content, algorithms: URI::NI.algorithms,
591
+ blocksize: BLOCKSIZE, type: false, &block
592
+ # this will raise if it can't be coerced
593
+ content = Store::Digest::ReadWrapper.coerce content
594
+
595
+ # coerce digests
596
+
597
+ digests = begin
598
+ case algorithms
599
+ when Array, -> x { x.respond_to? :to_a }
600
+ algorithms.to_a.map(&:to_sym)
601
+ when Symbol, -> x { x.respond_to? :to_sym }
602
+ [algorithms.to_sym]
603
+ else
604
+ raise ArgumentError
605
+ end
606
+ rescue ArgumentError, TypeError, NoMethodError
607
+ raise ArgumentError,
608
+ "Digest algorithms must be coercible to an Array of Symbols"
609
+ end
610
+
611
+ # oh this shouldn't be empty btw
612
+ raise ArgumentError, 'Algorithm list should not be empty' if digests.empty?
613
+
614
+ # double-check if the digests are supported
615
+ raise ArgumentError,
616
+ "Unsupported digest algorithm(s) #{digests - URI::NI.algorithms}" unless
617
+ (digests - URI::NI.algorithms).empty?
618
+
619
+ # now queue up the contexts
620
+ digests = digests.map { |d| [d, URI::NI.context(d)] }.to_h
621
+
622
+ # we'll just make a uniform sequence to cycle through, why not
623
+ procs = digests.values.map { |u| -> buf { u << buf } }
624
+ procs << block if block
625
+
626
+ if type
627
+ sample = StringIO.new
628
+ procs << -> buf do
629
+ sample << buf
630
+ # take this out of the loop if we have enough
631
+ procs.pop if sample.pos >= SAMPLE
632
+ end
633
+ end
634
+
635
+ bytes = 0
636
+ while buf = content.read(blocksize)
637
+ buf = buf.to_s.b # ensure these are bytes we're reading
638
+ bytes += buf.size
639
+ procs.each { |b| b.call buf }
640
+ end
641
+
642
+ # apparently i do this because i painted myself into a corner with
643
+ # URI::NI and/or past me previously discovered that there is much
644
+ # more to the hash state than just the digest itself and forgot to
645
+ # tell later-past me when i discovered it a second time around
646
+ digests = digests.map do |k, v|
647
+ [k, URI::NI.compute(v, algorithm: k).freeze]
648
+ end.to_h
649
+
650
+ # return the gathered information; everything else is out of band
651
+ out = { digests: digests, size: bytes }
652
+
653
+ if sample
654
+ # felt cute lol
655
+ out[:type] = %i[by_magic default_type].lazy.filter_map do |m|
656
+ sample.rewind
657
+ MimeMagic.send m, sample
658
+ end.first
659
+ end
660
+
661
+ out
662
+ end
663
+
664
+ # Add this entry to a {Store::Digest} instance.
665
+ #
666
+ # @note This entry will become associated with the store if it isn't
667
+ # already. If this entry has already been scanned, it will be
668
+ # scanned again.
669
+ #
670
+ def add store = nil
671
+ raise ArgumentError,
672
+ 'no store associated with the entry and none passed in' if
673
+ [store, @store].all?(&:nil?)
674
+
675
+ # use the internal store if one is not supplied
676
+ store ||= @store
677
+ raise TypeError, 'Argument must be an instance of Store::Digest' unless
678
+ store.is_a? Store::Digest
679
+
680
+ # do this if not scanned
681
+
682
+ unless scanned? && store.has?(self)
683
+ # ok add the thing
684
+ hash = store.send :add_raw, @content, **meta_hash
685
+ merge_meta hash, content: true
686
+ end
687
+
688
+ # set the internal store if one is supplied and not present; do
689
+ # this after because calling store.has? will cause the record to
690
+ # be scanned, potentially against the very store, so it would be
691
+ # scanned by the same store twice.
692
+ @store ||= store
693
+
694
+ self
695
+ end
696
+
697
+ # Remove this entry from a store. Dissociates the entry from the
698
+ # store in the process. Will not signal if the entry wasn't in the
699
+ # store to begin with.
700
+ #
701
+ # @param store [nil, Store::Digest] the store to remove the entry
702
+ # @param forget [false, true] whether to purge the entry completely
703
+ # from the metadata or just delete the blob
704
+ #
705
+ def remove store = nil, forget: false
706
+ raise ArgumentError,
707
+ 'no store associated with the entry and none passed in' if
708
+ [store, @store].all?(&:nil?)
709
+ store ||= @store
710
+
711
+ raise TypeError, 'store must be a Store::Digest instance' unless
712
+ store.is_a? Store::Digest
713
+
714
+ # eliminate the relationship
715
+ @store = nil if @store.equal? store
716
+
717
+ rm = forget ? :forget : true
718
+ # this circumvents `private`; ignore return value
719
+ store.send :get_raw, digests[store.primary], remove: rm
720
+
721
+ self
722
+ end
723
+
724
+ # Preemptively scan a blob and return an entry.
725
+ #
726
+ # @param content [String, Pathname, IO, #each, #read, #call]
727
+ # anything that represents bytes or can be coerced or wrapped by
728
+ # {Store::Digest::ReadWrapper}
729
+ #
730
+ # @param store [Store::Digest]
731
+ # @param digests [Array<Symbol,#to_sym,URI::NI>, Hash{Symbol=>URI::NI}]
732
+ #
733
+ # @return [Store::Digest::Entry]
734
+ #
735
+ def self.scan content, store: nil, digests: URI::NI.algorithms, mtime: nil,
736
+ type: nil, language: nil, charset: nil, encoding: nil,
737
+ blocksize: BLOCKSIZE, &block
738
+ self.new content, store: store, digests: digests, mtime: mtime,
739
+ type: type, language: language, charset: charset, encoding: encoding,
740
+ scan: blocksize, &block
741
+ end
742
+
743
+ # Scan the blob if it hasn't already been scanned (idempotent).
744
+ #
745
+ # @return [self]
746
+ #
747
+ def scan
748
+ scan! if @content && !scanned?
749
+ self
750
+ end
751
+
752
+ STRINGIO_MAX = 2**16
753
+
754
+ # Scan the blob unconditionally, checking it against any asserted
755
+ # byte size and digests (a size that was never asserted is skipped).
756
+ #
757
+ # @raise [Store::Digest::Error::Deleted] if the entry has no content
758
+ # @raise [Store::Digest::Error::Integrity] on a byte size mismatch
759
+ # @raise [Store::Digest::Error::CryptographicIntegrity] on a digest mismatch
760
+ # @return [self]
761
+ def scan!
762
+   raise Store::Digest::Error::Deleted, 'Entry has no content' unless @content
763
+
764
+   if @store
765
+     # we use the store if one is associated
766
+     hash = @store.send :add_raw, @content, **meta_hash
767
+
768
+     @content = hash[:content]
769
+   elsif @content.respond_to? :rewind and seekable?(@content)
770
+     # we don't need a temporary file; we'll just reuse this file handle
771
+     @content.rewind
772
+     hash = self.class.scan_raw @content, algorithms: @algorithms, type: true
773
+     @content.rewind
774
+   else
775
+     # start with a stringio
776
+     tmp = StringIO.new
777
+     lam = -> buf do
778
+       tmp << buf
779
+
780
+       # check if it's too big
781
+       if tmp.size >= STRINGIO_MAX
782
+         # make an actual file
783
+         file = Tempfile.create anonymous: true, binmode: true
784
+
785
+         # put the string into it
786
+         tmp.rewind
787
+         file << tmp.read
788
+
789
+         # reassign tmp with the file
790
+         tmp = file
791
+
792
+         # reassign lam with this condition removed so we don't
793
+         # needlessly test it over and over with every iteration
794
+         lam = -> buf { file << buf }
795
+       end
796
+     end
797
+
798
+     # now we wrap lam in another block so it picks up the reassignment
799
+     hash = self.class.scan_raw(
800
+       @content, algorithms: @algorithms, type: true) { |buf| lam.call buf }
801
+     tmp.rewind
802
+     @content = tmp
803
+   end
804
+
805
+   # integrity check against whatever was asserted before this scan
806
+   if @scanned
807
+     # size: only comparable if one was recorded (the constructor takes none)
808
+     raise Store::Digest::Error::Integrity,
809
+       "Scanned size #{hash[:size]} does not match asserted #{@size}" if
810
+       @size and hash[:size] != @size
811
+
812
+     # digests: compare only the algorithms present on both sides
813
+     (@digests.keys & hash[:digests].keys).each do |k|
814
+       scanned = hash[:digests][k]
815
+       asserted = @digests[k]
816
+       raise Store::Digest::Error::CryptographicIntegrity,
817
+         "Scanned digest #{scanned} does not match asserted #{asserted}" if
818
+         scanned != asserted
819
+     end
820
+     # XXX also do content type??
821
+   end
822
+
823
+   merge_meta hash
824
+
825
+   # unconditionally set this now
826
+   @scanned = true
827
+
828
+   self
829
+ end
830
+
831
+ # Returns true if the entry has already been scanned.
832
+ #
833
+ # @return [false, true]
834
+ #
835
+ def scanned?
836
+ !!@scanned
837
+ end
838
+
839
+ # Iterate over the blob contents.
840
+ #
841
+ # @yieldparam chunk [String] the chunk of blob
842
+ #
843
+ # @return [self]
844
+ #
845
+ def each sep = $/, limit = nil, chomp: false, &block
846
+ scan
847
+ dereference?
848
+ @content.each(sep, limit, chomp: chomp, &block)
849
+ end
850
+
851
+ # Emulate {IO#read}.
852
+ #
853
+ # @param length [Integer] the number of bytes to read
854
+ #
855
+ # @return [String, nil] up to `length` bytes or `nil` on EOF
856
+ #
857
+ def read length = nil, buffer = nil
858
+ scan
859
+ dereference?
860
+ # this should be set by scan
861
+ @content.read length, buffer
862
+ end
863
+
864
+ # Emulate {IO#gets}.
865
+ #
866
+ # @return [String] the next character
867
+ #
868
+ def gets sep = $/, chomp = false
869
+ scan
870
+ dereference?
871
+ @content.gets sep, chomp
872
+ end
873
+
874
+ def seek offset, whence = IO::SEEK_SET
875
+ scan
876
+ dereference?
877
+ @content.seek offset, whence
878
+ end
879
+
880
+ def pos
881
+ scan
882
+ dereference?
883
+ @content.pos
884
+ end
885
+
886
+ alias_method :tell, :pos
887
+
888
+ def pos= position
889
+ scan
890
+ dereference?
891
+ @content.pos = position
892
+ end
893
+
894
+ # Emulate {IO#rewind}.
895
+ #
896
+ # @return [0] always zero
897
+ #
898
+ def rewind
899
+ scan
900
+ dereference?
901
+
902
+ # content should be rewindable after a scan
903
+ @content.rewind
904
+ end
905
+
906
+ # No-op of {IO#open} for parity.
907
+ #
908
+ # @note Once the blob is scanned, an internal file handle is opened
909
+ # and stays open.
910
+ #
911
+ # @return [self]
912
+ #
913
+ def open *args
914
+ rewind
915
+ self
916
+ end
917
+
918
+ # No-op of {IO#close}.
919
+ #
920
+ # @return [self]
921
+ #
922
+ def close
923
+ rewind
924
+ self
925
+ end
926
+
927
+ # Determine (if possible) if the object is in the store. Returns
928
+ # `nil` if no store is associated with the entry, otherwise it will
929
+ # query the store.
930
+ #
931
+ # @return [nil, false, true] the status of the entry
932
+ #
933
+ def stored?
934
+ # warn @digests
935
+ scan
936
+ # warn scanned?
937
+ @store.has?(digests) if @store
938
+ end
939
+
940
+ # Return the algorithms used in the object.
941
+ #
942
+ # @return [Array]
943
+ #
944
+ def algorithms
945
+ @algorithms ||= (@store || URI::NI).algorithms.to_set
946
+ end
947
+
948
+ # Get the digest hash.
949
+ #
950
+ # @return [Hash] the digests
951
+ #
952
+ def digests
953
+ scan
954
+ @digests
955
+ end
956
+
957
+ # Get the byte size.
958
+ #
959
+ # @return [Integer] the bytes
960
+ #
961
+ def size
962
+ scan
963
+ @size
964
+ end
965
+
966
+ # Return a particular digest. Returns nil if there is no match.
967
+ #
968
+ # @param symbol [Symbol, #to_s, #to_sym] the digest
969
+ #
970
+ # @return [URI::NI, nil]
971
+ #
972
+ def digest symbol
973
+ raise ArgumentError, "This method takes a symbol" unless
974
+ symbol.respond_to? :to_sym
975
+ digests[symbol.to_sym]
976
+ end
977
+
978
+ alias_method :"[]", :digest
979
+
980
+ # Returns the content stored in the object.
981
+ #
982
+ # @note This is a vestigial method since {Store::Digest::Entry}
983
+ # now proxies {IO} calls.
984
+ #
985
+ # @return [self, nil] no-op if there is content, nil if not.
986
+ #
987
+ def content
988
+ self if @content
989
+ end
990
+
991
+ # Determines if there is content embedded in the object.
992
+ #
993
+ # @return [false, true]
994
+ #
995
+ def content?
996
+ !!@content
997
+ end
998
+
999
# Replace the content and discard any previously computed digests, so
# the entry counts as unscanned again.
#
# When the coerced content exposes a `path`, the type is reassigned
# from `MimeMagic.by_path`. The modification time comes from the
# content's `stat` when it has one, otherwise from the current time.
#
# @param content [IO, String, Proc, File, Pathname, ...] some content
#
# @return [void]
#
def content= content
  @digests = {}
  @scanned = false
  @content = Store::Digest::ReadWrapper.coerce content, thunk: true

  # a single lookup and no diagnostic output: assigning content must
  # not write to standard error
  if @content.respond_to?(:path) && (path = @content.path)
    @type = MimeMagic.by_path path
  end

  @mtime = @content.respond_to?(:stat) ? @content.stat.mtime : Time.now(in: ?Z)
end
1015
+
1016
# Build the media type, followed by a `charset` parameter when a
# character set is known, in a form suitable for an HTTP header.
#
# @return [String]
#
def type_charset
  header = type.to_s
  return header unless charset

  "#{header};charset=#{charset}"
end
1025
+
1026
# Report whether the object has been scanned, which is judged by
# whether any digests are present.
#
# @return [false, true]
#
def scanned?
  @digests.any?
end
1033
+
1034
def flags= val
  # let Flags.from turn whatever was supplied into the stored flags
  @flags = Flags.from(val)
end
1037
+
1038
# Report whether the object is flagged as cache.
#
# @return [false, true]
#
def cache?
  @flags.cache ? true : false
end
1045
+
1046
# Set the cache flag, normalising the argument to a strict boolean.
#
# @param value [false, true] anything falsy/truthy
#
# @return [void]
#
def cache= value
  @flags.cache = (value ? true : false)
end
1055
+
1056
+ # XXX i'm keeping these as-is for now
1057
+
1058
# Report the `type_checked` flag, i.e. whether the content type has
# been checked.
#
# @return [false, true]
#
def type_checked?
  @flags.type_checked
end
1065
+
1066
# Report the `type_valid` flag, but only once the content type has
# actually been checked.
#
# @return [nil, false, true] `nil` while the type is unchecked
#
def type_valid?
  @flags.type_valid if @flags.type_checked
end
1074
+
1075
# Report the `charset_checked` flag, i.e. whether the character set
# has been checked.
#
# @return [false, true]
#
def charset_checked?
  @flags.charset_checked
end
1082
+
1083
# Report the `charset_valid` flag, but only once the character set
# has actually been checked.
#
# @return [nil, false, true] `nil` while the charset is unchecked
#
def charset_valid?
  @flags.charset_valid if @flags.charset_checked
end
1091
+
1092
# Report the `encoding_checked` flag, i.e. whether the content
# encoding (e.g. gzip, deflate) has been checked.
#
# @return [false, true]
#
def encoding_checked?
  @flags.encoding_checked
end
1100
+
1101
# Report the `encoding_valid` flag, but only once the content
# encoding has actually been checked.
#
# @return [nil, false, true] `nil` while the encoding is unchecked
#
def encoding_valid?
  @flags.encoding_valid if @flags.encoding_checked
end
1109
+
1110
# Report the `syntax_checked` flag, i.e. whether the blob's syntax
# has been checked.
#
# @return [false, true]
#
def syntax_checked?
  @flags.syntax_checked
end
1117
+
1118
# Report the `syntax_valid` flag, but only once the blob's syntax has
# actually been checked.
#
# @return [nil, false, true] `nil` while the syntax is unchecked
#
def syntax_valid?
  @flags.syntax_valid if @flags.syntax_checked
end
1126
+
1127
# Generate the writers for the four timestamp fields. Each one passes
# the value through `coerce_time` together with the field name and
# stores the frozen result in the instance variable of the same name.
%i[ctime mtime ptime dtime].each do |field|
  define_method(:"#{field}=") do |value|
    instance_variable_set(:"@#{field}", coerce_time(value, field).freeze)
  end
end
1132
+
1133
# Generate, for each token-valued field, a writer and an `_ok?`
# predicate.
%i[type charset encoding language].each do |field|
  # the writer passes the value through `coerce_token` together with
  # the field name and stores the frozen result
  define_method(:"#{field}=") do |value|
    instance_variable_set(:"@#{field}", coerce_token(value, field).freeze)
  end

  # the predicate matches a candidate against the first element of the
  # field's TOKENS entry (presumably a Regexp -- confirm against TOKENS)
  define_method(:"#{field}_ok?") do |candidate|
    TOKENS[field].first.match?(candidate)
  end
end
1142
+
1143
# An entry is stale when it is flagged as cache and its expiry time
# (`dtime`) lies in the past.
#
# @return [nil, false, true] `nil` when the entry is cache but has no
#  expiry time
#
def stale?
  return false unless cache?

  @dtime && @dtime < Time.now
end
1149
+
1150
# Predicate telling whether the blob has been deleted from the store
# (but implicitly the metadata record remains). That is the case when
# the entry is stale, or when it has a `dtime` and is not cache.
#
# @return [nil, false, true] truthy when deleted; `nil` when there is
#  no `dtime` to go by
#
def deleted?
  stale? || (@dtime && !cache?)
end
1159
+
1160
# Convert the object to a hash of its digests, its mandatory and
# optional fields and its flags. Every value is a `dup` of what the
# corresponding reader returns. The content is left out by default.
#
# @param content [false, true] include the content if true
# @return [Hash] the object as a hash
#
def to_h content: false
  leading = content ? %i[content digests] : %i[digests]
  fields  = leading + MANDATORY + OPTIONAL + [:flags]

  fields.to_h { |field| [field, send(field).dup] }
end
1172
+
1173
+ # Outputs a human-readable string representation of the object.
1174
+ #
1175
+ # @return [String] said representation
1176
+ #
1177
+ def to_s
1178
+ out = "#{self.class}\n Digests:\n"
1179
+
1180
+ # disgorge the digests
1181
+ digests.values.sort { |a, b| a.to_s <=> b.to_s }.each do |d|
1182
+ out << " #{d}\n"
1183
+ end
1184
+
1185
+ # now the fields
1186
+ MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
1187
+ OPTIONAL.each do |o|
1188
+ val = send o
1189
+ out << " #{LABELS[o]}: #{val}\n" if val
1190
+ end
1191
+
1192
+ # now the validation statuses
1193
+ out << "Validation:\n"
1194
+ FLAG.each_index do |i|
1195
+ x = flags.to_i >> (3 - i) & 3
1196
+ out << (" %-16s: %s\n" % [FLAG[i], STATE[x]])
1197
+ end
1198
+
1199
+ out
1200
+ end
1201
+
1202
+ def inspect
1203
+ text = if scanned?
1204
+ ds = digests.values.map(&:to_s).sort.join ', '
1205
+ "size=#{size} type=#{type} (#{})"
1206
+ else
1207
+ "(not scanned)"
1208
+ end
1209
+
1210
+ "<#{self.class} #{text}>"
1211
+ end
1212
+ end
1213
+
1214
+ Store::Digest::Object = Store::Digest::Entry