store-digest 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,497 @@
1
+ require 'store/digest/version'
2
+
3
+ require 'uri'
4
+ require 'uri/ni'
5
+ require 'mimemagic'
6
+ require 'mimemagic/overlay'
7
+
8
# Compatibility shims for MimeMagic. Each class-level helper below is
# only installed when MimeMagic itself does not already define it.
class MimeMagic
  # XXX erase this when these methods get added

  # These helpers are class (singleton) methods. method_defined? only
  # inspects instance methods and so can never see them; looking at the
  # singleton methods declared directly on this class also avoids false
  # positives from unrelated methods inherited from Module or Object.
  unless singleton_methods(false).include? :parents
    # Return the immediate parents of a type as recorded in TYPES.
    #
    # @param type [String] the type identifier to look up
    # @return [Array<MimeMagic>] de-duplicated parent types (empty when
    #   the type is unknown)
    def self.parents type
      TYPES.fetch(type, [nil, []])[1].map { |t| new t }.uniq
    end
  end

  unless singleton_methods(false).include? :ancestor_types
    # Collect the ancestors of every parent of the given type.
    #
    # NOTE(review): this calls a class-level +ancestors+ with one
    # argument, which is not defined in this file -- confirm it is
    # provided elsewhere and does not resolve to Module#ancestors.
    #
    # @param type [String] the type identifier to look up
    # @return [Array] flattened, de-duplicated ancestors
    def self.ancestor_types type
      parents(type).map { |t| ancestors(t) }.flatten.uniq
    end
  end

  unless singleton_methods(false).include? :binary?
    # Guess whether something is binary by looking for control codes in
    # (at most) its first 1024 bytes/characters.
    #
    # @param thing [#read, #to_s] a seekable stream or anything that
    #   can be turned into a string
    # @return [true, false] true for empty input or when a control code
    #   other than ordinary whitespace is found
    # @raise [ArgumentError] if the argument cannot be sampled
    def self.binary? thing
      sample =
        if %i[tell seek read].all? { |m| thing.respond_to? m }
          # sample from the start of the stream, then put the position
          # back where the caller left it
          pos = thing.tell
          thing.seek 0, 0
          chunk = thing.read 1024
          thing.seek pos
          chunk
        elsif thing.respond_to? :to_s
          thing.to_s[0, 1024]
        else
          raise ArgumentError,
            "Cannot sample an instance of #{thing.class}"
        end

      # consider this to be 'binary' if empty
      return true if sample.nil? || sample.empty?

      # control codes minus ordinary whitespace
      /[\x0-\x8\xe-\x1f\x7f]/n.match?(sample)
    end
  end

  unless singleton_methods(false).include? :default_type
    # Fallback type for content no magic matched: a generic binary type
    # when the sample looks binary, plain text otherwise.
    #
    # @param thing [#read, #to_s] the content (or a sample of it)
    # @return [MimeMagic]
    def self.default_type thing
      new(binary?(thing) ? 'application/octet-stream' : 'text/plain')
    end
  end
end
51
+
52
+ class Store::Digest::Object
53
+
54
+ private
55
+
56
# Threshold (in bytes) up to which scanned content keeps being copied
# into the type-detection sample; must be big enough to detect ooxml.
SAMPLE    = 2**13
# Default number of bytes requested per read while scanning.
BLOCKSIZE = 2**16

# Charset spellings mapped to their canonical names.
CHARSETS = [
  %w[utf8 utf-8],
  %w[iso8859-1 iso-8859-1],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# Content-encoding aliases mapped to their canonical names.
ENCODINGS = [
  %w[x-compress compress],
  %w[x-gzip gzip],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# Regexp source for a single header token: one or more characters that
# are not controls, space, separators or high-bit bytes.
TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'.freeze

# { key: [pattern, normalizer] } - assumes stripped and downcased
#
# The patterns are anchored with \A and \z rather than ^ and $ so that
# the *whole* input has to be a token; ^/$ match per line and would let
# multi-line input through as long as one line looks valid.
TOKENS = {
  type: [/\A(#{TOKEN}(?:\/#{TOKEN})?)\z/on, ->(c) { c.downcase }],
  charset: [/\A(#{TOKEN})\z/on,
    ->(c) { c = c.tr('_', '-').downcase; CHARSETS.fetch(c, c) }],
  encoding: [/\A(#{TOKEN})\z/on,
    ->(c) { c = c.tr('_', '-').downcase; ENCODINGS.fetch(c, c) }],
  language: [/\A([a-z]{2,3}(?:[-_][0-9a-z]+)*)\z/,
    ->(c) { c.downcase.tr('_', '-').gsub(/-*$/, '') }],
}.each_value(&:freeze).freeze
81
+
82
+ # flag constants
83
# One bit per flag, laid out as a (checked, valid) pair for each of
# type, charset, encoding and syntax, starting at the lowest bit.
TYPE_CHECKED     = 0b0000_0001
TYPE_VALID       = 0b0000_0010
CHARSET_CHECKED  = 0b0000_0100
CHARSET_VALID    = 0b0000_1000
ENCODING_CHECKED = 0b0001_0000
ENCODING_VALID   = 0b0010_0000
SYNTAX_CHECKED   = 0b0100_0000
SYNTAX_VALID     = 0b1000_0000
91
+
92
# Human-readable labels for the metadata fields, keyed by field name.
LABELS = {
  size:     'Size (Bytes)',
  ctime:    'Added to Store',
  mtime:    'Last Modified',
  ptime:    'Properties Modified',
  dtime:    'Deleted',
  type:     'Content Type',
  language: '(Natural) Language',
  charset:  'Character Set',
  encoding: 'Content Encoding',
}.freeze

# Field lists and display names; all frozen so that no caller can
# alter these shared constants in place.
MANDATORY = %i[size ctime mtime ptime].freeze
OPTIONAL  = %i[dtime type language charset encoding].freeze
FLAG      = %i[content-type charset content-encoding syntax].freeze
STATE     = %i[unverified invalid recheck valid].freeze
108
+
109
# Turn a timestamp-ish value into a Time.
#
# nil and Time pass through untouched, anything responding to
# +to_time+ is converted with it, and a non-negative Integer is passed
# to Time.at.
#
# @param t [nil, Time, #to_time, Integer] the value to coerce
# @param k [Symbol] the field name, used only in error messages
# @return [Time, nil]
# @raise [ArgumentError] on a negative Integer or an unsupported type
def coerce_time t, k
  return t if t.nil? || t.is_a?(Time)
  return t.to_time if t.respond_to? :to_time

  unless t.is_a? Integer
    raise ArgumentError, "Invalid type for #{k}: #{t.class}"
  end

  if t < 0
    raise ArgumentError, "#{k} given as Integer must be non-negative"
  end

  Time.at t
end
123
+
124
# Validate and normalize a token-valued property.
#
# The input is stringified, stripped and downcased, matched against
# the pattern registered for the field in TOKENS, and the first capture
# is then run through that field's normalizer.
#
# @param t [#to_s] the raw value
# @param k [Symbol] the field name, a key of TOKENS
# @return [String] the normalized token
# @raise [RuntimeError] if the value does not match the pattern
def coerce_token t, k
  candidate = t.to_s.strip.downcase
  pattern, normalize = TOKENS[k]

  matched = pattern.match candidate
  raise "#{k} #{candidate} does not match #{pattern}" if matched.nil?

  normalize.call matched[1]
end
130
+
131
+ public
132
+
133
+ # Create a new object, naively recording whatever is handed
134
+ #
135
+ # @note use {.scan} or {#scan} to populate
136
+ #
137
+ # @param content [IO, String, Proc, File, Pathname, ...] some content
138
+ # @param digests [Hash] the digests ascribed to the content
139
+ # @param size [Integer] assert the object's size
140
+ # @param type [String] assert the object's MIME type
141
+ # @param charset [String] the character set, if applicable
142
+ # @param language [String] the (RFC5646) language tag, if applicable
143
+ # @param encoding [String] the content-encoding (e.g. compression)
144
+ # @param ctime [Time] assert object creation time
145
+ # @param mtime [Time] assert object modification time
146
+ # @param ptime [Time] assert object metadata parameter modification time
147
+ # @param dtime [Time] assert object deletion time
148
+ # @param flags [Integer] validation state flags
149
+ # @param strict [true, false] raise an error on bad input
150
+ # @param fresh [true, false] assert "freshness" of object vis-a-vis the store
151
+ # @return [Store::Digest::Object] the object in question
152
def initialize content = nil, digests: {}, size: 0,
    type: 'application/octet-stream', charset: nil, language: nil,
    encoding: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil,
    flags: 0, strict: true, fresh: false

  # recorded first, before any of the validation below can raise
  @fresh = fresh ? true : false

  # content: nil, streams and procs are kept as given, strings get
  # wrapped in a StringIO, and a path is only opened on demand
  io_like = -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
  @content =
    case content
    when nil, IO, StringIO, Proc then content
    when String then StringIO.new content
    when Pathname then -> { content.expand_path.open('rb') }
    when io_like then content
    else
      raise ArgumentError,
        "Cannot accept content given as #{content.class}"
    end

  # digests: everything ends up as { algorithm => frozen URI::NI copy }
  @digests =
    case digests
    when nil then {}
    when URI::NI then { digests.algorithm => digests.dup.freeze }
    when Array
      # only accepts array of URI::NI
      digests.each_with_object({}) do |ni, acc|
        unless ni.is_a? URI::NI
          raise ArgumentError,
            "Digests given as array can only be URI::NI, not #{ni}"
        end
        acc[ni.algorithm] = ni.dup.freeze
      end
    when Hash
      # every key must be symbol-able and agree with its value
      digests.each_with_object({}) do |(name, ni), acc|
        unless name.respond_to? :to_sym
          raise ArgumentError, 'Digest keys must be symbol-able'
        end
        key = name.to_sym
        unless ni.is_a? URI::NI
          raise ArgumentError, 'Digest values must be URI::NI'
        end
        unless key == ni.algorithm
          raise ArgumentError, 'Digest key must match value algorithm'
        end
        acc[key] = ni.dup.freeze
      end
    else
      raise ArgumentError,
        "Cannot coerce digests given as #{digests.inspect}"
    end

  # timestamps: nil, Time, anything with #to_time, or a non-negative
  # Integer
  { ctime: ctime, mtime: mtime, ptime: ptime, dtime: dtime }.each do |name, value|
    instance_variable_set "@#{name}", coerce_time(value, name)
  end

  # size and flags: non-negative integers, with nil meaning zero
  { size: size, flags: flags }.each do |name, value|
    value = 0 if value.nil?
    unless value.is_a? Integer
      raise ArgumentError, "#{name} must be nil or an Integer"
    end
    raise ArgumentError, "#{name} must be non-negative" if value < 0
    instance_variable_set "@#{name}", value
  end

  # token-valued properties (strings or symbols); when not strict, a
  # value that fails validation is dropped instead of raising
  given = { type: type, charset: charset,
            encoding: encoding, language: language }
  TOKENS.each_key do |name|
    raw = given[name]
    next unless raw

    token =
      begin
        coerce_token raw, name
      rescue StandardError
        raise if strict
        nil
      end
    instance_variable_set "@#{name}", token.freeze if token
  end
end
240
+
241
+ # XXX come up with a policy for these that isn't stupid, plus input sanitation
242
# Readers only for the digest map and the size.
attr_reader :digests
attr_reader :size

# Reader/writer pairs for the descriptive properties, the timestamps
# and the validation flags.
attr_accessor :type, :charset, :language, :encoding
attr_accessor :ctime, :mtime, :ptime, :dtime
attr_accessor :flags
245
+
246
+ #
247
# Create a blank object and immediately scan the given content into
# it; every option is handed through to the instance-level scan.
#
# @return [Store::Digest::Object] the freshly scanned object
def self.scan content, digests: URI::NI.algorithms, mtime: nil,
    type: nil, language: nil, charset: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: false, &block
  options = {
    digests: digests, mtime: mtime, type: type, language: language,
    charset: charset, encoding: encoding, blocksize: blocksize,
    strict: strict, fresh: fresh,
  }
  new.scan(content, **options, &block)
end
254
+
255
# Read the content from its current position to the end, recording
# its size in bytes, computing the requested digests and detecting
# its type along the way.
#
# @param content [nil, IO, StringIO, String, Pathname, Proc, #read]
#   what to scan; nil means the content already held by the object
# @param digests [Array<Symbol>, Symbol] the digest algorithms to run
# @param mtime [nil, Time, #to_time, Integer] modification time;
#   defaults to the stored one, the content's own, or the current time
# @param type [String, Symbol] asserted content type
# @param charset [String, Symbol] asserted character set
# @param language [String, Symbol] asserted language tag
# @param encoding [String, Symbol] asserted content-encoding
# @param blocksize [Integer] how many bytes to request per read
# @param strict [true, false] raise on malformed tokens rather than
#   ignoring them
# @param fresh [nil, true, false] update the freshness flag unless nil
# @yieldparam buf [String] each chunk as it is read
# @return [Store::Digest::Object] self
# @raise [ArgumentError] when there is nothing scannable, or the
#   digest list is malformed
def scan content = nil, digests: URI::NI.algorithms, mtime: nil,
    type: nil, charset: nil, language: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: nil, &block
  # update freshness if there is something to update
  @fresh = !!fresh unless fresh.nil?

  # resolve whatever we were given into something readable
  content = case content
            when nil then self.content
            when IO, StringIO then content
            when String then StringIO.new content
            when Pathname then content.open('rb')
            when Proc then content.call
            when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
              content
            else
              raise ArgumentError,
                "Cannot scan content of type #{content.class}"
            end
  # fail clearly instead of with a NoMethodError on nil further down
  raise ArgumentError, 'No content to scan' if content.nil?
  content.binmode if content.respond_to? :binmode

  # sane default for mtime
  @mtime = coerce_time(mtime || @mtime ||
    (content.respond_to?(:mtime) ? content.mtime : Time.now), :mtime)

  # validate the asserted tokens, remembering the normalized forms;
  # when not strict, a malformed token is simply ignored
  asserted = { type: type, charset: charset,
               language: language, encoding: encoding }
  coerced = {}
  TOKENS.each_key do |k|
    raw = asserted[k]
    next unless raw

    token =
      begin
        coerce_token raw, k
      rescue StandardError
        raise if strict
        nil
      end
    next unless token

    coerced[k] = token.freeze
    instance_variable_set "@#{k}", coerced[k]
  end

  digests = case digests
            when Array then digests
            when Symbol then [digests]
            else
              raise ArgumentError, 'Digests must be one or more symbol'
            end
  unknown = digests - URI::NI.algorithms
  raise ArgumentError, "Invalid digest list #{unknown}" unless
    unknown.empty?

  # set up the contexts
  contexts = digests.map { |d| [d, URI::NI.context(d)] }.to_h

  # sample for mime type checking (a binary buffer, since it holds raw
  # bytes)
  sample = StringIO.new(String.new)
  @size = 0
  while (buf = content.read(blocksize))
    # count bytes, not characters: a reader may hand back text strings
    @size += buf.bytesize
    sample << buf if sample.pos < SAMPLE
    contexts.each_value { |ctx| ctx << buf }
    block.call buf if block
  end

  # seek the content back to the front and store it
  content.seek 0, 0
  @content = content

  # set up the digests
  @digests = contexts.map do |k, ctx|
    [k, URI::NI.compute(ctx, algorithm: k).freeze]
  end.to_h.freeze

  # the sample's position is at its end after all the writes; rewind
  # so type detection starts from the first sampled byte
  sample.rewind

  # obtain the sampled content type
  ts = MimeMagic.by_magic(sample) || MimeMagic.default_type(sample)
  if content.respond_to? :path
    # may as well use the path if it's available and more specific
    ps = MimeMagic.by_path(content.path)
    # XXX the need to do ts.to_s is a bug in mimemagic
    ts = ps if ps && ps.child_of?(ts.to_s)
  end

  # prefer the detected type when nothing valid was asserted or when
  # it refines the assertion; otherwise keep the *normalized* assertion
  # (never the raw argument, which may be malformed)
  declared = coerced[:type]
  @type = !declared || ts.child_of?(declared) ? ts.to_s : declared

  self
end
336
+
337
+ # Determine (or set) whether the object is "fresh", i.e. whether it
338
+ # is new (or restored), or had previously been in the store.
339
+ #
340
+ # @param state [true, false]
341
def fresh? state = nil
  # no argument (or nil): plain query
  return @fresh if state.nil?

  # otherwise store the argument's truthiness and hand it back
  @fresh = state ? true : false
end
344
+
345
+ # Return the algorithms used in the object.
346
+ # @return [Array]
347
def algorithms
  # an object without a digest map simply has no algorithms
  return [] unless @digests

  @digests.keys.sort
end
350
+
351
+ # Return a particular digest. Returns nil if there is no match.
352
+ # @param symbol [Symbol, #to_s, #to_sym] the digest
353
+ # @return [URI::NI, nil] the digest for that algorithm, if present
354
def digest symbol
  # anything convertible to a symbol is acceptable as a key
  unless symbol.respond_to? :to_sym
    raise ArgumentError, "This method takes a symbol"
  end

  key = symbol.to_sym
  digests[key]
end
359
+
360
# Bracket lookup is another name for #digest.
alias_method :[], :digest
361
+
362
+ # Returns the content stored in the object.
363
+ # @return [IO]
364
def content
  # anything that is not a proc is handed back as stored
  return @content unless @content.is_a? Proc

  # a proc is a deferred opener: call it to obtain the content
  @content.call
end
367
+
368
+ # Determines if there is content embedded in the object.
369
+ # @return [false, true]
370
def content?
  # truthiness of the stored content, as a strict boolean
  @content ? true : false
end
373
+
374
+ # Returns the type and charset, suitable for an HTTP header.
375
+ # @return [String]
376
def type_charset
  # without a charset the type stands alone
  return type.to_s unless charset

  "#{type};charset=#{charset}"
end
381
+
382
+ # Determines if the object has been scanned.
383
+ # @return [false, true]
384
def scanned?
  # the presence of at least one digest is what marks a scanned object
  @digests.any?
end
387
+
388
+ # Returns true if the content type has been checked.
389
+ # @return [false, true]
390
def type_checked?
  # test the single "checked" bit for the content type
  !(@flags & TYPE_CHECKED).zero?
end
393
+
394
+ # Returns true if the content type has been checked _and_ is valid.
395
+ # @return [false, true]
396
def type_valid?
  # both the "checked" and the "valid" bit have to be set; testing for
  # a non-zero intersection would also accept either bit on its own
  required = TYPE_CHECKED | TYPE_VALID
  (@flags & required) == required
end
399
+
400
+ # Returns true if the character set has been checked.
401
+ # @return [false, true]
402
def charset_checked?
  # test the single "checked" bit for the character set
  !(@flags & CHARSET_CHECKED).zero?
end
405
+
406
+ # Returns true if the character set has been checked _and_ is valid.
407
+ # @return [false, true]
408
def charset_valid?
  # both the "checked" and the "valid" bit have to be set; testing for
  # a non-zero intersection would also accept either bit on its own
  required = CHARSET_CHECKED | CHARSET_VALID
  (@flags & required) == required
end
411
+
412
+ # Returns true if the content encoding (e.g. gzip, deflate) has
413
+ # been checked.
414
+ # @return [false, true]
415
def encoding_checked?
  # test the single "checked" bit for the content encoding
  !(@flags & ENCODING_CHECKED).zero?
end
418
+
419
+ # Returns true if the content encoding has been checked _and_ is valid.
420
+ # @return [false, true]
421
def encoding_valid?
  # both the "checked" and the "valid" bit have to be set; testing for
  # a non-zero intersection would also accept either bit on its own
  required = ENCODING_CHECKED | ENCODING_VALID
  (@flags & required) == required
end
424
+
425
+ # Returns true if the blob's syntax has been checked.
426
+ # @return [false, true]
427
def syntax_checked?
  # test the single "checked" bit for the syntax
  !(@flags & SYNTAX_CHECKED).zero?
end
430
+
431
+ # Returns true if the blob's syntax has been checked _and_ is valid.
432
+ # @return [false, true]
433
def syntax_valid?
  # both the "checked" and the "valid" bit have to be set; testing for
  # a non-zero intersection would also accept either bit on its own
  required = SYNTAX_CHECKED | SYNTAX_VALID
  (@flags & required) == required
end
436
+
437
# Writers for the four timestamps: the assigned value goes through
# coerce_time and the result is stored frozen.
%i[ctime mtime ptime dtime].each do |field|
  ivar = "@#{field}"

  define_method :"#{field}=" do |value|
    instance_variable_set ivar, coerce_time(value, field).freeze
  end
end
442
+
443
# For each token-valued property, define a writer that stores the
# coerce_token result frozen, plus a +<field>_ok?+ predicate that tests
# a candidate against the field's pattern without storing anything.
%i[type charset encoding language].each do |field|
  ivar = "@#{field}"

  define_method :"#{field}=" do |value|
    instance_variable_set ivar, coerce_token(value, field).freeze
  end

  define_method :"#{field}_ok?" do |value|
    pattern = TOKENS[field].first
    pattern.match? value
  end
end
452
+
453
+ # Just a plain old predicate to determine whether the blob has been
454
+ # deleted from the store (but implicitly the metadata record
455
+ # remains).
456
+ # @return [false, true]
457
def deleted?
  # a deletion time being present is what marks the object as deleted
  @dtime ? true : false
end
460
+
461
+ # Return the object as a hash. Omits the content by default.
462
+ # @param content [false, true] include the content if true
463
+ # @return [Hash] the object as a hash
464
def to_h content: false
  # the content only leads the field list when explicitly requested
  fields = content ? %i[content digests] : %i[digests]
  fields += MANDATORY + OPTIONAL + %i[flags]

  # each value is read through its accessor and copied
  fields.each_with_object({}) do |name, hash|
    hash[name] = send(name).dup
  end
end
471
+
472
+ # Outputs a human-readable string representation of the object.
473
def to_s
  out = "#{self.class}\n Digests:\n"

  # disgorge the digests, ordered by their string form
  digests.values.sort_by(&:to_s).each { |d| out << " #{d}\n" }

  # now the fields: mandatory ones always, optional ones when set
  MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
  OPTIONAL.each do |o|
    val = send o
    out << " #{LABELS[o]}: #{val}\n" if val
  end

  # now the validation statuses: the i-th aspect in FLAG owns the
  # (checked, valid) bit pair starting at bit 2 * i, and the value of
  # that pair is the index of its name in STATE
  out << "Validation:\n"
  FLAG.each_with_index do |label, i|
    state = (flags >> (2 * i)) & 3
    out << (" %-16s: %s\n" % [label, STATE[state]])
  end

  out
end
497
+ end