store-digest 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,497 @@
1
+ require 'store/digest/version'
2
+
3
+ require 'uri'
4
+ require 'uri/ni'
5
+ require 'mimemagic'
6
+ require 'mimemagic/overlay'
7
+
8
# Backfills a handful of class-level helpers onto MimeMagic for versions
# of the library that do not provide them.
class MimeMagic
  # XXX erase this when these methods get added

  # NOTE: the helpers below are class methods, so each guard inspects
  # the singleton. `method_defined?` only looks at instance methods and
  # would never notice an existing `MimeMagic.parents` and friends;
  # `singleton_methods(false)` reports only methods defined directly on
  # this class.

  unless singleton_methods(false).include? :parents
    # Return the immediate parents of a type as recorded in TYPES.
    #
    # @param type [String] the type to look up in TYPES
    # @return [Array<MimeMagic>] distinct parents; empty if the type is
    #   not in TYPES
    def self.parents type
      TYPES.fetch(type, [nil, []])[1].map { |t| new t }.uniq
    end
  end

  unless singleton_methods(false).include? :ancestor_types
    # Collect the ancestors of every parent of the given type.
    #
    # NOTE(review): `ancestors(t)` is invoked on the class itself; unless
    # the library supplies its own `MimeMagic.ancestors`, that resolves
    # to Module#ancestors, which takes no arguments. This may have been
    # meant to recurse -- confirm before relying on it.
    #
    # @param type [String] the type to look up
    # @return [Array] the flattened, de-duplicated result
    def self.ancestor_types type
      parents(type).map { |t| ancestors(t) }.flatten.uniq
    end
  end

  unless singleton_methods(false).include? :binary?
    # Guess whether something is binary by sampling up to 1024 bytes or
    # characters from its start and looking for control codes.
    #
    # @param thing [#tell, #seek, #read, #to_s] a seekable stream (its
    #   position is restored afterwards) or anything with a string form
    # @return [true, false] true if the sample is empty or contains
    #   control codes other than ordinary whitespace
    # @raise [ArgumentError] if the argument cannot be sampled
    def self.binary? thing
      # get some stuff out of the IO or get a substring
      sample =
        if %i[tell seek read].all? { |m| thing.respond_to? m }
          pos = thing.tell
          begin
            thing.seek 0, 0
            thing.read 1024
          ensure
            # put the stream back where we found it, even on failure
            thing.seek pos
          end
        elsif thing.respond_to? :to_s
          thing.to_s[0, 1024]
        else
          raise ArgumentError,
            "Cannot sample an instance of #{thing.class}"
        end

      # consider this to be 'binary' if empty
      return true if sample.nil? || sample.empty?

      # control codes minus ordinary whitespace
      /[\x0-\x8\xe-\x1f\x7f]/n.match? sample
    end
  end

  unless singleton_methods(false).include? :default_type
    # Fall back to a generic type based on whether the content looks
    # binary.
    #
    # @param thing [#tell, #seek, #read, #to_s] passed to {.binary?}
    # @return [MimeMagic] application/octet-stream or text/plain
    def self.default_type thing
      new(binary?(thing) ? 'application/octet-stream' : 'text/plain')
    end
  end
end
51
+
52
+ class Store::Digest::Object
53
+
54
+ private
55
+
56
# How much content to sample for MIME type detection.
SAMPLE    = 2**13 # must be big enough to detect ooxml
# Default number of bytes per read when scanning.
BLOCKSIZE = 2**16

# Alternate spellings of character sets mapped to the preferred one.
CHARSETS = [
  %w[utf8 utf-8],
  %w[iso8859-1 iso-8859-1],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# Alternate spellings of content encodings mapped to the preferred one.
ENCODINGS = [
  %w[x-compress compress],
  %w[x-gzip gzip],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# Regexp source for a single token: one or more characters outside the
# excluded control, separator and high-byte ranges.
TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'.freeze

# { key: [pattern, normalizer] } - assumes stripped and downcased
#
# Patterns are anchored with \A and \z rather than ^ and $, so that the
# *whole* value has to match; line anchors would accept a multi-line
# string on the strength of one line alone.
TOKENS = {
  type:     [/\A(#{TOKEN}(?:\/#{TOKEN})?)\z/on, -> c { c.downcase }],
  charset:  [/\A(#{TOKEN})\z/on,
    -> c { c = c.tr(?_, ?-).downcase; CHARSETS.fetch c, c } ],
  encoding: [/\A(#{TOKEN})\z/on,
    -> c { c = c.tr(?_, ?-).downcase; ENCODINGS.fetch c, c } ],
  language: [/\A([a-z]{2,3}(?:[-_][0-9a-z]+)*)\z/,
    -> c { c.downcase.tr(?_, ?-).gsub(/-*$/, '') } ],
}.each_value(&:freeze).freeze

# flag constants: two bits (checked, valid) per validated property
TYPE_CHECKED     = 1 << 0
TYPE_VALID       = 1 << 1
CHARSET_CHECKED  = 1 << 2
CHARSET_VALID    = 1 << 3
ENCODING_CHECKED = 1 << 4
ENCODING_VALID   = 1 << 5
SYNTAX_CHECKED   = 1 << 6
SYNTAX_VALID     = 1 << 7

# Human-readable labels for the metadata fields.
LABELS = {
  size:     'Size (Bytes)',
  ctime:    'Added to Store',
  mtime:    'Last Modified',
  ptime:    'Properties Modified',
  dtime:    'Deleted',
  type:     'Content Type',
  language: '(Natural) Language',
  charset:  'Character Set',
  encoding: 'Content Encoding',
}.freeze

# Field lists used when serializing the object.
MANDATORY = %i[size ctime mtime ptime].freeze
OPTIONAL  = %i[dtime type language charset encoding].freeze
# Names and states used when rendering the validation flags.
FLAG      = %i[content-type charset content-encoding syntax].freeze
STATE     = %i[unverified invalid recheck valid].freeze
108
+
109
# Turn a timestamp-ish value into a Time.
#
# @param t [nil, Time, #to_time, Integer] the value to coerce; an
#   Integer is handed to Time.at
# @param k [Symbol, #to_s] the field name, used only in error messages
# @return [Time, nil] nil is passed through untouched
# @raise [ArgumentError] on a negative Integer or an unsupported type
def coerce_time t, k
  return t if t.nil? || t.is_a?(Time)
  return t.to_time if t.respond_to? :to_time

  unless t.is_a? Integer
    raise ArgumentError, "Invalid type for #{k}: #{t.class}"
  end

  if t < 0
    raise ArgumentError, "#{k} given as Integer must be non-negative"
  end

  Time.at t
end
123
+
124
# Validate and normalize a token (type, charset, encoding, language).
#
# @param t [#to_s] the raw value; it is stripped and downcased first
# @param k [Symbol] a key of TOKENS selecting pattern and normalizer
# @return [Object] whatever the normalizer returns for the first capture
# @raise [RuntimeError] if the cleaned value does not match the pattern
def coerce_token t, k
  cleaned = t.to_s.strip.downcase
  pattern, normalizer = TOKENS[k]
  matched = pattern.match cleaned
  raise "#{k} #{cleaned} does not match #{pattern}" unless matched
  normalizer.call matched[1]
end
130
+
131
+ public
132
+
133
# Create a new object, naively recording whatever is handed to it.
# Arguments are checked for type and shape, but the content itself is
# not read here.
#
# @note use {.scan} or {#scan} to populate
#
# @param content [IO, String, Proc, File, Pathname, ...] some content;
#   a String is wrapped in a StringIO and a Pathname is opened lazily
# @param digests [Hash, Array, URI::NI, nil] the digests ascribed to
#   the content
# @param size [Integer] assert the object's size
# @param type [String] assert the object's MIME type
# @param charset [String] the character set, if applicable
# @param language [String] the (RFC5646) language tag, if applicable
# @param encoding [String] the content-encoding (e.g. compression)
# @param ctime [Time] assert object creation time
# @param mtime [Time] assert object modification time
# @param ptime [Time] assert object metadata parameter modification time
# @param dtime [Time] assert object deletion time
# @param flags [Integer] validation state flags
# @param strict [true, false] raise an error on a bad type, charset,
#   encoding or language; otherwise such values are silently dropped
# @param fresh [true, false] assert "freshness" of object vis-a-vis the store
# @raise [ArgumentError] if content, digests, a timestamp, size or
#   flags cannot be accepted
# @return [Store::Digest::Object] the object in question
def initialize content = nil, digests: {}, size: 0,
    type: 'application/octet-stream', charset: nil, language: nil,
    encoding: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil,
    flags: 0, strict: true, fresh: false

  # snag this immediately
  @fresh = fresh ? true : false

  # content: wrap strings, defer opening paths, pass streams through
  @content =
    case content
    when String   then StringIO.new content
    when Pathname then -> { content.expand_path.open('rb') }
    when nil, IO, StringIO, Proc then content
    else
      unless %i[read seek pos].all? { |m| content.respond_to? m }
        raise ArgumentError,
          "Cannot accept content given as #{content.class}"
      end
      content
    end

  # digests: normalize every accepted shape to { algorithm => URI::NI }
  @digests =
    case digests
    when nil then {} # empty hash
    when URI::NI then { digests.algorithm => digests.dup.freeze }
    when Array
      # only accepts array of URI::NI
      digests.each_with_object({}) do |uri, clean|
        unless uri.is_a? URI::NI
          raise ArgumentError,
            "Digests given as array can only be URI::NI, not #{uri}"
        end
        clean[uri.algorithm] = uri.dup.freeze
      end
    when Hash
      # hash must be clean
      digests.each_with_object({}) do |(key, uri), clean|
        unless key.respond_to? :to_sym
          raise ArgumentError, 'Digest keys must be symbol-able'
        end
        algorithm = key.to_sym
        unless uri.is_a? URI::NI
          raise ArgumentError, 'Digest values must be URI::NI'
        end
        unless algorithm == uri.algorithm
          raise ArgumentError, 'Digest key must match value algorithm'
        end
        clean[algorithm] = uri.dup.freeze
      end
    else
      # everything else is invalid
      raise ArgumentError,
        "Cannot coerce digests given as #{digests.inspect}"
    end

  # ctime, mtime, ptime, dtime should be all nil or nonnegative
  # integers or Time or DateTime
  { ctime: ctime, mtime: mtime, ptime: ptime, dtime: dtime }.each do |name, raw|
    instance_variable_set "@#{name}", coerce_time(raw, name)
  end

  # size and flags should be non-negative integers (nil counts as zero)
  { size: size, flags: flags }.each do |name, raw|
    raw = 0 if raw.nil?
    unless raw.is_a? Integer
      raise ArgumentError, "#{name} must be nil or an Integer"
    end
    raise ArgumentError, "#{name} must be non-negative" if raw < 0
    instance_variable_set "@#{name}", raw
  end

  # the following can be strings or symbols:
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  TOKENS.keys.each do |name|
    raw = given[name]
    next unless raw

    token =
      begin
        coerce_token(raw, name)
      rescue StandardError
        # only strict mode lets the failure propagate
        raise if strict
        nil
      end
    instance_variable_set "@#{name}", token.freeze if token
  end
end
240
+
241
# XXX come up with a policy for these that isn't stupid, plus input sanitation
#
# Only readers are generated for the time and token fields. Their
# writers are created further down with `define_method` so that every
# assignment is coerced; generating plain writers here as well would
# only have them redefined (and trigger "method redefined" warnings).
# `flags` has no coercing writer, so it keeps a generated one.
attr_reader :digests, :size, :type, :charset, :language, :encoding,
  :ctime, :mtime, :ptime, :dtime
attr_accessor :flags
245
+
246
# Create a blank object and scan the given content into it; every
# argument is forwarded unchanged to the instance method of the same
# name.
#
# @param content [Object] the content to scan
# @return [Object] whatever the instance-level scan returns
def self.scan content, digests: URI::NI.algorithms, mtime: nil,
    type: nil, language: nil, charset: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: false, &block
  options = {
    digests: digests, mtime: mtime, type: type, language: language,
    charset: charset, encoding: encoding, blocksize: blocksize,
    strict: strict, fresh: fresh,
  }
  new.scan content, **options, &block
end
254
+
255
# Read the content through to the end, recording its size, digests and
# detected MIME type on this object.
#
# @param content [IO, String, Pathname, Proc, #read, nil] what to scan;
#   nil re-scans the content already held by the object
# @param digests [Array<Symbol>, Symbol] the digest algorithms to compute
# @param mtime [Time, Integer, #to_time, nil] asserted modification
#   time; falls back to the existing value, then the content's own
#   mtime if it has one, then the current time
# @param type [String, nil] asserted MIME type; it is kept only when
#   the detected type is not a specialization of it
# @param charset [String, nil] asserted character set
# @param language [String, nil] asserted language tag
# @param encoding [String, nil] asserted content encoding
# @param blocksize [Integer] how many bytes to read at a time
# @param strict [true, false] raise on a malformed type, charset,
#   encoding or language rather than ignoring it
# @param fresh [true, false, nil] update the freshness flag unless nil
# @yieldparam buf [String] each chunk as it is read
# @raise [ArgumentError] if the content or digest list is unusable
# @return [self]
def scan content = nil, digests: URI::NI.algorithms, mtime: nil,
    type: nil, charset: nil, language: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: nil, &block
  # update freshness if there is something to update
  @fresh = !!fresh unless fresh.nil?

  # we put all the scanning stuff in here
  content = case content
            when nil          then self.content
            when IO, StringIO then content
            when String       then StringIO.new content
            when Pathname     then content.open('rb')
            when Proc         then content.call
            when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
              content
            else
              raise ArgumentError,
                "Cannot scan content of type #{content.class}"
            end
  # fail early and clearly rather than with a NoMethodError on nil
  # after half the object's state has already been touched
  raise ArgumentError, 'No content to scan' if content.nil?
  content.binmode if content.respond_to? :binmode

  # sane default for mtime
  @mtime = coerce_time(mtime || @mtime ||
    (content.respond_to?(:mtime) ? content.mtime : Time.now), :mtime)

  # coerce the asserted tokens, remembering what survived so the type
  # decision at the end works from the normalized value
  given    = { type: type, charset: charset, encoding: encoding,
    language: language }
  declared = {}
  TOKENS.keys.each do |k|
    raw = given[k]
    next unless raw

    token =
      begin
        coerce_token(raw, k)
      rescue StandardError
        raise if strict
        nil
      end
    next unless token

    declared[k] = token.freeze
    instance_variable_set "@#{k}", declared[k]
  end

  digests = case digests
            when Array  then digests
            when Symbol then [digests]
            else
              raise ArgumentError, 'Digests must be one or more symbol'
            end
  unknown = digests - URI::NI.algorithms
  raise ArgumentError, "Invalid digest list #{unknown}" unless
    unknown.empty?

  # set up the contexts
  digests = digests.map { |d| [d, URI::NI.context(d)] }.to_h

  # sample for mime type checking
  sample = StringIO.new ''
  @size  = 0
  while buf = content.read(blocksize)
    @size += buf.size
    sample << buf if sample.pos < SAMPLE
    digests.values.each { |ctx| ctx << buf }
    block.call buf if block
  end

  # seek the content back to the front and store it
  content.seek 0, 0
  @content = content

  # set up the digests
  @digests = digests.map do |k, v|
    [k, URI::NI.compute(v, algorithm: k).freeze]
  end.to_h.freeze

  # obtain the sampled content type
  detected = MimeMagic.by_magic(sample) || MimeMagic.default_type(sample)
  if content.respond_to? :path
    # may as well use the path if it's available and more specific
    by_path = MimeMagic.by_path(content.path)
    # XXX the need to do detected.to_s is a bug in mimemagic
    detected = by_path if by_path && by_path.child_of?(detected.to_s)
  end

  # Compare against (and, if it wins, store) the *coerced* assertion:
  # the raw argument may be unnormalized, or -- when not strict -- may
  # have been rejected outright.
  asserted = declared[:type]
  @type = if asserted.nil? || detected.child_of?(asserted)
            detected.to_s
          else
            asserted
          end

  self
end
336
+
337
# Report whether the object is "fresh", i.e. whether it is new (or
# restored) as opposed to having previously been in the store. When
# given a non-nil argument, the flag is set to its truthiness first.
#
# @param state [true, false, nil] the new state, or nil to just query
# @return [true, false] the (possibly updated) flag
def fresh? state = nil
  return @fresh if state.nil?
  @fresh = state ? true : false
end
344
+
345
# List the digest algorithms recorded on the object, sorted.
# @return [Array] the keys of the digest table; empty if there is none
def algorithms
  @digests ? @digests.keys.sort : []
end
350
+
351
# Look up a single digest by algorithm. Also available as `[]`.
# @param symbol [Symbol, #to_sym] the digest algorithm
# @return [Object, nil] the entry stored under that algorithm in the
#   digest table, or nil if there is none
# @raise [ArgumentError] if the argument cannot be made into a symbol
def digest symbol
  unless symbol.respond_to? :to_sym
    raise ArgumentError, "This method takes a symbol"
  end

  digests[symbol.to_sym]
end

alias [] digest
361
+
362
# Fetch the content held by the object. If it was stored as a Proc,
# the Proc is called and its result returned.
# @return [IO, nil]
def content
  return @content.call if @content.is_a? Proc
  @content
end
367
+
368
# Tell whether any content is attached to the object.
# @return [false, true]
def content?
  @content ? true : false
end
373
+
374
# Render the type, followed by `;charset=...` when a character set is
# present, suitable for an HTTP header.
# @return [String]
def type_charset
  charset ? "#{type};charset=#{charset}" : type.to_s
end
381
+
382
# Tell whether the object has been scanned, judged by whether any
# digests are recorded.
# @return [false, true]
def scanned?
  return false if @digests.empty?
  true
end
387
+
388
# Tell whether the content type has been checked.
# @return [false, true]
def type_checked?
  !(@flags & TYPE_CHECKED).zero?
end
393
+
394
# Returns true if the content type has been checked _and_ is valid.
# Both flag bits must be set; either one alone is not enough.
# @return [false, true]
def type_valid?
  mask = TYPE_CHECKED | TYPE_VALID
  @flags & mask == mask
end
399
+
400
# Tell whether the character set has been checked.
# @return [false, true]
def charset_checked?
  (@flags & CHARSET_CHECKED).nonzero? ? true : false
end
405
+
406
# Returns true if the character set has been checked _and_ is valid.
# Both flag bits must be set; either one alone is not enough.
# @return [false, true]
def charset_valid?
  mask = CHARSET_CHECKED | CHARSET_VALID
  @flags & mask == mask
end
411
+
412
# Tell whether the content encoding (e.g. gzip, deflate) has been
# checked.
# @return [false, true]
def encoding_checked?
  !(@flags & ENCODING_CHECKED).zero?
end
418
+
419
# Returns true if the content encoding has been checked _and_ is valid.
# Both flag bits must be set; either one alone is not enough.
# @return [false, true]
def encoding_valid?
  mask = ENCODING_CHECKED | ENCODING_VALID
  @flags & mask == mask
end
424
+
425
# Tell whether the blob's syntax has been checked.
# @return [false, true]
def syntax_checked?
  (@flags & SYNTAX_CHECKED).nonzero? ? true : false
end
430
+
431
# Returns true if the blob's syntax has been checked _and_ is valid.
# Both flag bits must be set; either one alone is not enough.
# @return [false, true]
def syntax_valid?
  mask = SYNTAX_CHECKED | SYNTAX_VALID
  @flags & mask == mask
end
436
+
437
# Writers for the timestamp fields: each coerces its argument with
# coerce_time and stores the frozen result.
%i[ctime mtime ptime dtime].each do |field|
  ivar = "@#{field}"
  define_method("#{field}=") do |value|
    instance_variable_set ivar, coerce_time(value, field).freeze
  end
end
442
+
443
# For each token field, define a coercing writer and a `<field>_ok?`
# predicate that tests a candidate value against the field's pattern.
%i[type charset encoding language].each do |field|
  ivar = "@#{field}"

  # writer: run the value through coerce_token and store it frozen
  define_method("#{field}=") do |value|
    instance_variable_set ivar, coerce_token(value, field).freeze
  end

  # predicate: the value is matched as given, without normalization
  define_method("#{field}_ok?") do |value|
    pattern = TOKENS[field].first
    pattern.match? value
  end
end
452
+
453
# Tell whether the blob has been deleted from the store (while,
# implicitly, its metadata record remains), judged by the presence of
# a deletion time.
# @return [false, true]
def deleted?
  @dtime ? true : false
end
460
+
461
# Export the object's fields as a hash of duplicated values, keyed by
# field name. The content is left out unless asked for.
# @param content [false, true] include the content if true
# @return [Hash] the object as a hash
def to_h content: false
  fields = [:digests, *MANDATORY, *OPTIONAL, :flags]
  fields.unshift :content if content
  fields.each_with_object({}) do |field, out|
    out[field] = send(field).dup
  end
end
471
+
472
# Outputs a human-readable string representation of the object: the
# digests, the mandatory fields, whichever optional fields are set, and
# the state of each validation flag.
# @return [String]
def to_s
  out = "#{self.class}\n Digests:\n"

  # disgorge the digests
  digests.values.sort_by(&:to_s).each { |d| out << " #{d}\n" }

  # now the fields
  MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
  OPTIONAL.each do |o|
    val = send o
    out << " #{LABELS[o]}: #{val}\n" if val
  end

  # now the validation statuses
  out << "Validation:\n"
  FLAG.each_with_index do |name, i|
    # Each property owns two adjacent bits (checked, then valid) in the
    # order given by FLAG, so the i-th pair starts at bit 2 * i.
    state = (flags >> (2 * i)) & 3
    out << (" %-16s: %s\n" % [name, STATE[state]])
  end

  out
end
497
+ end