store-digest 0.3.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,602 +0,0 @@
1
- require 'store/digest/version'
2
-
3
- require 'uri'
4
- require 'uri/ni'
5
- require 'mimemagic'
6
-
7
# Backfills a few helper methods onto MimeMagic. Each one is only
# defined when the loaded MimeMagic does not already provide it.
class MimeMagic
  # XXX erase this when these methods get added
  unless singleton_class.method_defined? :parents
    # Look up the parents listed for a type in TYPES (the second
    # element of the entry).
    #
    # @param type [#to_s] the type to look up
    # @return [Array<MimeMagic>] the unique parents; empty when the
    #   type has no TYPES entry
    def self.parents type
      TYPES.fetch(type.to_s, [nil, []])[1].map { |t| new t }.uniq
    end
  end

  unless method_defined? :parents
    # Parents of this type. A type with no listed parents is given
    # application/octet-stream as its parent, unless it is itself
    # application/octet-stream.
    #
    # @return [Array<MimeMagic>]
    def parents
      out = TYPES.fetch(type.to_s.downcase, [nil, []])[1].map do |x|
        self.class.new x
      end
      # add this unless we're it
      out << self.class.new('application/octet-stream') if
        out.empty? && type.downcase != 'application/octet-stream'

      out.uniq
    end
  end

  unless method_defined? :lineage
    # This type followed by all of its ancestors, without duplicates.
    #
    # @return [Array<MimeMagic>]
    def lineage
      ([self] + parents.flat_map(&:lineage)).uniq
    end
  end

  unless method_defined? :descendant_of?
    # Whether the given type appears in this type's lineage (which
    # includes the type itself).
    #
    # @param type [#to_s] compared case-insensitively
    # @return [true, false]
    def descendant_of? type
      lineage.map(&:type).include? type.to_s.downcase
    end
  end

  unless singleton_class.method_defined? :binary?
    # Guess whether something is binary by looking for control
    # characters in its first 1024 bytes/characters.
    #
    # @param thing [#tell & #seek & #read, #to_s] an IO-like object
    #   (sampled from the start, position restored) or anything
    #   stringifiable
    # @return [true, false] empty input counts as binary
    # @raise [ArgumentError] if the argument can be neither read nor
    #   stringified
    def self.binary? thing
      sample = nil

      # get some stuff out of the IO or get a substring
      if %i[tell seek read].all? { |m| thing.respond_to? m }
        pos = thing.tell
        thing.seek 0, 0
        sample = thing.read 1024
        thing.seek pos
      elsif thing.respond_to? :to_s
        sample = thing.to_s[0, 1024]
      else
        raise ArgumentError, "Cannot sample an instance of #{thing.class}"
      end

      # consider this to be 'binary' if empty
      return true if sample.nil? || sample.empty?

      # control codes minus ordinary whitespace
      /[\x0-\x8\xe-\x1f\x7f]/n.match?(sample)
    end
  end

  unless singleton_class.method_defined? :default_type
    # Fallback type for content that has no better match.
    #
    # @param thing [Object] anything accepted by {.binary?}
    # @return [MimeMagic] application/octet-stream for binary
    #   content, text/plain otherwise
    def self.default_type thing
      new(binary?(thing) ? 'application/octet-stream' : 'text/plain')
    end
  end
end
69
-
70
- # Store entry object class.
71
- #
72
- class Store::Digest::Object
73
-
74
# This is a struct for the bank of flags, with a couple of extra
# methods for converting to and from other representations.
#
Flags = Struct.new(
  'Flags',
  :type_checked, :type_valid, :charset_checked, :charset_valid,
  :encoding_checked, :encoding_valid, :syntax_checked, :syntax_valid, :cache
) do
  # Initialize a struct of flags from arbitrary input.
  #
  # Integers are read bit by bit, least significant bit first, in
  # member order. Hashes are read by member name (other keys are
  # ignored). Anything else array-like is read positionally. Missing
  # positions or members are false; a value counts as set unless it
  # is nil, false, or 0.
  #
  # @param arg [Store::Digest::Object::Flags, Integer, #to_h, #to_a]
  #
  # @return [Store::Digest::Object::Flags]
  #
  # @raise [ArgumentError] if the input is none of the above
  #
  def self.from arg
    # an existing struct is handed back untouched
    return arg if arg.is_a? self

    # get the length since we use it in a few places
    len = members.size

    bits =
      if arg.is_a? Integer
        arg.digits(2).first(len)
      elsif arg.is_a? Hash
        # go through the members positionally rather than passing
        # keywords: Struct only treats bare keywords as member names
        # from Ruby 3.2 on
        members.map { |m| arg.fetch(m, false) }
      elsif arg.respond_to? :to_a
        arg.to_a.first(len)
      else
        raise ArgumentError, 'Input must be an integer or array'
      end

    # append these
    bits += [false] * (len - bits.size) if bits.size < len

    # make sure these are true/false; we do this because `new`
    # doesn't do this
    self[*bits.map { |b| !!(b && b != 0) }]
  end

  # Turn an arbitrary {Array} back into an {Integer}. The first
  # element becomes the least significant bit; nil, false and 0 are
  # cleared bits, everything else is a set bit.
  #
  # @param array [Array]
  #
  # @return [Integer]
  #
  def self.to_i array
    array.to_a.reverse.reduce(0) do |acc, b|
      (acc << 1) | (b && b != 0 ? 1 : 0)
    end
  end

  # The flags packed into an Integer.
  #
  # @return [Integer]
  #
  def to_i
    self.class.to_i to_a
  end
end
136
-
137
# methods defined from here down to the next visibility marker are
# private (constants are not affected)
private

SAMPLE    = 2**13 # must be big enough to detect ooxml
BLOCKSIZE = 2**16

# alternate spelling => preferred spelling
CHARSETS = [
  %w[utf8 utf-8],
  %w[iso8859-1 iso-8859-1],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# alternate spelling => preferred spelling
ENCODINGS = [
  %w[x-compress compress],
  %w[x-gzip gzip],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# regexp source for a run of characters that excludes controls,
# space, the listed separators and anything from \x7f up
TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'

# { key: [pattern, normalizer] } - assumes stripped and downcased
TOKENS = {
  type:     [/^(#{TOKEN}(?:\/#{TOKEN})?)$/on, -> c { c.downcase }],
  charset:  [/^(#{TOKEN})$/on,
    -> c { c = c.tr(?_, ?-).downcase; CHARSETS.fetch c, c } ],
  encoding: [/^(#{TOKEN})$/on,
    -> c { c = c.tr(?_, ?-).downcase; ENCODINGS.fetch c, c } ],
  language: [/^([a-z]{2,3}(?:[-_][0-9a-z]+)*)$/,
    -> c { c.downcase.tr(?_, ?-).gsub(/-*$/, '') } ],
}.freeze

# flag constants; bit positions follow the member order of Flags
TYPE_CHECKED     = 1 << 0
TYPE_VALID       = 1 << 1
CHARSET_CHECKED  = 1 << 2
CHARSET_VALID    = 1 << 3
ENCODING_CHECKED = 1 << 4
ENCODING_VALID   = 1 << 5
SYNTAX_CHECKED   = 1 << 6
SYNTAX_VALID     = 1 << 7
IS_CACHE         = 1 << 8

# human-readable field labels used by #to_s
LABELS = {
  size:     'Size (Bytes)',
  ctime:    'Added to Store',
  mtime:    'Last Modified',
  ptime:    'Properties Modified',
  dtime:    'Deleted (Expires)',
  type:     'Content Type',
  language: '(Natural) Language',
  charset:  'Character Set',
  encoding: 'Content Encoding',
}.freeze

# field lists used by #to_h and #to_s (optional ones are only
# printed when set)
MANDATORY = %i[size ctime mtime ptime].freeze
OPTIONAL  = %i[dtime type language charset encoding].freeze
# labels and states for the validation section of #to_s
FLAG      = %i[content-type charset content-encoding syntax].freeze
STATE     = %i[unverified invalid recheck valid].freeze
192
-
193
# Normalize a timestamp argument.
#
# @param t [nil, Time, #to_time, Integer] the value to coerce; an
#   Integer is handed to Time.at
# @param k [#to_s] the field name, used only in error messages
# @return [Time, nil] nil and Time instances are passed through as-is
# @raise [ArgumentError] on a negative Integer or an unsupported type
def coerce_time t, k
  return if t.nil?
  return t if t.is_a? Time
  return t.to_time if t.respond_to? :to_time

  unless t.is_a? Integer
    raise ArgumentError, "Invalid type for #{k}: #{t.class}"
  end

  if t < 0
    raise ArgumentError, "#{k} given as Integer must be non-negative"
  end

  Time.at t
end
207
-
208
# Validate and normalize one of the token-valued fields.
#
# @param t [#to_s] the raw value; stripped and downcased before matching
# @param k [Symbol] a key of TOKENS selecting the pattern/normalizer pair
# @return [Object] whatever the normalizer returns for the first capture
# @raise [RuntimeError] if the value does not match the pattern
def coerce_token t, k
  candidate = t.to_s.strip.downcase
  pattern, normalizer = TOKENS[k]

  matched = pattern.match(candidate)
  raise "#{k} #{candidate} does not match #{pattern}" unless matched

  normalizer.call matched[1]
end
214
-
215
public

# Build an object that simply records what it is given; nothing is
# read from the content here.
#
# @note use {.scan} or {#scan} to populate
#
# @param content [IO, String, Proc, File, Pathname, ...] some content;
#   a String is wrapped in a StringIO and a Pathname in a lambda that
#   opens it, anything else must respond to read, seek and pos
# @param digests [Hash, Array, URI::NI, nil] the digests ascribed to
#   the content
# @param size [Integer] assert the object's size
# @param type [String] assert the object's MIME type
# @param charset [String] the character set, if applicable
# @param language [String] the (RFC5646) language tag, if applicable
# @param encoding [String] the content-encoding (e.g. compression)
# @param ctime [Time] assert object creation time
# @param mtime [Time] assert object modification time
# @param ptime [Time] assert object metadata parameter modification time
# @param dtime [Time] assert object deletion time
# @param flags [Integer] validation state flags
# @param strict [true, false] raise an error on a bad type, charset,
#   encoding or language (when false, a bad value is dropped)
# @param fresh [true, false] assert "freshness" of object vis-a-vis the store
#
# @raise [ArgumentError] on unusable content, digests, times or size
#
# @return [Store::Digest::Object] the object in question
#
def initialize content = nil, digests: {}, size: 0,
    type: 'application/octet-stream', charset: nil, language: nil,
    encoding: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil,
    flags: 0, strict: true, fresh: false

  # snag this immediately
  @fresh = fresh ? true : false

  # check input on content
  @content =
    case content
    when nil, IO, StringIO, Proc then content
    when String then StringIO.new content
    when Pathname then -> { content.expand_path.open('rb') }
    else
      unless %i[read seek pos].all? { |m| content.respond_to? m }
        raise ArgumentError,
          "Cannot accept content given as #{content.class}"
      end
      content
    end

  # check input on digests
  @digests =
    case digests
    when nil then {} # empty hash
    when URI::NI then { digests.algorithm => digests.dup.freeze }
    when Hash
      # hash must be clean
      digests.to_h do |name, ni|
        unless name.respond_to? :to_sym
          raise ArgumentError, 'Digest keys must be symbol-able'
        end
        algo = name.to_sym
        unless ni.is_a? URI::NI
          raise ArgumentError, 'Digest values must be URI::NI'
        end
        unless algo == ni.algorithm
          raise ArgumentError, 'Digest key must match value algorithm'
        end
        [algo, ni.dup.freeze]
      end
    when Array
      # only accepts array of URI::NI
      digests.to_h do |ni|
        unless ni.is_a? URI::NI
          raise ArgumentError,
            "Digests given as array can only be URI::NI, not #{ni}"
        end
        [ni.algorithm, ni.dup.freeze]
      end
    else
      # everything else is invalid
      raise ArgumentError,
        "Cannot coerce digests given as #{digests.inspect}"
    end

  # ctime, mtime, ptime, dtime should be all nil or nonnegative
  # integers or Time or DateTime
  { ctime: ctime, mtime: mtime, ptime: ptime, dtime: dtime }.each do |field, value|
    instance_variable_set "@#{field}", coerce_time(value, field)
  end

  # set the flags
  @flags = Flags.from(flags || 0)

  @size =
    if size.nil?
      0
    elsif size.is_a? Numeric
      raise ArgumentError, 'size must be non-negative' if size < 0
      size.to_i
    else
      raise ArgumentError, 'size must be nil or Numeric'
    end

  # the following can be strings or symbols:
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  TOKENS.each_key do |field|
    raw = given[field]
    next unless raw

    token =
      begin
        coerce_token(raw, field)
      rescue StandardError
        # only strict mode lets a bad token escape
        raise if strict
        nil
      end
    instance_variable_set "@#{field}", token.freeze if token
  end
end
324
-
325
# XXX come up with a policy for these that isn't stupid, plus input sanitation
attr_reader :digests, :size, :flags
attr_accessor :type, :charset, :language, :encoding,
  :ctime, :mtime, :ptime, :dtime

# Replace the bank of flags. The input is coerced the same way the
# constructor does it, so the stored value is always a Flags struct
# (a raw Integer or nil here would break #cache?).
#
# @param value [Store::Digest::Object::Flags, Integer, #to_h, #to_a, nil]
# @return [Store::Digest::Object::Flags]
def flags= value
  @flags = Flags.from(value || 0)
end
329
-
330
# Create an empty object and scan the given content into it. Every
# argument, and the block if any, is passed to {#scan} as given.
#
# @return [Store::Digest::Object] the freshly scanned object
#
def self.scan content, digests: URI::NI.algorithms, mtime: nil,
    type: nil, language: nil, charset: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: false, &block
  options = {
    digests: digests, mtime: mtime, type: type, language: language,
    charset: charset, encoding: encoding, blocksize: blocksize,
    strict: strict, fresh: fresh,
  }
  new.scan(content, **options, &block)
end
338
-
339
# Read the content through once, recording its size, digests and
# detected type on this object.
#
# @param content [IO, String, Pathname, Proc, nil] what to scan; nil
#   means the content already held by the object. A Proc is called
#   and its result scanned. Anything else must respond to read, seek
#   and pos.
# @param digests [Array<Symbol>, Symbol] the algorithms to compute
# @param mtime [Time, Integer, #to_time, nil] modification time; falls
#   back to the current value, then the content's own mtime, then now
# @param type [#to_s, nil] asserted type; the detected type wins only
#   when it is a descendant of this
# @param charset [#to_s, nil] the character set, if applicable
# @param language [#to_s, nil] the language tag, if applicable
# @param encoding [#to_s, nil] the content-encoding, if applicable
# @param blocksize [Integer] how much to read at a time
# @param strict [true, false] raise on a malformed type, charset,
#   encoding or language instead of ignoring it
# @param fresh [true, false, nil] update the freshness unless nil
#
# @yieldparam buf [String] each chunk as it is read
#
# @raise [ArgumentError] if there is nothing scannable, or the digest
#   list is not made of known algorithms
#
# @return [self]
#
def scan content = nil, digests: URI::NI.algorithms, mtime: nil,
    type: nil, charset: nil, language: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: nil, &block
  # update freshness if there is something to update
  @fresh = !!fresh unless fresh.nil?

  # we put all the scanning stuff in here
  content =
    case content
    when nil then self.content
    when IO, StringIO then content
    when String then StringIO.new content
    when Pathname then content.open('rb')
    when Proc then content.call
    when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
      content
    else
      raise ArgumentError,
        "Cannot scan content of type #{content.class}"
    end

  # bail out before any metadata is touched rather than dying on
  # nil.read further down
  raise ArgumentError, 'No content to scan' if content.nil?

  content.binmode if content.respond_to? :binmode

  # sane default for mtime
  @mtime = coerce_time(mtime || @mtime ||
    (content.respond_to?(:mtime) ? content.mtime : Time.now(in: ?Z)), :mtime)

  # eh, *some* code reuse
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  TOKENS.each_key do |field|
    raw = given[field]
    next unless raw

    token =
      begin
        coerce_token(raw, field)
      rescue StandardError
        # only strict mode lets a bad token escape
        raise if strict
        nil
      end
    instance_variable_set "@#{field}", token.freeze if token
  end

  digests =
    case digests
    when Array then digests
    when Symbol then [digests]
    else
      raise ArgumentError, 'Digests must be one or more symbol'
    end
  unknown = digests - URI::NI.algorithms
  raise ArgumentError, "Invalid digest list #{unknown}" unless unknown.empty?

  # set up the contexts
  contexts = digests.to_h { |d| [d, URI::NI.context(d)] }

  # sample for mime type checking
  sample = StringIO.new ''
  @size = 0
  while buf = content.read(blocksize)
    @size += buf.size
    # stop growing the sample once it has reached SAMPLE
    sample << buf if sample.pos < SAMPLE
    contexts.each_value { |ctx| ctx << buf }
    block.call buf if block_given?
  end

  # seek the content back to the front and store it
  content.seek 0, 0
  @content = content

  # set up the digests
  @digests = contexts.to_h do |algo, ctx|
    [algo, URI::NI.compute(ctx, algorithm: algo).freeze]
  end.freeze

  # ensure there is the most generic of possible types
  type ||= 'application/octet-stream'.freeze

  # obtain the sampled content type
  ts = MimeMagic.by_magic(sample) || MimeMagic.default_type(sample)
  if content.respond_to? :path
    # may as well use the path if it's available and more specific
    ps = MimeMagic.by_path(content.path.to_s)
    # XXX the need to do ts.to_s is a bug in mimemagic
    ts = ps if ps && ps.descendant_of?(ts.to_s)
  end

  # set the type to ts if it is more specific
  @type = ts.descendant_of?(type.to_s) ? ts.to_s.freeze :
    type.to_s.dup.downcase.freeze

  self
end
426
-
427
# Whether the object is "fresh", i.e. new (or restored) as opposed to
# having previously been in the store.
#
# @return [true, false]
#
def fresh?
  @fresh ? true : false
end
435
-
436
# Set the freshness; the value is stored as a strict boolean.
#
# @param state [Object] anything; only its truthiness is kept
def fresh= state
  @fresh = state ? true : false
end
439
-
440
# The digest algorithms recorded on the object, sorted.
# @return [Array] empty when no digests are set
def algorithms
  return [] unless @digests

  @digests.keys.sort
end
445
-
446
# Fetch the digest recorded for one algorithm.
# @param symbol [Symbol, #to_s, #to_sym] the algorithm
# @return [Object, nil] the stored digest, or nil if there is no match
# @raise [ArgumentError] if the argument cannot be turned into a symbol
def digest symbol
  unless symbol.respond_to? :to_sym
    raise ArgumentError, "This method takes a symbol"
  end

  digests[symbol.to_sym]
end
454
-
455
# obj[:algorithm] is shorthand for obj.digest(:algorithm)
alias_method :[], :digest
456
-
457
# The content held by the object. When it is stored as a Proc, the
# Proc is called and its result returned.
# @return [IO]
def content
  return @content.call if @content.is_a? Proc

  @content
end
462
-
463
# Whether any content is attached to the object.
# @return [false, true]
def content?
  @content ? true : false
end
468
-
469
# The type, followed by a charset parameter when a charset is set;
# suitable for an HTTP header.
# @return [String]
def type_charset
  header = type.to_s
  return header unless charset

  header + ";charset=#{charset}"
end
476
-
477
# Whether the object has been scanned, judged by it having at least
# one digest.
# @return [false, true]
def scanned?
  @digests.empty? ? false : true
end
482
-
483
# Whether the cache flag is set on the object.
#
# @return [false, true]
#
def cache?
  @flags.cache ? true : false
end
490
-
491
- # XXX i'm keeping these as-is for now
492
-
493
# Whether the content type has been checked.
# @return [false, true]
def type_checked?
  !(@flags.to_i & TYPE_CHECKED).zero?
end
498
-
499
# Returns true if the content type has been checked _and_ is valid.
# @return [false, true]
def type_valid?
  # both bits have to be set; testing for a non-zero intersection
  # would also accept "checked but invalid"
  required = TYPE_CHECKED | TYPE_VALID
  (@flags.to_i & required) == required
end
504
-
505
# Whether the character set has been checked.
# @return [false, true]
def charset_checked?
  !(@flags.to_i & CHARSET_CHECKED).zero?
end
510
-
511
# Returns true if the character set has been checked _and_ is valid.
# @return [false, true]
def charset_valid?
  # both bits have to be set; testing for a non-zero intersection
  # would also accept "checked but invalid"
  required = CHARSET_CHECKED | CHARSET_VALID
  (@flags.to_i & required) == required
end
516
-
517
# Whether the content encoding (e.g. gzip, deflate) has been
# checked.
# @return [false, true]
def encoding_checked?
  !(@flags.to_i & ENCODING_CHECKED).zero?
end
523
-
524
# Returns true if the content encoding has been checked _and_ is valid.
# @return [false, true]
def encoding_valid?
  # both bits have to be set; testing for a non-zero intersection
  # would also accept "checked but invalid"
  required = ENCODING_CHECKED | ENCODING_VALID
  (@flags.to_i & required) == required
end
529
-
530
# Whether the blob's syntax has been checked.
# @return [false, true]
def syntax_checked?
  !(@flags.to_i & SYNTAX_CHECKED).zero?
end
535
-
536
# Returns true if the blob's syntax has been checked _and_ is valid.
# @return [false, true]
def syntax_valid?
  # both bits have to be set; testing for a non-zero intersection
  # would also accept "checked but invalid"
  required = SYNTAX_CHECKED | SYNTAX_VALID
  (@flags.to_i & required) == required
end
541
-
542
# Writers for the four timestamps. Each runs its argument through
# coerce_time and freezes the result before storing it.
%i[ctime mtime ptime dtime].each do |field|
  ivar = "@#{field}"

  define_method("#{field}=") do |value|
    instance_variable_set ivar, coerce_time(value, field).freeze
  end
end
547
-
548
# For each token-valued field: a writer that validates and normalizes
# through coerce_token (freezing the result), and a `<field>_ok?`
# predicate that only tests a value against the field's pattern.
%i[type charset encoding language].each do |field|
  ivar = "@#{field}"

  define_method("#{field}=") do |value|
    instance_variable_set ivar, coerce_token(value, field).freeze
  end

  define_method("#{field}_ok?") do |value|
    pattern = TOKENS[field].first
    pattern.match? value
  end
end
557
-
558
# Whether the blob has been deleted from the store (implicitly, the
# metadata record remains). True once a deletion time is set.
# @return [false, true]
def deleted?
  @dtime ? true : false
end
565
-
566
# The object as a hash of digests, the mandatory and optional fields,
# and the flags. Each value is a dup of what the reader returns.
# @param content [false, true] include the content if true
# @return [Hash] the object as a hash
def to_h content: false
  fields = content ? %i[content digests] : %i[digests]
  fields += MANDATORY + OPTIONAL + [:flags]

  fields.to_h { |field| [field, send(field).dup] }
end
576
-
577
- # Outputs a human-readable string representation of the object.
578
- def to_s
579
- out = "#{self.class}\n Digests:\n"
580
-
581
- # disgorge the digests
582
- digests.values.sort { |a, b| a.to_s <=> b.to_s }.each do |d|
583
- out << " #{d}\n"
584
- end
585
-
586
- # now the fields
587
- MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
588
- OPTIONAL.each do |o|
589
- val = send o
590
- out << " #{LABELS[o]}: #{val}\n" if val
591
- end
592
-
593
- # now the validation statuses
594
- out << "Validation:\n"
595
- FLAG.each_index do |i|
596
- x = flags.to_i >> (3 - i) & 3
597
- out << (" %-16s: %s\n" % [FLAG[i], STATE[x]])
598
- end
599
-
600
- out
601
- end
602
- end