store-digest 0.3.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,623 +0,0 @@
1
- require 'store/digest/version'
2
-
3
- require 'forwardable'
4
- require 'uri'
5
- require 'uri/ni'
6
- require 'mimemagic'
7
-
8
# Back-fills a handful of helper methods onto MimeMagic. Every
# definition is guarded, so a method that already exists on the class
# is left untouched.
class MimeMagic
  # XXX erase this when these methods get added
  unless singleton_class.method_defined? :parents
    # Look up the parent types recorded for a type in TYPES.
    #
    # @param type [#to_s] the MIME type to look up
    #
    # @return [Array<MimeMagic>] the parents (empty if there are none)
    #
    def self.parents type
      TYPES.fetch(type.to_s, [nil, []])[1].map { |t| new t }.uniq
    end
  end

  unless method_defined? :parents
    # Parent types of this type. Falls back to
    # `application/octet-stream` when none are recorded, unless this
    # type is itself `application/octet-stream`.
    #
    # @return [Array<MimeMagic>]
    #
    def parents
      out = TYPES.fetch(type.to_s.downcase, [nil, []])[1].map do |x|
        self.class.new x
      end
      # add this unless we're it
      if out.empty? && type.downcase != 'application/octet-stream'
        out << self.class.new('application/octet-stream')
      end

      out.uniq
    end
  end

  unless method_defined? :lineage
    # This type followed by all of its ancestors, without duplicates.
    #
    # @return [Array<MimeMagic>]
    #
    def lineage
      ([self] + parents.map { |t| t.lineage }.flatten).uniq
    end
  end

  unless method_defined? :descendant_of?
    # Test whether the given type appears in this type's lineage.
    #
    # @param type [#to_s] the candidate ancestor
    #
    # @return [true, false]
    #
    def descendant_of? type
      lineage.map(&:type).include? type.to_s.downcase
    end
  end

  unless singleton_class.method_defined? :binary?
    # Guess whether something is binary by looking for control
    # characters in its first 1024 bytes/characters.
    #
    # @param thing [#tell, #seek, #read, #to_s] an IO-like object
    #  (its position is restored afterwards) or anything stringable
    #
    # @raise [ArgumentError] if the argument cannot be sampled
    #
    # @return [true, false] also true when the sample is empty
    #
    def self.binary? thing
      sample = nil

      # get some stuff out of the IO or get a substring
      if %i[tell seek read].all? { |m| thing.respond_to? m }
        pos = thing.tell
        thing.seek 0, 0
        sample = thing.read 1024
        thing.seek pos
      elsif thing.respond_to? :to_s
        sample = thing.to_s[0, 1024]
      else
        # the class name is now actually interpolated into the message
        raise ArgumentError, "Cannot sample an instance of #{thing.class}"
      end

      # consider this to be 'binary' if empty
      return true if sample.nil? || sample.empty?

      # control codes minus ordinary whitespace
      /[\x0-\x8\xe-\x1f\x7f]/n.match?(sample)
    end
  end

  unless singleton_class.method_defined? :default_type
    # The fallback type for content with no more specific match:
    # `application/octet-stream` if it looks binary, else `text/plain`.
    #
    # @param thing [#tell, #seek, #read, #to_s] see {.binary?}
    #
    # @return [MimeMagic]
    #
    def self.default_type thing
      new self.binary?(thing) ? 'application/octet-stream' : 'text/plain'
    end
  end
end
70
-
71
- # Store entry object class.
72
- #
73
- class Store::Digest::Object
74
-
75
# A thin read-only proxy around an IO that also remembers which store
# object the IO belongs to.
#
class IOWrapper
  extend Forwardable

  # @return [Object] the store object this IO was obtained from
  attr_reader :object

  # any others??
  %i[gets read each seek pos rewind].each do |meth|
    def_delegator :@io, meth
  end

  # @param object [Object] the owning store object
  # @param io [#read] the IO being wrapped
  def initialize object, io
    @object, @io = object, io
  end
end
91
-
92
# This is a struct for the bank of flags, with a couple of extra
# methods for parsing. Member order corresponds to bit order, with
# the first member being the least significant bit.
#
Flags = Struct.new(
  'Flags',
  :type_checked, :type_valid, :charset_checked, :charset_valid,
  :encoding_checked, :encoding_valid, :syntax_checked, :syntax_valid, :cache
) do

  # Initialize a struct of flags from arbitrary input. Every member
  # of the result is strictly `true` or `false`; members the input
  # does not mention are `false`.
  #
  # @param arg [Store::Digest::Object::Flags, Integer, Hash, #to_a]
  #
  # @raise [ArgumentError] if the input is a negative integer or
  #  cannot be interpreted at all
  #
  # @return [Store::Digest::Object::Flags]
  #
  def self.from arg
    # an existing instance is returned as-is
    return arg if arg.is_a? self

    # get the length since we use it in a few places
    len = members.size

    bits =
      case arg
      when Integer
        # Integer#digits rejects negatives; say why up front
        raise ArgumentError, 'Integer flags must be non-negative' if
          arg.negative?
        arg.digits(2).first(len)
      when Hash
        # read the members positionally so this does not depend on
        # keyword initialization of the struct, and so that absent
        # keys come out false like in every other branch
        members.map { |m| arg[m] }
      else
        raise ArgumentError, 'Input must be an integer or array' unless
          arg.respond_to? :to_a
        arg.to_a.first(len)
      end

    # append these
    bits += [false] * (len - bits.size) if bits.size < len

    # make sure these are true/false (0 counts as false)
    self[*bits.map { |b| !!(b && b != 0) }]
  end

  # Turn an arbitrary {Array} back into an {Integer}. Elements that
  # are `nil`, `false` or `0` count as unset bits, matching {.from}.
  #
  # @param array [Array]
  #
  # @return [Integer]
  #
  def self.to_i array
    array.to_a.reverse.reduce(0) do |acc, b|
      (acc << 1) | (b && b != 0 ? 1 : 0)
    end
  end

  # The flags as an integer bit field.
  #
  # @return [Integer]
  #
  def to_i
    self.class.to_i to_a
  end
end
154
-
155
- private
156
-
157
SAMPLE    = 2**13 # must be big enough to detect ooxml
BLOCKSIZE = 2**16

# charset spellings mapped to the spelling we store
CHARSETS = [
  %w[utf8 utf-8],
  %w[iso8859-1 iso-8859-1],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# content-encoding spellings mapped to the spelling we store
ENCODINGS = [
  %w[x-compress compress],
  %w[x-gzip gzip],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# character class for a single token, as regexp source
TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'

# { key: [pattern, normalizer] } - assumes stripped and downcased
#
# The patterns are anchored with \A and \z rather than ^ and $, so
# that they have to match the entire input and not just one line of it.
TOKENS = {
  type:     [/\A(#{TOKEN}(?:\/#{TOKEN})?)\z/on, -> c { c.downcase }],
  charset:  [/\A(#{TOKEN})\z/on,
    -> c { c = c.tr(?_, ?-).downcase; CHARSETS.fetch c, c } ],
  encoding: [/\A(#{TOKEN})\z/on,
    -> c { c = c.tr(?_, ?-).downcase; ENCODINGS.fetch c, c } ],
  language: [/\A([a-z]{2,3}(?:[-_][0-9a-z]+)*)\z/,
    -> c { c.downcase.tr(?_, ?-).gsub(/-*$/, '') } ],
}.freeze

# flag constants
TYPE_CHECKED     = 1 << 0
TYPE_VALID       = 1 << 1
CHARSET_CHECKED  = 1 << 2
CHARSET_VALID    = 1 << 3
ENCODING_CHECKED = 1 << 4
ENCODING_VALID   = 1 << 5
SYNTAX_CHECKED   = 1 << 6
SYNTAX_VALID     = 1 << 7
IS_CACHE         = 1 << 8

# human-readable labels for the metadata fields
LABELS = {
  size:     'Size (Bytes)',
  ctime:    'Added to Store',
  mtime:    'Last Modified',
  ptime:    'Properties Modified',
  dtime:    'Deleted (Expires)',
  type:     'Content Type',
  language: '(Natural) Language',
  charset:  'Character Set',
  encoding: 'Content Encoding',
}.freeze

MANDATORY = %i[size ctime mtime ptime].freeze
OPTIONAL  = %i[dtime type language charset encoding].freeze
FLAG      = %i[content-type charset content-encoding syntax].freeze
STATE     = %i[unverified invalid recheck valid].freeze
210
-
211
# Coerce a timestamp-ish value into a Time.
#
# @param t [nil, Time, #to_time, Integer] the value to coerce
# @param k [Symbol] the field name, used in error messages
#
# @raise [ArgumentError] on a negative integer or an unsupported type
#
# @return [Time, nil] nil is passed through untouched
#
def coerce_time t, k
  # nil and Time need no work
  return t if t.nil? || t.is_a?(Time)
  # anything that knows how to become a Time
  return t.to_time if t.respond_to? :to_time

  unless t.is_a? Integer
    raise ArgumentError, "Invalid type for #{k}: #{t.class}"
  end

  # integers are handed to Time.at
  raise ArgumentError,
    "#{k} given as Integer must be non-negative" if t < 0
  Time.at t
end
225
-
226
# Validate and normalize a token (type, charset, encoding, language)
# using the pattern/normalizer pair registered under `k` in TOKENS.
#
# @param t [#to_s] the raw token
# @param k [Symbol] the key into TOKENS
#
# @raise [RuntimeError] if the stripped, downcased token does not
#  match the pattern
#
# @return [String] the normalized token
#
def coerce_token t, k
  pattern, normalizer = TOKENS[k]
  token = t.to_s.strip.downcase

  matched = pattern.match token
  raise "#{k} #{token} does not match #{pattern}" unless matched

  normalizer.call matched[1]
end
232
-
233
- public
234
-
235
# Create a new object, naively recording whatever is handed
#
# @note use {.scan} or {#scan} to populate
#
# @param content [IO, String, Proc, File, Pathname, ...] some content
# @param digests [Hash, Array, URI::NI, nil] the digests ascribed to
#  the content
# @param size [Integer] assert the object's size
# @param type [String] assert the object's MIME type
# @param charset [String] the character set, if applicable
# @param language [String] the (RFC5646) language tag, if applicable
# @param encoding [String] the content-encoding (e.g. compression)
# @param ctime [Time] assert object creation time
# @param mtime [Time] assert object modification time
# @param ptime [Time] assert object metadata parameter modification time
# @param dtime [Time] assert object deletion time
# @param flags [Integer] validation state flags
# @param strict [true, false] raise an error on bad input
# @param fresh [true, false] assert "freshness" of object vis-a-vis the store
#
# @raise [ArgumentError] on content, digests, times or size that
#  cannot be coerced
#
# @return [Store::Digest::Object] the object in question
#
def initialize content = nil, digests: {}, size: 0,
    type: 'application/octet-stream', charset: nil, language: nil,
    encoding: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil,
    flags: 0, strict: true, fresh: false

  # snag this immediately
  @fresh = !!fresh

  # content is kept as-is, except that strings are wrapped in a
  # StringIO and pathnames are deferred behind a lambda that opens
  # them in binary mode
  @content =
    case content
    when nil then nil
    when IO, StringIO, Proc then content
    when String then StringIO.new content
    when Pathname then -> { content.expand_path.open('rb') }
    when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
      content
    else
      raise ArgumentError,
        "Cannot accept content given as #{content.class}"
    end

  # digests end up as { algorithm => frozen copy of the URI::NI }
  @digests =
    case digests
    when Hash
      # hash must be clean
      digests.each_with_object({}) do |(key, ni), acc|
        raise ArgumentError, 'Digest keys must be symbol-able' unless
          key.respond_to? :to_sym
        key = key.to_sym
        raise ArgumentError, 'Digest values must be URI::NI' unless
          ni.is_a? URI::NI
        raise ArgumentError, 'Digest key must match value algorithm' unless
          key == ni.algorithm
        acc[key] = ni.dup.freeze
      end
    when nil then {} # empty hash
    when Array
      # only accepts array of URI::NI
      digests.each_with_object({}) do |ni, acc|
        raise ArgumentError,
          "Digests given as array can only be URI::NI, not #{ni}" unless
          ni.is_a? URI::NI
        acc[ni.algorithm] = ni.dup.freeze
      end
    when URI::NI then { digests.algorithm => digests.dup.freeze }
    else
      # everything else is invalid
      raise ArgumentError,
        "Cannot coerce digests given as #{digests.inspect}"
    end

  # ctime, mtime, ptime, dtime should be all nil or nonnegative
  # integers or Time or DateTime
  { ctime: ctime, mtime: mtime, ptime: ptime, dtime: dtime }.each do |k, v|
    instance_variable_set "@#{k}", coerce_time(v, k)
  end

  # set the flags
  @flags = Flags.from(flags || 0)

  @size =
    if size.nil?
      0
    elsif size.is_a? Numeric
      raise ArgumentError, 'size must be non-negative' if size < 0
      size.to_i
    else
      raise ArgumentError, 'size must be nil or Numeric'
    end

  # the following can be strings or symbols; a bad token raises when
  # strict, and is silently dropped otherwise
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  TOKENS.keys.each do |k|
    raw = given[k]
    next unless raw

    token =
      begin
        coerce_token raw, k
      rescue StandardError
        raise if strict
        nil
      end
    instance_variable_set "@#{k}", token.freeze if token
  end
end
342
-
343
- # XXX come up with a policy for these that isn't stupid, plus input sanitation
344
- attr_reader :digests, :size
345
- attr_accessor :type, :charset, :language, :encoding,
346
- :ctime, :mtime, :ptime, :dtime, :flags
347
-
348
# Create an empty object and immediately {#scan} the content into it.
# All arguments, and the block if any, are forwarded to {#scan}.
#
# @return [Store::Digest::Object] the result of {#scan}
#
def self.scan content, digests: URI::NI.algorithms, mtime: nil,
    type: nil, language: nil, charset: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: false, &block
  options = {
    digests: digests, mtime: mtime, type: type, language: language,
    charset: charset, encoding: encoding, blocksize: blocksize,
    strict: strict, fresh: fresh,
  }

  new.scan(content, **options, &block)
end
356
-
357
# Read the content through to the end, computing its size and digests
# and working out its content type, and embed the content in the object.
#
# @param content [nil, IO, StringIO, String, Pathname, Proc, #read]
#  the content to scan; when nil, the content already embedded in
#  the object is scanned
# @param digests [Array<Symbol>, Symbol] the digest algorithms to compute
# @param mtime [Time, Integer, nil] modification time; defaults to the
#  existing value, then the content's own `mtime` if it has one, then
#  the current time
# @param type [String, nil] asserted content type
# @param charset [String, nil] asserted character set
# @param language [String, nil] asserted language tag
# @param encoding [String, nil] asserted content-encoding
# @param blocksize [Integer] how much to read at a time
# @param strict [true, false] raise on a malformed token rather than
#  ignoring it
# @param fresh [true, false, nil] update freshness unless nil
#
# @yieldparam buf [String] each chunk as it is read
#
# @raise [ArgumentError] if there is nothing to scan, or if the
#  content or digest list is unusable
#
# @return [Store::Digest::Object] self
#
def scan content = nil, digests: URI::NI.algorithms, mtime: nil,
    type: nil, charset: nil, language: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: nil, &block
  # update freshness if there is something to update
  @fresh = !!fresh unless fresh.nil?

  # we put all the scanning stuff in here
  content =
    case content
    when nil
      # take the embedded IO itself rather than the proxy that
      # #content hands out: the proxy hides #path, #mtime and
      # #binmode, and storing it back below would stack one proxy on
      # another with every rescan
      @content.is_a?(Proc) ? @content.call : @content
    when IO, StringIO then content
    when String then StringIO.new content
    when Pathname then content.open('rb')
    when Proc then content.call
    when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
      content
    else
      raise ArgumentError,
        "Cannot scan content of type #{content.class}"
    end

  # fail before touching any state rather than on nil.read later
  raise ArgumentError, 'There is no content to scan' unless content

  content.binmode if content.respond_to? :binmode

  # sane default for mtime
  @mtime = coerce_time(mtime || @mtime ||
    (content.respond_to?(:mtime) ? content.mtime : Time.now(in: ?Z)), :mtime)

  # eh, *some* code reuse: a bad token raises when strict and is
  # silently dropped otherwise
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  TOKENS.keys.each do |k|
    raw = given[k]
    next unless raw

    token =
      begin
        coerce_token raw, k
      rescue StandardError
        raise if strict
        nil
      end
    instance_variable_set "@#{k}", token.freeze if token
  end

  digests =
    case digests
    when Array then digests
    when Symbol then [digests]
    else
      raise ArgumentError, 'Digests must be one or more symbol'
    end
  unknown = digests - URI::NI.algorithms
  raise ArgumentError, "Invalid digest list #{unknown}" unless unknown.empty?

  # set up the contexts
  digests = digests.map { |d| [d, URI::NI.context(d)] }.to_h

  # sample for mime type checking
  sample = StringIO.new ''
  @size  = 0
  while buf = content.read(blocksize)
    @size += buf.size
    sample << buf if sample.pos < SAMPLE
    digests.values.each { |ctx| ctx << buf }
    block.call buf if block_given?
  end

  # seek the content back to the front and store it
  content.seek 0, 0
  @content = content

  # set up the digests
  @digests = digests.map do |k, v|
    [k, URI::NI.compute(v, algorithm: k).freeze]
  end.to_h.freeze

  # ensure there is the most generic of possible types
  type ||= 'application/octet-stream'.freeze

  # obtain the sampled content type
  ts = MimeMagic.by_magic(sample) || MimeMagic.default_type(sample)
  if content.respond_to? :path
    # may as well use the path if it's available and more specific
    ps = MimeMagic.by_path(content.path.to_s)
    # XXX the need to do ts.to_s is a bug in mimemagic
    ts = ps if ps && ps.descendant_of?(ts.to_s)
  end

  # set the type to ts if it is more specific
  @type = ts.descendant_of?(type.to_s) ? ts.to_s.freeze :
    type.to_s.dup.downcase.freeze

  self
end
444
-
445
# Tell whether the object is "fresh", i.e. whether it is new (or
# restored) as opposed to having previously been in the store.
#
# @return [true, false]
#
def fresh?
  @fresh ? true : false
end
453
-
454
# Set the freshness flag; any value is reduced to true or false.
#
# @param state [Object] truthy for fresh, falsy for not
#
def fresh= state
  @fresh = state ? true : false
end
457
-
458
# List the digest algorithms recorded in the object, sorted.
# @return [Array] empty if there are no digests
def algorithms
  return [] unless @digests
  @digests.keys.sort
end
463
-
464
# Look up one digest by algorithm. Returns nil if there is no match.
# @param symbol [Symbol, #to_sym] the digest algorithm
# @raise [ArgumentError] if the argument has no #to_sym
# @return [Object, nil] the entry stored in {#digests} for that
#  algorithm, if any
def digest symbol
  unless symbol.respond_to? :to_sym
    raise ArgumentError, "This method takes a symbol"
  end

  digests[symbol.to_sym]
end
472
-
473
- alias_method :"[]", :digest
474
-
475
# Hand out the embedded content wrapped in an {IOWrapper} that points
# back at this object. Content stored as a Proc is called first.
#
# @return [IOWrapper, nil, false] the wrapped IO, or the falsy value
#  itself when there is no content
#
def content
  raw = @content
  raw = raw.call if raw.is_a? Proc
  return raw unless raw

  IOWrapper.new self, raw
end
483
-
484
# Tell whether any content is embedded in the object.
# @return [false, true]
def content?
  @content ? true : false
end
489
-
490
# The content type, with `;charset=...` appended when a charset is
# set; suitable for an HTTP header.
# @return [String]
def type_charset
  base = type.to_s
  return base unless charset

  "#{base};charset=#{charset}"
end
497
-
498
# Tell whether the object has been scanned, which is taken to mean
# that it has at least one digest.
# @return [false, true]
def scanned?
  @digests.empty? ? false : true
end
503
-
504
# Tell whether the object is flagged as cache.
#
# @return [false, true]
#
def cache?
  @flags.cache ? true : false
end
511
-
512
- # XXX i'm keeping these as-is for now
513
-
514
# Tell whether the content type has been checked.
# @return [false, true]
def type_checked?
  !(@flags.to_i & TYPE_CHECKED).zero?
end
519
-
520
# Returns true if the content type has been checked _and_ is valid.
# Both bits have to be set; either one alone is not enough.
# @return [false, true]
def type_valid?
  mask = TYPE_CHECKED | TYPE_VALID
  @flags.to_i & mask == mask
end
525
-
526
# Tell whether the character set has been checked.
# @return [false, true]
def charset_checked?
  !(@flags.to_i & CHARSET_CHECKED).zero?
end
531
-
532
# Returns true if the character set has been checked _and_ is valid.
# Both bits have to be set; either one alone is not enough.
# @return [false, true]
def charset_valid?
  mask = CHARSET_CHECKED | CHARSET_VALID
  @flags.to_i & mask == mask
end
537
-
538
# Tell whether the content encoding (e.g. gzip, deflate) has been
# checked.
# @return [false, true]
def encoding_checked?
  !(@flags.to_i & ENCODING_CHECKED).zero?
end
544
-
545
# Returns true if the content encoding has been checked _and_ is valid.
# Both bits have to be set; either one alone is not enough.
# @return [false, true]
def encoding_valid?
  mask = ENCODING_CHECKED | ENCODING_VALID
  @flags.to_i & mask == mask
end
550
-
551
# Tell whether the blob's syntax has been checked.
# @return [false, true]
def syntax_checked?
  !(@flags.to_i & SYNTAX_CHECKED).zero?
end
556
-
557
# Returns true if the blob's syntax has been checked _and_ is valid.
# Both bits have to be set; either one alone is not enough.
# @return [false, true]
def syntax_valid?
  mask = SYNTAX_CHECKED | SYNTAX_VALID
  @flags.to_i & mask == mask
end
562
-
563
# Generate ctime=, mtime=, ptime= and dtime=. Each one runs its
# argument through coerce_time and stores the frozen result.
%i[ctime mtime ptime dtime].each do |field|
  ivar = "@#{field}"

  define_method("#{field}=") do |value|
    instance_variable_set ivar, coerce_time(value, field).freeze
  end
end
568
-
569
# Generate, for each of type, charset, encoding and language:
#
# * a setter that runs its argument through coerce_token and stores
#   the frozen result;
# * a `<field>_ok?` predicate that tests a value against the field's
#   pattern in TOKENS.
%i[type charset encoding language].each do |field|
  ivar = "@#{field}"

  define_method("#{field}=") do |value|
    instance_variable_set ivar, coerce_token(value, field).freeze
  end

  define_method("#{field}_ok?") do |value|
    pattern = TOKENS[field].first
    pattern.match? value
  end
end
578
-
579
# Tell whether the blob has been deleted from the store (but
# implicitly the metadata record remains). This is simply whether a
# deletion time is set.
# @return [false, true]
def deleted?
  @dtime ? true : false
end
586
-
587
# Return the object as a hash of (duplicated) field values: digests,
# the mandatory and optional fields, and the flags. Omits the content
# by default.
# @param content [false, true] include the content if true
# @return [Hash] the object as a hash
def to_h content: false
  keys = content ? %i[content digests] : %i[digests]
  keys += MANDATORY + OPTIONAL + [:flags]

  keys.each_with_object({}) do |key, hash|
    hash[key] = send(key).dup
  end
end
597
-
598
- # Outputs a human-readable string representation of the object.
599
- def to_s
600
- out = "#{self.class}\n Digests:\n"
601
-
602
- # disgorge the digests
603
- digests.values.sort { |a, b| a.to_s <=> b.to_s }.each do |d|
604
- out << " #{d}\n"
605
- end
606
-
607
- # now the fields
608
- MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
609
- OPTIONAL.each do |o|
610
- val = send o
611
- out << " #{LABELS[o]}: #{val}\n" if val
612
- end
613
-
614
- # now the validation statuses
615
- out << "Validation:\n"
616
- FLAG.each_index do |i|
617
- x = flags.to_i >> (3 - i) & 3
618
- out << (" %-16s: %s\n" % [FLAG[i], STATE[x]])
619
- end
620
-
621
- out
622
- end
623
- end