store-digest 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +202 -0
- data/README.md +231 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/store/digest.rb +282 -0
- data/lib/store/digest/blob.rb +7 -0
- data/lib/store/digest/blob/filesystem.rb +146 -0
- data/lib/store/digest/driver.rb +14 -0
- data/lib/store/digest/driver/lmdb.rb +15 -0
- data/lib/store/digest/meta.rb +7 -0
- data/lib/store/digest/meta/lmdb.rb +621 -0
- data/lib/store/digest/object.rb +497 -0
- data/lib/store/digest/trait.rb +32 -0
- data/lib/store/digest/version.rb +5 -0
- data/store-digest.gemspec +39 -0
- metadata +161 -0
@@ -0,0 +1,497 @@
|
|
1
|
+
require 'store/digest/version'
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
require 'uri/ni'
|
5
|
+
require 'mimemagic'
|
6
|
+
require 'mimemagic/overlay'
|
7
|
+
|
8
|
+
# Stop-gap class-level helpers for MimeMagic. Each one is defined only when
# MimeMagic does not already carry a singleton method of the same name.
class MimeMagic
  # XXX erase this when these methods get added

  # The guards consult `singleton_methods(false)` because the helpers are
  # class-level methods; `method_defined?` only inspects instance methods
  # and therefore could never notice an existing implementation.

  unless singleton_methods(false).include? :parents
    # Look up the parent types recorded for +type+ in TYPES.
    #
    # @param type [String] key into TYPES
    # @return [Array<MimeMagic>] de-duplicated parents; empty when the
    #  type is not in TYPES
    def self.parents type
      TYPES.fetch(type, [nil, []])[1].map { |t| new t }.uniq
    end
  end

  unless singleton_methods(false).include? :ancestor_types
    # Collect the ancestors of every parent of +type+.
    #
    # NOTE(review): this calls `ancestors(t)` with an argument; unless
    # something else defines a one-argument `MimeMagic.ancestors`, that
    # resolves to Module#ancestors, which takes none - confirm intent.
    #
    # @param type [String] key into TYPES
    # @return [Array] flattened, de-duplicated result
    def self.ancestor_types type
      parents(type).map { |t| ancestors(t) }.flatten.uniq
    end
  end

  unless singleton_methods(false).include? :binary?
    # Guess whether +thing+ is binary by looking for control characters
    # in its first 1024 bytes/characters.
    #
    # @param thing [#read, #to_s] an IO-like object (must also respond to
    #  +tell+ and +seek+) or anything that can be turned into a string
    # @return [true, false] true for empty input or when a control
    #  character other than ordinary whitespace is found
    # @raise [ArgumentError] if +thing+ can be neither read nor stringified
    def self.binary? thing
      # get some stuff out of the IO or get a substring
      sample =
        if %i[tell seek read].all? { |m| thing.respond_to? m }
          # peek at the head of the stream, then restore the position
          pos = thing.tell
          thing.seek 0, 0
          head = thing.read 1024
          thing.seek pos
          head
        elsif thing.respond_to? :to_s
          thing.to_s[0, 1024]
        else
          raise ArgumentError,
            "Cannot sample an instance of #{thing.class}"
        end

      # consider this to be 'binary' if empty
      return true if sample.nil? || sample.empty?

      # control codes minus ordinary whitespace
      /[\x0-\x8\xe-\x1f\x7f]/n.match?(sample)
    end
  end

  unless singleton_methods(false).include? :default_type
    # Fallback type for content that could not otherwise be identified.
    #
    # @param thing [#read, #to_s] see {.binary?}
    # @return [MimeMagic] application/octet-stream if {.binary?} says so,
    #  text/plain otherwise
    def self.default_type thing
      new(binary?(thing) ? 'application/octet-stream' : 'text/plain')
    end
  end
end
|
51
|
+
|
52
|
+
class Store::Digest::Object
|
53
|
+
|
54
|
+
private

# size of the leading sample retained for MIME type detection
SAMPLE    = 2**13 # must be big enough to detect ooxml
# default number of bytes per read while scanning
BLOCKSIZE = 2**16

# charset aliases mapped to the spelling that gets stored
CHARSETS = [
  %w[utf8 utf-8],
  %w[iso8859-1 iso-8859-1],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# content-encoding aliases mapped to the spelling that gets stored
ENCODINGS = [
  %w[x-compress compress],
  %w[x-gzip gzip],
].map { |k, v| [k.freeze, v.freeze] }.to_h.freeze

# regex source for one token: a run of characters that are not controls,
# space, separators or high-bit bytes
TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'

# { key: [pattern, normalizer] } - assumes stripped and downcased
#
# The patterns are anchored with \A and \z rather than ^ and $ so that the
# *whole* value has to be well-formed; line anchors would let a value such
# as "text/plain\njunk" through on the strength of its first line.
TOKENS = {
  type:     [/\A(#{TOKEN}(?:\/#{TOKEN})?)\z/on, -> c { c.downcase }],
  charset:  [/\A(#{TOKEN})\z/on,
    -> c { c = c.tr(?_, ?-).downcase; CHARSETS.fetch c, c } ],
  encoding: [/\A(#{TOKEN})\z/on,
    -> c { c = c.tr(?_, ?-).downcase; ENCODINGS.fetch c, c } ],
  language: [/\A([a-z]{2,3}(?:[-_][0-9a-z]+)*)\z/,
    -> c { c.downcase.tr(?_, ?-).gsub(/-*$/, '') } ],
}.each_value(&:freeze).freeze

# flag constants: two bits per aspect, "checked" low and "valid" high
TYPE_CHECKED     = 1 << 0
TYPE_VALID       = 1 << 1
CHARSET_CHECKED  = 1 << 2
CHARSET_VALID    = 1 << 3
ENCODING_CHECKED = 1 << 4
ENCODING_VALID   = 1 << 5
SYNTAX_CHECKED   = 1 << 6
SYNTAX_VALID     = 1 << 7

# human-readable field labels
LABELS = {
  size:     'Size (Bytes)',
  ctime:    'Added to Store',
  mtime:    'Last Modified',
  ptime:    'Properties Modified',
  dtime:    'Deleted',
  type:     'Content Type',
  language: '(Natural) Language',
  charset:  'Character Set',
  encoding: 'Content Encoding',
}.freeze

# field lists, frozen like the other lookup tables in this class
MANDATORY = %i[size ctime mtime ptime].freeze
OPTIONAL  = %i[dtime type language charset encoding].freeze
# names of the validation aspects, in flag-bit order
FLAG      = %i[content-type charset content-encoding syntax].freeze
# meaning of each two-bit (valid, checked) combination, indexed by value
STATE     = %i[unverified invalid recheck valid].freeze
|
108
|
+
|
109
|
+
# Normalize a timestamp argument.
#
# @param t [nil, Time, #to_time, Integer] the value to coerce; an Integer
#  is handed to Time.at
# @param k [Symbol, String] name of the field, used in error messages
# @return [Time, nil] nil when +t+ is nil
# @raise [ArgumentError] for a negative Integer or an unsupported type
def coerce_time t, k
  return nil if t.nil?
  return t if t.is_a? Time
  return t.to_time if t.respond_to? :to_time

  unless t.is_a? Integer
    raise ArgumentError, "Invalid type for #{k}: #{t.class}"
  end
  if t.negative?
    raise ArgumentError, "#{k} given as Integer must be non-negative"
  end

  Time.at t
end
|
123
|
+
|
124
|
+
# Validate and normalize one of the token-valued fields.
#
# The input is stringified, stripped and downcased, matched against the
# pattern registered for +k+ in TOKENS, and the first capture is passed
# through the matching normalizer.
#
# @param t [#to_s] the raw value
# @param k [Symbol] a key of TOKENS
# @return [Object] whatever the normalizer returns
# @raise [RuntimeError] if the cleaned-up value does not match
def coerce_token t, k
  cleaned = t.to_s.strip.downcase
  pattern, normalize = TOKENS[k]
  matched = pattern.match cleaned
  raise "#{k} #{cleaned} does not match #{pattern}" unless matched
  normalize.call matched[1]
end
|
130
|
+
|
131
|
+
public
|
132
|
+
|
133
|
+
# Build an object from the supplied values without examining the content.
#
# @note use {.scan} or {#scan} to populate
#
# @param content [IO, String, Proc, File, Pathname, ...] some content;
#  strings are wrapped in a StringIO and pathnames are opened lazily
# @param digests [Hash, Array, URI::NI, nil] the digests ascribed to the
#  content
# @param size [Integer] assert the object's size
# @param type [String] assert the object's MIME type
# @param charset [String] the character set, if applicable
# @param language [String] the (RFC5646) language tag, if applicable
# @param encoding [String] the content-encoding (e.g. compression)
# @param ctime [Time] assert object creation time
# @param mtime [Time] assert object modification time
# @param ptime [Time] assert object metadata parameter modification time
# @param dtime [Time] assert object deletion time
# @param flags [Integer] validation state flags
# @param strict [true, false] raise an error on bad input
# @param fresh [true, false] assert "freshness" of object vis-a-vis the store
# @return [Store::Digest::Object] the object in question
# @raise [ArgumentError] on unusable content, digests, times, size or flags
def initialize content = nil, digests: {}, size: 0,
    type: 'application/octet-stream', charset: nil, language: nil,
    encoding: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil,
    flags: 0, strict: true, fresh: false

  # snag this immediately
  @fresh = fresh ? true : false

  # content: keep nil, IO-ish things and procs as they are
  @content =
    case content
    when nil, IO, StringIO, Proc then content
    when String then StringIO.new content
    when Pathname then -> { content.expand_path.open('rb') }
    else
      unless %i[read seek pos].all? { |m| content.respond_to? m }
        raise ArgumentError,
          "Cannot accept content given as #{content.class}"
      end
      content
    end

  # digests: always end up with { algorithm => frozen copy of URI::NI }
  @digests =
    case digests
    when nil then {} # empty hash
    when URI::NI then { digests.algorithm => digests.dup.freeze }
    when Array
      # only accepts array of URI::NI
      digests.each_with_object({}) do |ni, acc|
        unless ni.is_a? URI::NI
          raise ArgumentError,
            "Digests given as array can only be URI::NI, not #{ni}"
        end
        acc[ni.algorithm] = ni.dup.freeze
      end
    when Hash
      # hash must be clean
      digests.each_with_object({}) do |(key, ni), acc|
        unless key.respond_to? :to_sym
          raise ArgumentError, 'Digest keys must be symbol-able'
        end
        key = key.to_sym
        unless ni.is_a? URI::NI
          raise ArgumentError, 'Digest values must be URI::NI'
        end
        unless key == ni.algorithm
          raise ArgumentError, 'Digest key must match value algorithm'
        end
        acc[key.to_sym] = ni.dup.freeze
      end
    else
      # everything else is invalid
      raise ArgumentError,
        "Cannot coerce digests given as #{digests.inspect}"
    end

  # ctime, mtime, ptime, dtime should be all nil or nonnegative
  # integers or Time or DateTime
  { ctime: ctime, mtime: mtime, ptime: ptime, dtime: dtime }.each do |k, v|
    instance_variable_set "@#{k}", coerce_time(v, k)
  end

  # size and flags should be non-negative integers (nil counts as zero)
  { size: size, flags: flags }.each do |k, v|
    v = 0 if v.nil?
    unless v.is_a? Integer
      raise ArgumentError, "#{k} must be nil or an Integer"
    end
    raise ArgumentError, "#{k} must be non-negative" if v < 0
    instance_variable_set "@#{k}", v
  end

  # the following can be strings or symbols:
  given = {
    type: type, charset: charset, encoding: encoding, language: language,
  }
  TOKENS.each_key do |k|
    next unless (raw = given[k])

    token =
      begin
        coerce_token raw, k
      rescue StandardError
        # bad input is only fatal in strict mode
        raise if strict
        nil
      end
    instance_variable_set "@#{k}", token.freeze if token
  end
end
|
240
|
+
|
241
|
+
# XXX come up with a policy for these that isn't stupid, plus input sanitation

# Readers only. The writers for the token and time fields are generated
# further down in this class (with coercion applied), so declaring plain
# accessors here merely produced writers that were redefined straight away.
attr_reader :digests, :size, :type, :charset, :language, :encoding,
  :ctime, :mtime, :ptime, :dtime

# validation state bit field; the writer stores its argument verbatim
attr_accessor :flags
|
245
|
+
|
246
|
+
# Create an empty object and immediately {#scan} the given content into
# it. All arguments are forwarded to {#scan} unchanged.
#
# @return [Store::Digest::Object] the result of {#scan}
def self.scan content, digests: URI::NI.algorithms, mtime: nil,
    type: nil, language: nil, charset: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: false, &block
  options = {
    digests: digests, mtime: mtime, type: type, language: language,
    charset: charset, encoding: encoding, blocksize: blocksize,
    strict: strict, fresh: fresh,
  }
  new.scan content, **options, &block
end
|
254
|
+
|
255
|
+
# Read the content through to the end, computing its digests and size,
# sniffing its MIME type, and recording any metadata asserted by the
# caller. The content is rewound and stored in the object afterwards.
#
# @param content [nil, IO, StringIO, String, Pathname, Proc, #read]
#  what to scan; nil falls back to the content already in the object
# @param digests [Symbol, Array<Symbol>] the digest algorithm(s) to compute
# @param mtime [Time, Integer, #to_time, nil] modification time; defaults
#  to the existing value, then the content's own mtime, then now
# @param type [String, nil] asserted MIME type
# @param charset [String, nil] asserted character set
# @param language [String, nil] asserted language tag
# @param encoding [String, nil] asserted content-encoding
# @param blocksize [Integer] how many bytes to read at a time
# @param strict [true, false] raise on malformed type/charset/etc.
# @param fresh [true, false, nil] new freshness state; nil leaves it alone
# @yieldparam buf [String] each block of content as it is read
# @return [self]
# @raise [ArgumentError] if there is nothing scannable or an unknown
#  digest algorithm is requested
def scan content = nil, digests: URI::NI.algorithms, mtime: nil,
    type: nil, charset: nil, language: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: nil, &block
  # update freshness if there is something to update
  @fresh = !!fresh unless fresh.nil?

  # we put all the scanning stuff in here
  content =
    case content
    when nil then self.content
    when IO, StringIO then content
    when String then StringIO.new content
    when Pathname then content.open('rb')
    when Proc then content.call
    when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
      content
    else
      raise ArgumentError,
        "Cannot scan content of type #{content.class}"
    end

  # neither an argument nor embedded content: say so, rather than
  # tripping over nil further down
  raise ArgumentError, 'No content to scan' if content.nil?

  content.binmode if content.respond_to? :binmode

  # sane default for mtime
  @mtime = coerce_time(mtime || @mtime ||
    (content.respond_to?(:mtime) ? content.mtime : Time.now), :mtime)

  # validate and record the asserted tokens, remembering the *coerced*
  # type so the decision at the bottom works with the normalized value
  given = {
    type: type, charset: charset, encoding: encoding, language: language,
  }
  asserted_type = nil
  TOKENS.each_key do |k|
    next unless (raw = given[k])

    token =
      begin
        coerce_token raw, k
      rescue StandardError
        # bad input is only fatal in strict mode
        raise if strict
        nil
      end
    next unless token

    instance_variable_set "@#{k}", token.freeze
    asserted_type = token if k == :type
  end

  digests =
    case digests
    when Array then digests
    when Symbol then [digests]
    else
      raise ArgumentError, 'Digests must be one or more symbol'
    end
  unknown = digests - URI::NI.algorithms
  raise ArgumentError, "Invalid digest list #{unknown}" unless unknown.empty?

  # set up the contexts
  digests = digests.map { |d| [d, URI::NI.context(d)] }.to_h

  # sample for mime type checking
  sample = StringIO.new ''
  @size  = 0
  while buf = content.read(blocksize)
    @size += buf.size
    # whole blocks are appended until the sample reaches SAMPLE bytes
    sample << buf if sample.pos < SAMPLE
    digests.each_value { |ctx| ctx << buf }
    block.call buf if block_given?
  end

  # seek the content back to the front and store it
  content.seek 0, 0
  @content = content

  # set up the digests
  @digests = digests.map do |k, v|
    [k, URI::NI.compute(v, algorithm: k).freeze]
  end.to_h.freeze

  # obtain the sampled content type
  ts = MimeMagic.by_magic(sample) || MimeMagic.default_type(sample)
  if content.respond_to? :path
    # may as well use the path if it's available and more specific
    ps = MimeMagic.by_path(content.path)
    # XXX the need to do ts.to_s is a bug in mimemagic
    ts = ps if ps && ps.child_of?(ts.to_s)
  end

  # Prefer the detected type when nothing (valid) was asserted or when it
  # refines the asserted one; otherwise keep the asserted type in its
  # normalized form, never the raw argument.
  @type = !asserted_type || ts.child_of?(asserted_type) ? ts.to_s : asserted_type

  self
end
|
336
|
+
|
337
|
+
# Report whether the object is "fresh", i.e. new (or restored) as opposed
# to having previously been in the store. Passing an argument
# sets the state instead.
#
# @param state [true, false, nil] the new state; nil just queries
# @return [true, false] the current state, or the one just assigned
def fresh? state = nil
  return @fresh if state.nil?
  @fresh = state ? true : false
end
|
344
|
+
|
345
|
+
# List the digest algorithms recorded in the object, sorted.
# @return [Array] empty if no digests have been set
def algorithms
  recorded = @digests || {}
  recorded.keys.sort
end
|
350
|
+
|
351
|
+
# Fetch the digest recorded for one algorithm.
# @param symbol [Symbol, #to_sym] the algorithm name
# @return [URI::NI, nil] the matching entry of {#digests}, or nil if there
#  is none
# @raise [ArgumentError] if the argument cannot be turned into a symbol
def digest symbol
  unless symbol.respond_to? :to_sym
    raise ArgumentError, "This method takes a symbol"
  end

  key = symbol.to_sym
  digests[key]
end
|
359
|
+
|
360
|
+
alias_method :"[]", :digest
|
361
|
+
|
362
|
+
# Hand back the content held by the object. If the content was stored as
# a Proc, it is called and its result returned instead.
# @return [IO, nil]
def content
  return @content.call if @content.is_a? Proc
  @content
end
|
367
|
+
|
368
|
+
# Tell whether the object is carrying any content.
# @return [false, true]
def content?
  @content ? true : false
end
|
373
|
+
|
374
|
+
# Render the type, plus the charset if one is set, in the form used by an
# HTTP Content-Type header.
# @return [String]
def type_charset
  return type.to_s unless charset
  "#{type};charset=#{charset}"
end
|
381
|
+
|
382
|
+
# Tell whether the object has been scanned, judged by whether it holds
# any digests.
# @return [false, true]
def scanned?
  return false if @digests.empty?
  true
end
|
387
|
+
|
388
|
+
# Tell whether the TYPE_CHECKED flag is set.
# @return [false, true]
def type_checked?
  !(@flags & TYPE_CHECKED).zero?
end
|
393
|
+
|
394
|
+
# Returns true if the content type has been checked _and_ is valid, i.e.
# only when both TYPE_CHECKED and TYPE_VALID are set.
# @return [false, true]
def type_valid?
  # both bits are required; a non-zero test would accept either one alone
  mask = TYPE_CHECKED | TYPE_VALID
  (@flags & mask) == mask
end
|
399
|
+
|
400
|
+
# Tell whether the CHARSET_CHECKED flag is set.
# @return [false, true]
def charset_checked?
  !(@flags & CHARSET_CHECKED).zero?
end
|
405
|
+
|
406
|
+
# Returns true if the character set has been checked _and_ is valid, i.e.
# only when both CHARSET_CHECKED and CHARSET_VALID are set.
# @return [false, true]
def charset_valid?
  # both bits are required; a non-zero test would accept either one alone
  mask = CHARSET_CHECKED | CHARSET_VALID
  (@flags & mask) == mask
end
|
411
|
+
|
412
|
+
# Tell whether the ENCODING_CHECKED flag (content encoding, e.g. gzip,
# deflate) is set.
# @return [false, true]
def encoding_checked?
  !(@flags & ENCODING_CHECKED).zero?
end
|
418
|
+
|
419
|
+
# Returns true if the content encoding has been checked _and_ is valid,
# i.e. only when both ENCODING_CHECKED and ENCODING_VALID are set.
# @return [false, true]
def encoding_valid?
  # both bits are required; a non-zero test would accept either one alone
  mask = ENCODING_CHECKED | ENCODING_VALID
  (@flags & mask) == mask
end
|
424
|
+
|
425
|
+
# Tell whether the SYNTAX_CHECKED flag is set.
# @return [false, true]
def syntax_checked?
  !(@flags & SYNTAX_CHECKED).zero?
end
|
430
|
+
|
431
|
+
# Returns true if the blob's syntax has been checked _and_ is valid, i.e.
# only when both SYNTAX_CHECKED and SYNTAX_VALID are set.
# @return [false, true]
def syntax_valid?
  # both bits are required; a non-zero test would accept either one alone
  mask = SYNTAX_CHECKED | SYNTAX_VALID
  (@flags & mask) == mask
end
|
436
|
+
|
437
|
+
# Generate ctime=, mtime=, ptime= and dtime=. Each writer runs its
# argument through coerce_time and stores the frozen result.
[:ctime, :mtime, :ptime, :dtime].each do |field|
  ivar = :"@#{field}"

  define_method :"#{field}=" do |value|
    instance_variable_set ivar, coerce_time(value, field).freeze
  end
end
|
442
|
+
|
443
|
+
# Generate a writer and a syntax predicate for each token-valued field.
[:type, :charset, :encoding, :language].each do |field|
  ivar = :"@#{field}"

  # <field>= : runs the value through coerce_token (which raises on a
  # mismatch) and stores the frozen result
  define_method :"#{field}=" do |value|
    instance_variable_set ivar, coerce_token(value, field).freeze
  end

  # <field>_ok? : matches the value, exactly as given, against the
  # field's pattern in TOKENS
  # NOTE(review): unlike the writer, no strip/downcase happens here, so the
  # two can disagree on un-normalized input - confirm that is intended
  define_method :"#{field}_ok?" do |value|
    pattern, _normalizer = TOKENS[field]
    pattern.match? value
  end
end
|
452
|
+
|
453
|
+
# Tell whether the blob has been deleted from the store (the metadata
# record implicitly remains). An object counts as deleted as soon as it
# has a deletion time.
# @return [false, true]
def deleted?
  @dtime ? true : false
end
|
460
|
+
|
461
|
+
# Export the object's fields as a hash of duplicated values, in the
# order: (content,) digests, the MANDATORY fields, the OPTIONAL fields,
# flags.
# @param content [false, true] include the content if true
# @return [Hash] the object as a hash
def to_h content: false
  fields = content ? %i[content digests] : %i[digests]
  fields += MANDATORY + OPTIONAL + [:flags]

  fields.each_with_object({}) do |field, out|
    out[field] = send(field).dup
  end
end
|
471
|
+
|
472
|
+
# Outputs a human-readable string representation of the object: the
# digests, the mandatory fields, whichever optional fields are set, and
# the validation state of each aspect listed in FLAG.
# @return [String]
def to_s
  out = "#{self.class}\n Digests:\n"

  # disgorge the digests, ordered by their string form
  digests.values.map(&:to_s).sort.each { |d| out << " #{d}\n" }

  # now the fields
  MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
  OPTIONAL.each do |o|
    val = send o
    out << " #{LABELS[o]}: #{val}\n" if val
  end

  # now the validation statuses
  out << "Validation:\n"
  FLAG.each_with_index do |name, i|
    # each aspect owns two adjacent bits (checked, valid), so aspect i
    # starts at bit 2 * i; the pair's value indexes straight into STATE
    state = (flags >> (2 * i)) & 3
    out << (" %-16s: %s\n" % [name, STATE[state]])
  end

  out
end
|
497
|
+
end
|