store-digest 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +202 -0
- data/README.md +231 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/store/digest.rb +282 -0
- data/lib/store/digest/blob.rb +7 -0
- data/lib/store/digest/blob/filesystem.rb +146 -0
- data/lib/store/digest/driver.rb +14 -0
- data/lib/store/digest/driver/lmdb.rb +15 -0
- data/lib/store/digest/meta.rb +7 -0
- data/lib/store/digest/meta/lmdb.rb +621 -0
- data/lib/store/digest/object.rb +497 -0
- data/lib/store/digest/trait.rb +32 -0
- data/lib/store/digest/version.rb +5 -0
- data/store-digest.gemspec +39 -0
- metadata +161 -0
@@ -0,0 +1,497 @@
|
|
1
|
+
require 'store/digest/version'
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
require 'uri/ni'
|
5
|
+
require 'mimemagic'
|
6
|
+
require 'mimemagic/overlay'
|
7
|
+
|
8
|
+
# Backfills a few class-level helpers on MimeMagic. Each helper is only
# defined when MimeMagic does not already respond to it.
class MimeMagic
  # XXX erase this when these methods get added

  # These helpers are singleton methods, so their presence is tested with
  # respond_to?; Module#method_defined? only inspects instance methods.

  unless respond_to? :parents
    # Look up the immediate parents of a type.
    #
    # Reads element 1 of the TYPES entry for +type+; an unknown type yields
    # an empty list.
    #
    # @param type [String] the key to look up in TYPES
    # @return [Array<MimeMagic>] one instance per distinct parent entry
    def self.parents type
      TYPES.fetch(type, [nil, []])[1].map { |t| new t }.uniq
    end
  end

  unless respond_to? :ancestor_types
    # Collect the ancestors of every parent of +type+.
    #
    # NOTE(review): this calls `ancestors` with an argument. Unless
    # MimeMagic supplies its own class-level `ancestors(type)`, that
    # resolves to Module#ancestors, which takes no arguments -- confirm
    # which helper was intended before relying on this method.
    #
    # @param type [String] the key to look up in TYPES
    # @return [Array] the flattened, de-duplicated result
    def self.ancestor_types type
      parents(type).map { |t| ancestors(t) }.flatten.uniq
    end
  end

  unless respond_to? :binary?
    # Guess whether +thing+ holds binary data by sampling its first 1024
    # bytes/characters and looking for control codes.
    #
    # @param thing [#tell, #seek, #read, #to_s] an IO-like object (which is
    #   returned to its prior position) or anything that stringifies
    # @return [true, false] true for empty samples and for samples that
    #   contain control codes other than ordinary whitespace
    # @raise [ArgumentError] if +thing+ can be neither read nor stringified
    def self.binary? thing
      sample =
        if %i[tell seek read].all? { |m| thing.respond_to? m }
          pos = thing.tell
          begin
            thing.seek 0, 0
            thing.read 1024
          ensure
            # put the stream back where we found it, even if the read failed
            thing.seek pos
          end
        elsif thing.respond_to? :to_s
          thing.to_s[0, 1024]
        else
          raise ArgumentError,
            "Cannot sample an instance of #{thing.class}"
        end

      # consider this to be 'binary' if empty
      return true if sample.nil? || sample.empty?

      # control codes minus ordinary whitespace
      /[\x0-\x8\xe-\x1f\x7f]/n.match?(sample)
    end
  end

  unless respond_to? :default_type
    # Fallback type for content that matched no magic.
    #
    # @param thing [#tell, #seek, #read, #to_s] the content to sample
    # @return [MimeMagic] application/octet-stream when {.binary?} says
    #   so, otherwise text/plain
    def self.default_type thing
      new(binary?(thing) ? 'application/octet-stream' : 'text/plain')
    end
  end
end
|
51
|
+
|
52
|
+
class Store::Digest::Object
|
53
|
+
|
54
|
+
private
|
55
|
+
|
56
|
+
# how many bytes to sample for type detection
SAMPLE    = 2**13 # must be big enough to detect ooxml
# default read size while scanning
BLOCKSIZE = 2**16

# alternate spellings of character sets => preferred spelling
CHARSETS = {
  'utf8'      => 'utf-8'.freeze,
  'iso8859-1' => 'iso-8859-1'.freeze,
}.freeze

# alternate spellings of content encodings => preferred spelling
ENCODINGS = {
  'x-compress' => 'compress'.freeze,
  'x-gzip'     => 'gzip'.freeze,
}.freeze

# regex source for a run of characters excluding controls, space,
# separators and high bytes
TOKEN = '[^\x0-\x20()<>@,;:\\\"/\[\]?=\x7f-\\xff]+'

# { key: [pattern, normalizer] } - assumes stripped and downcased
#
# The patterns are anchored with \A and \z rather than ^ and $ so that a
# value with embedded newlines cannot pass on the strength of its first
# line alone.
TOKENS = {
  type:     [/\A(#{TOKEN}(?:\/#{TOKEN})?)\z/on, ->(c) { c.downcase }],
  charset:  [/\A(#{TOKEN})\z/on,
    ->(c) { c = c.tr(?_, ?-).downcase; CHARSETS.fetch c, c }],
  encoding: [/\A(#{TOKEN})\z/on,
    ->(c) { c = c.tr(?_, ?-).downcase; ENCODINGS.fetch c, c }],
  language: [/\A([a-z]{2,3}(?:[-_][0-9a-z]+)*)\z/,
    ->(c) { c.downcase.tr(?_, ?-).sub(/-+\z/, '') }],
}.freeze

# flag constants: each aspect owns a "checked" bit and a "valid" bit
TYPE_CHECKED     = 1 << 0
TYPE_VALID       = 1 << 1
CHARSET_CHECKED  = 1 << 2
CHARSET_VALID    = 1 << 3
ENCODING_CHECKED = 1 << 4
ENCODING_VALID   = 1 << 5
SYNTAX_CHECKED   = 1 << 6
SYNTAX_VALID     = 1 << 7

# human-readable field labels
LABELS = {
  size:     'Size (Bytes)',
  ctime:    'Added to Store',
  mtime:    'Last Modified',
  ptime:    'Properties Modified',
  dtime:    'Deleted',
  type:     'Content Type',
  language: '(Natural) Language',
  charset:  'Character Set',
  encoding: 'Content Encoding',
}.freeze

MANDATORY = %i[size ctime mtime ptime].freeze
OPTIONAL  = %i[dtime type language charset encoding].freeze
FLAG      = %i[content-type charset content-encoding syntax].freeze
STATE     = %i[unverified invalid recheck valid].freeze
|
108
|
+
|
109
|
+
# Turn a loosely-typed timestamp into a Time.
#
# @param t [nil, Time, #to_time, Integer] the value to convert
# @param k [Symbol, String] the field name, used only in error messages
# @return [Time, nil] nil and Time pass through untouched; anything with
#   #to_time is converted; an Integer is handed to Time.at
# @raise [ArgumentError] on a negative Integer or an unsupported type
def coerce_time t, k
  return t if t.nil? || t.is_a?(Time)
  return t.to_time if t.respond_to? :to_time

  unless t.is_a? Integer
    raise ArgumentError, "Invalid type for #{k}: #{t.class}"
  end

  if t < 0
    raise ArgumentError, "#{k} given as Integer must be non-negative"
  end

  Time.at t
end
|
123
|
+
|
124
|
+
# Validate and normalize one of the token-valued fields.
#
# The input is stringified, stripped and downcased, matched against the
# pattern registered in TOKENS under +k+, and the first capture is passed
# through the registered normalizer.
#
# @param t [#to_s] the raw value
# @param k [Symbol] a key of TOKENS
# @return [String] the normalized token
# @raise [RuntimeError] if the value does not match the pattern
def coerce_token t, k
  candidate = t.to_s.strip.downcase
  pattern, normalize = TOKENS[k]

  found = pattern.match(candidate)
  raise "#{k} #{candidate} does not match #{pattern}" unless found

  normalize.call found[1]
end
|
130
|
+
|
131
|
+
public
|
132
|
+
|
133
|
+
# Build an object that simply records what it is given; the content is
# not read here.
#
# @note use {.scan} or {#scan} to populate
#
# @param content [IO, String, Proc, File, Pathname, ...] some content;
#   a String is wrapped in a StringIO and a Pathname is wrapped in a
#   lambda that opens it on demand
# @param digests [Hash, Array, URI::NI, nil] the digests ascribed to
#   the content
# @param size [Integer] assert the object's size
# @param type [String] assert the object's MIME type
# @param charset [String] the character set, if applicable
# @param language [String] the (RFC5646) language tag, if applicable
# @param encoding [String] the content-encoding (e.g. compression)
# @param ctime [Time] assert object creation time
# @param mtime [Time] assert object modification time
# @param ptime [Time] assert object metadata parameter modification time
# @param dtime [Time] assert object deletion time
# @param flags [Integer] validation state flags
# @param strict [true, false] raise an error on bad token input instead
#   of silently dropping it
# @param fresh [true, false] assert "freshness" of object vis-a-vis the store
# @raise [ArgumentError] on unusable content, digests, times, size or flags
# @return [Store::Digest::Object] the object in question
def initialize content = nil, digests: {}, size: 0,
    type: 'application/octet-stream', charset: nil, language: nil,
    encoding: nil, ctime: nil, mtime: nil, ptime: nil, dtime: nil,
    flags: 0, strict: true, fresh: false

  # grab this first, before anything below has a chance to raise
  @fresh = fresh ? true : false

  # anything that can read, seek and report a position counts as IO-like
  io_like = -> x { %i[read seek pos].all? { |m| x.respond_to? m } }

  @content =
    case content
    when nil, IO, StringIO, Proc then content
    when String then StringIO.new content
    when Pathname then -> { content.expand_path.open('rb') }
    when io_like then content
    else
      raise ArgumentError,
        "Cannot accept content given as #{content.class}"
    end

  @digests =
    case digests
    when nil then {} # empty hash
    when URI::NI then { digests.algorithm => digests.dup.freeze }
    when Hash
      # every key must be symbol-able and agree with its value's algorithm
      digests.each_with_object({}) do |(key, uri), clean|
        unless key.respond_to? :to_sym
          raise ArgumentError, 'Digest keys must be symbol-able'
        end
        algorithm = key.to_sym
        unless uri.is_a? URI::NI
          raise ArgumentError, 'Digest values must be URI::NI'
        end
        unless algorithm == uri.algorithm
          raise ArgumentError, 'Digest key must match value algorithm'
        end
        clean[algorithm] = uri.dup.freeze
      end
    when Array
      # an array may only hold URI::NI; each is keyed by its own algorithm
      digests.each_with_object({}) do |uri, clean|
        unless uri.is_a? URI::NI
          raise ArgumentError,
            "Digests given as array can only be URI::NI, not #{uri}"
        end
        clean[uri.algorithm] = uri.dup.freeze
      end
    else
      # everything else is invalid
      raise ArgumentError,
        "Cannot coerce digests given as #{digests.inspect}"
    end

  # timestamps: nil, Time, anything with #to_time, or non-negative Integer
  { ctime: ctime, mtime: mtime, ptime: ptime, dtime: dtime }.each do |name, value|
    instance_variable_set "@#{name}", coerce_time(value, name)
  end

  # size and flags: nil (stored as 0) or a non-negative Integer
  { size: size, flags: flags }.each do |name, value|
    unless value.nil? || value.is_a?(Integer)
      raise ArgumentError, "#{name} must be nil or an Integer"
    end
    if value && value < 0
      raise ArgumentError, "#{name} must be non-negative"
    end
    instance_variable_set "@#{name}", value || 0
  end

  # token fields may be strings or symbols; falsy values are left unset
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  TOKENS.keys.each do |name|
    raw = given[name]
    next unless raw

    token = begin
      coerce_token(raw, name)
    rescue StandardError
      # only strict mode lets a bad token escape
      raise if strict
      nil
    end

    instance_variable_set "@#{name}", token.freeze if token
  end
end
|
240
|
+
|
241
|
+
# XXX come up with a policy for these that isn't stupid, plus input sanitation
attr_reader :digests, :size

# Only readers are declared for these fields: their writers are generated
# further down in this class with define_method so that assigned values
# are coerced. Declaring plain writers here as well would merely be
# overwritten by those definitions.
attr_reader :type, :charset, :language, :encoding,
  :ctime, :mtime, :ptime, :dtime

# flags has no coercing writer, so it keeps a plain accessor
attr_accessor :flags
|
245
|
+
|
246
|
+
#
|
247
|
+
# Convenience constructor: make an empty object and immediately {#scan}
# the given content with it.
#
# All keyword arguments, and the optional block, are passed straight
# through to {#scan}.
#
# @param content [IO, String, Proc, Pathname, ...] the content to scan
# @return whatever {#scan} returns
def self.scan content, digests: URI::NI.algorithms, mtime: nil,
    type: nil, language: nil, charset: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: false, &block
  options = {
    digests:   digests,
    mtime:     mtime,
    type:      type,
    language:  language,
    charset:   charset,
    encoding:  encoding,
    blocksize: blocksize,
    strict:    strict,
    fresh:     fresh,
  }

  new.scan(content, **options, &block)
end
|
254
|
+
|
255
|
+
# Read the content from start to finish, recording its size, digests,
# modification time and (sampled) content type on this object.
#
# @param content [nil, IO, StringIO, String, Pathname, Proc, #read]
#   the content to scan; nil falls back to the content already held
# @param digests [Array<Symbol>, Symbol] the digest algorithms to
#   compute; each must appear in URI::NI.algorithms
# @param mtime [nil, Time, #to_time, Integer] the modification time;
#   defaults to the existing value, then the content's own #mtime if it
#   has one, then the current time
# @param type [String, Symbol] an asserted content type; it is kept
#   unless the detected type is a child of it
# @param charset [String, Symbol] the character set, if applicable
# @param language [String, Symbol] the language tag, if applicable
# @param encoding [String, Symbol] the content-encoding, if applicable
# @param blocksize [Integer] how many bytes to read at a time
# @param strict [true, false] raise on malformed tokens rather than
#   ignoring them
# @param fresh [nil, true, false] update freshness unless nil
# @yieldparam buf [String] each chunk as it is read
# @raise [ArgumentError] on unusable content or an invalid digest list
# @return [self]
def scan content = nil, digests: URI::NI.algorithms, mtime: nil,
    type: nil, charset: nil, language: nil, encoding: nil,
    blocksize: BLOCKSIZE, strict: true, fresh: nil, &block
  # update freshness if there is something to update
  @fresh = !!fresh unless fresh.nil?

  # we put all the scanning stuff in here
  content = case content
            when nil then self.content
            when IO, StringIO then content
            when String then StringIO.new content
            when Pathname then content.open('rb')
            when Proc then content.call
            when -> x { %i[read seek pos].all? { |m| x.respond_to? m } }
              content
            else
              raise ArgumentError,
                "Cannot scan content of type #{content.class}"
            end

  # fail clearly up front instead of calling #read on nil further down
  raise ArgumentError, 'There is no content to scan' if content.nil?

  content.binmode if content.respond_to? :binmode

  # sane default for mtime
  @mtime = coerce_time(mtime || @mtime ||
    (content.respond_to?(:mtime) ? content.mtime : Time.now), :mtime)

  # coerce whichever tokens were supplied, remembering the clean values
  given = { type: type, charset: charset, encoding: encoding,
    language: language }
  clean = {}
  TOKENS.each_key do |k|
    raw = given[k]
    next unless raw

    token = begin
      coerce_token(raw, k)
    rescue StandardError
      # only strict mode lets a bad token escape
      raise if strict
      nil
    end
    next unless token

    clean[k] = token.freeze
    instance_variable_set "@#{k}", clean[k]
  end

  digests = case digests
            when Array then digests
            when Symbol then [digests]
            else
              raise ArgumentError, 'Digests must be one or more symbol'
            end
  unknown = digests - URI::NI.algorithms
  raise ArgumentError, "Invalid digest list #{unknown}" unless
    unknown.empty?

  # set up the contexts
  contexts = digests.map { |d| [d, URI::NI.context(d)] }.to_h

  # sample for mime type checking
  sample = StringIO.new(+'')
  @size  = 0
  while buf = content.read(blocksize)
    # count bytes even if a duck-typed reader hands back a non-binary string
    @size += buf.bytesize
    sample << buf if sample.pos < SAMPLE
    contexts.each_value { |ctx| ctx << buf }
    block.call buf if block
  end

  # seek the content back to the front and store it
  content.seek 0, 0
  @content = content

  # set up the digests
  @digests = contexts.map do |k, ctx|
    [k, URI::NI.compute(ctx, algorithm: k).freeze]
  end.to_h.freeze

  # obtain the sampled content type
  ts = MimeMagic.by_magic(sample) || MimeMagic.default_type(sample)
  if content.respond_to? :path
    # may as well use the path if it's available and more specific
    ps = MimeMagic.by_path(content.path)
    # XXX the need to do ts.to_s is a bug in mimemagic
    ts = ps if ps and ps.child_of?(ts.to_s)
  end

  # Compare against (and fall back to) the *normalized* assertion, not
  # the raw argument, so the stored type is always a clean token. A type
  # that failed coercion in non-strict mode counts as not asserted.
  asserted = clean[:type]
  @type = (asserted.nil? || ts.child_of?(asserted)) ? ts.to_s : asserted

  self
end
|
336
|
+
|
337
|
+
# Report, or when given an argument set, the "fresh" marker, i.e.
# whether the object is new (or restored) as opposed to having
# previously been in the store.
#
# @param state [nil, true, false] nil just reads; anything else is
#   reduced to true/false and stored
# @return [true, false] the (possibly updated) marker
def fresh? state = nil
  return @fresh if state.nil?

  @fresh = state ? true : false
end
|
344
|
+
|
345
|
+
# List the digest algorithms this object holds, in sorted order.
# @return [Array] the keys of the digest table, or an empty array when
#   there is none
def algorithms
  return [] unless @digests

  @digests.keys.sort
end
|
350
|
+
|
351
|
+
# Return a particular digest. Returns nil if there is no match.
#
# Also available as +[]+.
#
# @param symbol [Symbol, #to_sym] the name of the digest algorithm
# @return [URI::NI, nil] the digest recorded under that algorithm (the
#   constructor only admits URI::NI values), or nil if there is none
# @raise [ArgumentError] if the argument cannot be turned into a symbol
def digest symbol
  raise ArgumentError, "This method takes a symbol" unless
    symbol.respond_to? :to_sym
  digests[symbol.to_sym]
end

alias [] digest
|
361
|
+
|
362
|
+
# Hand back the content held by the object. When the content was stored
# as a Proc, the Proc is called and its result returned instead.
# @return [IO]
def content
  return @content unless @content.is_a? Proc

  @content.call
end
|
367
|
+
|
368
|
+
# Say whether the object is carrying any content at all.
# @return [false, true]
def content?
  @content ? true : false
end
|
373
|
+
|
374
|
+
# Combine the type and (when present) the charset into a single string,
# suitable for an HTTP header.
# @return [String]
def type_charset
  header = type.to_s
  return header unless charset

  "#{header};charset=#{charset}"
end
|
381
|
+
|
382
|
+
# Say whether the object has been scanned, judged by whether it holds
# any digests.
# @return [false, true]
def scanned?
  @digests.empty? ? false : true
end
|
387
|
+
|
388
|
+
# Say whether the TYPE_CHECKED bit is set in the flags.
# @return [false, true]
def type_checked?
  !(@flags & TYPE_CHECKED).zero?
end
|
393
|
+
|
394
|
+
# Returns true if the content type has been checked _and_ is valid.
#
# Both the TYPE_CHECKED and TYPE_VALID bits have to be set; testing the
# combined mask for a merely non-zero result would also accept a type
# that was checked and found invalid.
# @return [false, true]
def type_valid?
  mask = TYPE_CHECKED | TYPE_VALID
  (@flags & mask) == mask
end
|
399
|
+
|
400
|
+
# Say whether the CHARSET_CHECKED bit is set in the flags.
# @return [false, true]
def charset_checked?
  !(@flags & CHARSET_CHECKED).zero?
end
|
405
|
+
|
406
|
+
# Returns true if the character set has been checked _and_ is valid.
#
# Both the CHARSET_CHECKED and CHARSET_VALID bits have to be set; a
# non-zero test against the combined mask would also accept a charset
# that was checked and found invalid.
# @return [false, true]
def charset_valid?
  mask = CHARSET_CHECKED | CHARSET_VALID
  (@flags & mask) == mask
end
|
411
|
+
|
412
|
+
# Say whether the ENCODING_CHECKED bit is set in the flags, i.e. whether
# the content encoding (e.g. gzip, deflate) has been checked.
# @return [false, true]
def encoding_checked?
  !(@flags & ENCODING_CHECKED).zero?
end
|
418
|
+
|
419
|
+
# Returns true if the content encoding has been checked _and_ is valid.
#
# Both the ENCODING_CHECKED and ENCODING_VALID bits have to be set; a
# non-zero test against the combined mask would also accept an encoding
# that was checked and found invalid.
# @return [false, true]
def encoding_valid?
  mask = ENCODING_CHECKED | ENCODING_VALID
  (@flags & mask) == mask
end
|
424
|
+
|
425
|
+
# Say whether the SYNTAX_CHECKED bit is set in the flags.
# @return [false, true]
def syntax_checked?
  !(@flags & SYNTAX_CHECKED).zero?
end
|
430
|
+
|
431
|
+
# Returns true if the blob's syntax has been checked _and_ is valid.
#
# Both the SYNTAX_CHECKED and SYNTAX_VALID bits have to be set; a
# non-zero test against the combined mask would also accept syntax that
# was checked and found invalid.
# @return [false, true]
def syntax_valid?
  mask = SYNTAX_CHECKED | SYNTAX_VALID
  (@flags & mask) == mask
end
|
436
|
+
|
437
|
+
# Generate ctime=, mtime=, ptime= and dtime=. Each writer runs its
# argument through coerce_time and stores a frozen result (or nil).
%i[ctime mtime ptime dtime].each do |k|
  define_method "#{k}=" do |v|
    t = coerce_time(v, k)
    # coerce_time hands a Time argument back as-is, so freeze a copy
    # rather than the caller's own object
    t = t.dup.freeze unless t.nil?
    instance_variable_set "@#{k}", t
  end
end
|
442
|
+
|
443
|
+
# Generate a coercing writer and an +_ok?+ predicate for each token
# field (type, charset, encoding, language).
%i[type charset encoding language].each do |k|
  # Writer: validates and normalizes via coerce_token, stores the frozen
  # result, and raises if the value does not match the field's pattern.
  define_method "#{k}=" do |v|
    instance_variable_set "@#{k}", coerce_token(v, k).freeze
  end

  # Predicate: would the writer accept this value? The TOKENS patterns
  # assume stripped, downcased input, so the candidate gets the same
  # preparation coerce_token applies before it is matched.
  define_method "#{k}_ok?" do |v|
    TOKENS[k].first.match? v.to_s.strip.downcase
  end
end
|
452
|
+
|
453
|
+
# Plain predicate: has the blob been deleted from the store (with the
# metadata record implicitly remaining)? Judged by whether a deletion
# time is recorded.
# @return [false, true]
def deleted?
  @dtime ? true : false
end
|
460
|
+
|
461
|
+
# Express the object as a hash of duplicated field values: digests, the
# mandatory fields, the optional fields, then the flags. The content is
# left out unless asked for, in which case it comes first.
# @param content [false, true] include the content if true
# @return [Hash] the object as a hash
def to_h content: false
  fields  = content ? %i[content digests] : %i[digests]
  fields += MANDATORY + OPTIONAL + [:flags]

  fields.each_with_object({}) do |name, out|
    out[name] = send(name).dup
  end
end
|
471
|
+
|
472
|
+
# Outputs a human-readable string representation of the object: the
# digests (sorted by their string form), the mandatory fields, whichever
# optional fields are set, and the validation state of each flag pair.
# @return [String]
def to_s
  out = "#{self.class}\n Digests:\n"

  # disgorge the digests
  digests.values.sort_by(&:to_s).each do |d|
    out << " #{d}\n"
  end

  # now the fields
  MANDATORY.each { |m| out << " #{LABELS[m]}: #{send m}\n" }
  OPTIONAL.each do |o|
    val = send o
    out << " #{LABELS[o]}: #{val}\n" if val
  end

  # now the validation statuses
  out << "Validation:\n"
  FLAG.each_with_index do |name, i|
    # Each aspect owns two adjacent bits (checked, then valid), in the
    # same order as FLAG, so the i-th pair sits 2*i bits up; the two-bit
    # value indexes straight into STATE.
    state = (flags >> (2 * i)) & 3
    out << (" %-16s: %s\n" % [name, STATE[state]])
  end

  out
end
|
497
|
+
end
|