archive-utils 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 52a10a6b4f10f5b140e6c47e9d88dd25d4407610
4
+ data.tar.gz: 374996e4a353ff7876c4717bb5b73757bae24420
5
+ SHA512:
6
+ metadata.gz: 6f60a79a35a425ed109f633a0ed1b3b066c97b806c00e928f33d17fd448b018be18f4f08d1c46c9f91b2f031aa0defadb53f58f622a4502d57a524be55ad6219
7
+ data.tar.gz: 3498dfacc552e52d3db4d0910b56eaa0da8ceac2b47059e15976744f0e2c18c167c82657894a1cffaba35789b35435f8c6251f5d65f0436a52c860b56d9f9186
@@ -0,0 +1,23 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ Bundler.setup
4
+ require 'digest'
5
+ require 'find'
6
+ require 'json/pure'
7
+ require 'pathname'
8
+ require 'systemu'
9
+
10
+ # Should remove these dependencies from sdr-archive
11
+ #require 'moab_stanford'
12
+ #require 'rest-client'
13
+
14
+ module Archive
15
+ end
16
+
17
+ require 'archive/bagit_bag'
18
+ require 'archive/file_fixity'
19
+ require 'archive/fixity'
20
+ require 'archive/operating_system'
21
+ require 'archive/tarfile'
22
+ include Archive
23
+
@@ -0,0 +1,353 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # A BagIt bag contains a structured copy of a digital object for storage, transfer, or archive
7
+ # @see https://tools.ietf.org/html/draft-kunze-bagit-10
8
+ # This class can be used to create, parse, or validate a bag instance
9
+ #
10
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
11
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
12
class BagitBag

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [BagitBag] Initialize a new bag, create home and payload folders, write bagit.txt file
  def self.create_bag(pathname)
    bag = BagitBag.new
    bag.bag_pathname = pathname
    bag.payload_pathname.mkpath
    bag.write_bagit_txt
    bag
  end

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [BagitBag] A bag object pointing at an existing bag home directory
  # @raise [RuntimeError] if the bag home directory or its bagit.txt file does not exist
  def self.open_bag(pathname)
    bag = BagitBag.new
    bag.bag_pathname = pathname
    raise "No bag found at #{bag.bag_pathname}" unless bag.bag_pathname.exist?
    bagit_txt = bag.bag_pathname.join("bagit.txt")
    raise "No bagit.txt file found at #{bagit_txt}" unless bagit_txt.exist?
    bag
  end

  # @return [Pathname] The location of the bag home directory
  attr_reader :bag_pathname

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [Void] Set the location of the bag home directory
  def bag_pathname=(pathname)
    @bag_pathname = Pathname(pathname)
  end

  # @return [Pathname] The location of the bag data directory
  def payload_pathname
    bag_pathname.join('data')
  end

  # @return [Pathname] Generate the bagit.txt tag file
  def write_bagit_txt
    bagit_txt = bag_pathname.join("bagit.txt")
    bagit_txt.open('w') do |f|
      f.puts "Tag-File-Character-Encoding: UTF-8"
      f.puts "BagIt-Version: 0.97"
    end
    bagit_txt
  end

  # @return [Hash<String,String>] A hash containing the properties documented in the bagit.txt tagfile
  def read_bagit_txt
    read_tag_file("bagit.txt")
  end

  # @return [Array<Symbol>] The list of checksum types to be used when generating fixity data
  def bag_checksum_types
    @bag_checksum_types ||= Fixity.default_checksum_types
  end

  # @param [Object] types The list of checksum types to be used when generating fixity data
  # @return [Void] Set the list of checksum types to be used when generating fixity data
  def bag_checksum_types=(*types)
    @bag_checksum_types = Fixity.validate_checksum_types(*types)
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname,String] source_dir The source location of the directory whose contents are to be bagged
  # @return [Pathname] Generate file_fixity_hash and send it to #add_files_to_payload
  def add_dir_to_payload(link_mode, source_dir)
    # Coerce so that a String location works the same as a Pathname
    source_dir = Pathname(source_dir)
    file_fixity_hash = Fixity.generate_checksums(source_dir, source_dir.find, bag_checksum_types)
    add_files_to_payload(link_mode, source_dir, file_fixity_hash)
    payload_pathname
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname,String] source_basepath The source location of the directory whose contents are to be ingested
  # @param [Hash<String,FileFixity>] file_fixity_hash The list of files (with fixity data) to be added to the payload
  # @return [Pathname] Copy or link the files specified in the file_fixity_hash to the payload directory,
  #   then update the payload manifest files
  def add_files_to_payload(link_mode, source_basepath, file_fixity_hash)
    source_basepath = Pathname(source_basepath)
    file_fixity_hash.each_key do |file_id|
      source_pathname = source_basepath.join(file_id)
      target_pathname = payload_pathname.join(file_id)
      copy_file(link_mode, source_pathname, target_pathname)
    end
    write_manifest_checksums('manifest', add_data_prefix(file_fixity_hash))
    payload_pathname
  end

  # @param [Hash<String,FileFixity>] file_fixity_hash key is file_id, values are Fixity objects containing checksums
  # @return [Hash<String,FileFixity>] A revised hash with file_id paths prefixed with 'data/'.
  #   The fixity objects in the returned hash are (shallow) copies, so the caller's objects keep
  #   their original file_id and a repeated call does not produce 'data/data/...' ids.
  def add_data_prefix(file_fixity_hash)
    file_fixity_hash.values.each_with_object({}) do |fixity, new_hash|
      prefixed = fixity.dup
      prefixed.file_id = "data/#{fixity.file_id}"
      new_hash[prefixed.file_id] = prefixed
    end
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname] source_pathname The source location of the file to be ingested
  # @param [Pathname] target_pathname The location at which to place the file (parent folders are created as needed)
  # @return [Pathname] link or copy the specified file from source location to the target location
  # @raise [RuntimeError] if link_mode is not one of :copy, :link, :symlink (or nil, which is treated as :copy)
  def copy_file(link_mode, source_pathname, target_pathname)
    target_pathname.parent.mkpath
    case link_mode
    when :copy, nil
      FileUtils.copy(source_pathname.to_s, target_pathname.to_s) # automatically dereferences symlinks
    when :link
      FileUtils.link(source_pathname.to_s, target_pathname.to_s) #, :force => true (false is default)
    when :symlink
      FileUtils.symlink(source_pathname.to_s, target_pathname.to_s) #, :force => true (false is default)
    else
      raise "Invalid link_mode: #{link_mode}, expected one of [:copy,:link,:symlink]"
    end
    target_pathname
  end

  # @param [String] tarfile_id The name of the tar file, relative to the payload directory
  # @param [Pathname,String] source_fullpath The location of the directory whose content will be tarred
  # @param [Pathname,String] source_basepath The location of the directory to change to before doing the tar create
  # @return [Tarfile] Create a tar archive of a directory into the payload directory,
  #   then generate checksums for the tar file and record them in the payload manifests
  def add_payload_tarfile(tarfile_id, source_fullpath, source_basepath)
    tarfile = Tarfile.new
    tarfile.source_basepath = Pathname(source_basepath)
    tarfile.source_fullpath = Pathname(source_fullpath)
    tarfile.tarfile_basepath = payload_pathname
    tarfile.tarfile_fullpath = payload_pathname.join(tarfile_id.to_s)
    tarfile.create_tarfile
    file_fixity_hash = Fixity.generate_checksums(bag_pathname, [tarfile.tarfile_fullpath], bag_checksum_types)
    write_manifest_checksums('manifest', file_fixity_hash)
    tarfile
  end

  # @return [Pathname] Generate the bag-info.txt tag file to record the payload size
  def write_bag_info_txt
    payload_size = bag_payload_size
    bag_info_txt = bag_pathname.join("bag-info.txt")
    bag_info_txt.open('w') do |f|
      f.puts "External-Identifier: #{bag_pathname.basename}"
      f.puts "Payload-Oxum: #{payload_size[:bytes]}.#{payload_size[:files]}"
      f.puts "Bag-Size: #{bag_size_human(payload_size[:bytes])}"
    end
    bag_info_txt
  end

  # @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes, and the number of files,
  #   derived from the payload directory contents
  def bag_payload_size
    payload_pathname.find.select(&:file?).each_with_object({bytes: 0, files: 0}) do |file, totals|
      totals[:bytes] += file.size
      totals[:files] += 1
    end
  end

  # @param [Integer] bytes The total number of bytes in the payload
  # @return [String] Human-readable rendition of the total payload size
  def bag_size_human(bytes)
    units = %w[B KB MB GB TB]
    count = 0
    size = bytes
    # Stop at TB (index 4) even when the value is still >= 1024
    while size >= 1024 && count < units.size - 1
      size /= 1024.0
      count += 1
    end
    return sprintf("%d B", size) if count == 0
    sprintf("%.2f %s", size, units[count])
  end

  # @return [Hash<String,String>] A hash containing the properties documented in the bag-info.txt tagfile
  def read_bag_info_txt
    read_tag_file("bag-info.txt")
  end

  # @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes, and the number of files,
  #   derived from the Payload-Oxum property
  # @raise [RuntimeError] if bag-info.txt has no Payload-Oxum property
  def info_payload_size
    oxum = read_bag_info_txt['Payload-Oxum']
    raise "Payload-Oxum not found in #{bag_pathname.join('bag-info.txt')}" unless oxum
    size_array = oxum.split('.')
    {:bytes => size_array[0].to_i, :files => size_array[1].to_i}
  end

  # @return [Boolean] Compare the actual measured payload size against the value recorded in bag-info.txt
  # @raise [RuntimeError] if the recorded and measured sizes differ
  def verify_payload_size
    info_size = info_payload_size
    bag_size = bag_payload_size
    if info_size != bag_size
      raise "Failed payload size verification! Expected: #{info_size}, Found: #{bag_size}"
    end
    true
  end

  # @return [Hash<String,FileFixity>] create hash containing ids and checksums for all files in the bag's root directory
  def generate_tagfile_checksums
    # get list of all files in the bag home dir, except those starting with 'tagmanifest'
    tagfiles = bag_pathname.children.reject { |file| file.basename.to_s.start_with?('tagmanifest') }
    # generate checksums, using bag home dir as the base directory for file ids (per bagit spec)
    Fixity.generate_checksums(bag_pathname, tagfiles, bag_checksum_types)
  end

  # @return [Hash<String,FileFixity>] create hash containing ids and checksums for all files in the bag's payload
  def generate_payload_checksums
    # get list of all files in the data directory
    path_list = payload_pathname.find
    # generate checksums, but use bag home dir as the base directory for file ids (per bagit spec)
    Fixity.generate_checksums(bag_pathname, path_list, bag_checksum_types)
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be updated
  # @param [Hash<String,FileFixity>] file_fixity_hash A hash containing file ids and fixity data
  # @param [String] open_mode The file open mode (default is 'a')
  # @return [Hash<Symbol,Pathname>] Update each of the manifests with data from the file_fixity_hash
  def write_manifest_checksums(manifest_type, file_fixity_hash, open_mode='a')
    bag_checksum_types.each_with_object({}) do |checksum_type, manifests|
      manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
      # block form guarantees the manifest is closed even if a write raises
      manifest_pathname.open(open_mode) do |manifest_file|
        file_fixity_hash.each_value do |fixity|
          checksum = fixity.get_checksum(checksum_type)
          manifest_file.puts("#{checksum} #{fixity.file_id}") if checksum
        end
      end
      manifests[checksum_type] = manifest_pathname
    end
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
  # @return [Hash<String,FileFixity>] A hash containing file ids and fixity data derived from the manifest files.
  #   As a side effect, the checksum types for which a manifest file was found are added to #bag_checksum_types
  def read_manifest_files(manifest_type)
    file_fixity_hash = {}
    checksum_type_list = []
    Fixity.valid_checksum_ids.each do |checksum_type|
      manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
      next unless manifest_pathname.file?
      checksum_type_list << checksum_type
      manifest_pathname.readlines.each do |line|
        # non-bang strip: chomp!/strip! return nil when nothing was removed (e.g. last line without newline)
        entry = line.strip
        next if entry.empty?
        checksum, file_id = entry.split(/[\s*]+/, 2)
        file_fixity = (file_fixity_hash[file_id] ||= FileFixity.new(file_id: file_id))
        file_fixity.set_checksum(checksum_type, checksum)
      end
    end
    self.bag_checksum_types = bag_checksum_types | checksum_type_list
    file_fixity_hash
  end

  # @return [Boolean] Compare fixity data from the tag manifest files against the values measured by digesting the files
  def verify_tagfile_manifests
    manifest_type = 'tagmanifest'
    manifest_fixity_hash = read_manifest_files(manifest_type)
    bag_fixity_hash = generate_tagfile_checksums
    verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  end

  # @return [Boolean] Compare fixity data from the payload manifest files against the values measured by digesting the files
  def verify_payload_manifests
    manifest_type = 'manifest'
    manifest_fixity_hash = read_manifest_files(manifest_type)
    bag_fixity_hash = generate_payload_checksums
    verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
  # @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
  # @return [Boolean] Compare fixity data from the manifest files against the values measured by digesting the files,
  #   returning true if equal
  # @raise [RuntimeError] if any differences are found
  def verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
    diff = manifest_diff(manifest_fixity_hash, bag_fixity_hash)
    if diff.size > 0
      raise "Failed #{manifest_type} verification! Differences: \n#{diff.inspect}"
    end
    true
  end

  # @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
  # @return [Hash] A report of the differences between the fixity data from the manifest files
  #   against the values measured by digesting the files
  def manifest_diff(manifest_fixity_hash, bag_fixity_hash)
    diff = {}
    (manifest_fixity_hash.keys | bag_fixity_hash.keys).each do |file_id|
      # a file missing from one side is represented by a fixity object with no checksums
      manifest_fixity = manifest_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
      bag_fixity = bag_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
      if manifest_fixity != bag_fixity
        diff[file_id] = manifest_fixity.diff(bag_fixity, 'manifest', 'bag')
      end
    end
    diff
  end

  # @return [Boolean] Validate the bag containing the digital object
  def verify_bag
    verify_bag_structure
    verify_tagfile_manifests
    verify_payload_size
    verify_payload_manifests
    true
  end

  # @return [Boolean] Test the existence of expected files, return true if files exist, raise exception if not
  def verify_bag_structure
    required_files = ['data', 'bagit.txt', 'bag-info.txt', 'manifest-sha256.txt', 'tagmanifest-sha256.txt']
    required_files.each { |filename| verify_pathname(bag_pathname.join(filename)) }
    true
  end

  # @param [Pathname] pathname The file whose existence should be verified
  # @return [Boolean] Test the existence of the specified path. Return true if file exists, raise exception if not
  def verify_pathname(pathname)
    raise "#{pathname.basename} not found at #{pathname}" unless pathname.exist?
    true
  end

  private

  # @param [String] filename The name of a "key: value" tag file located in the bag home directory
  # @return [Hash<String,String>] The properties parsed from the tag file; lines without a colon are ignored
  def read_tag_file(filename)
    bag_pathname.join(filename).readlines.each_with_object({}) do |line, properties|
      # non-bang strip: chomp!/strip! return nil when nothing was removed (e.g. last line without newline)
      key, value = line.strip.split(':', 2)
      properties[key.strip] = value.strip if value
    end
  end

end
351
+
352
+
353
+ end
@@ -0,0 +1,98 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # The fixity properties of a file, used to determine file content equivalence.
7
+ # Placing this data in a class by itself facilitates using the MD5, SHA1, etc checksums (and optionally the file size)
8
+ # as a single key when doing comparisons against other file instances. The design assumes that this file fixity
9
+ # is sufficiently unique to act as a comparator for determining file equality or verifying checksum manifests.
10
+ #
11
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
12
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
13
class FileFixity

  # @param [Hash<Symbol,Object>] options Key,Value pairs specifying initial values of attributes
  def initialize(options=nil)
    @checksums = {}
    options = {} if options.nil?
    # assign through the attribute writers so that only known attributes are accepted
    options.each do |key, value|
      send "#{key}=", value
    end
  end

  # @return [String] The name of the file, relative to its base directory
  #   (for payload files, path relative to the data folder. For tag files, path relative to the bag home folder)
  attr_accessor :file_id

  # @return [Integer] The size of the file in bytes
  attr_accessor :bytes

  # @return [Hash<Symbol,String>] The MD5, SHA1, SHA256, etc checksum values of the file
  attr_accessor :checksums

  # @param [Symbol,String] type The type of checksum (e.g. :md5, :sha1, :sha256)
  # @return [String] The value of the file digest
  def get_checksum(type)
    checksum_type = type.to_s.downcase.to_sym
    checksums[checksum_type]
  end

  # @param type [Symbol,String] The type of checksum
  # @param value [String] value of the file digest
  # @return [void] Set the value for the specified checksum type in the checksum hash
  def set_checksum(type, value)
    checksum_type = type.to_s.downcase.to_sym
    Fixity.validate_checksum_types(checksum_type)
    checksums[checksum_type] = value
  end

  # @param other [FileFixity] The other file fixity being compared to this fixity
  # @return [Boolean] Returns true if self and other share at least one checksum type and
  #   the values agree for every shared type. Returns false when there is no shared type,
  #   or when other does not expose checksums at all (e.g. nil).
  def eql?(other)
    return false unless other.respond_to?(:checksums)
    matching_checksum_types = checksums.keys & other.checksums.keys
    return false if matching_checksum_types.empty?
    matching_checksum_types.all? { |type| checksums[type] == other.checksums[type] }
  end

  # (see #eql?)
  def ==(other)
    eql?(other)
  end

  # @return [Integer] A hash-code computed from the file_id only (checksum values are not included).
  # @note The hash and eql? methods override the methods inherited from Object. See
  #   * {http://www.paulbutcher.com/2007/10/navigating-the-equality-maze/}
  #   * {http://techbot.me/2011/05/ruby-basics-equality-operators-ruby/}
  #   Also overriden is {#==} so that equality tests in other contexts will also return the expected result.
  # @note NOTE(review): {#eql?} compares checksums while this method hashes file_id, so two instances
  #   that are eql? but have different file_ids produce different hash codes -- confirm this is
  #   acceptable before using instances as Hash keys.
  def hash
    [file_id].hash
  end

  # @param [FileFixity] other The other FileFixity object being compared to this one
  # @param [String] left The label to use for values from this base FileFixity object
  # @param [String] right The label to use for values from the other FileFixity object
  # @return [Hash<Symbol,Hash<String,String>>,nil] details of the checksum differences between fixity objects,
  #   or nil if no differences were found
  def diff(other, left='base', right='other')
    diff_hash = {}
    matching_checksum_types = (checksums.keys & other.checksums.keys)
    # with no checksum type in common, report every type found on either side
    matching_checksum_types = (checksums.keys | other.checksums.keys) if matching_checksum_types.empty?
    matching_checksum_types.each do |type|
      base_checksum = checksums[type]
      other_checksum = other.checksums[type]
      if base_checksum != other_checksum
        diff_hash[type] = {left => base_checksum, right => other_checksum}
      end
    end
    diff_hash.empty? ? nil : diff_hash
  end

end
97
+
98
+ end
@@ -0,0 +1,155 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
module Archive

  # A Struct to hold properties of a given checksum digest type
  ChecksumType = Struct.new(:id, :hex_length, :names)

  # A helper class that facilitates the generation and processing of checksums
  #
  # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class Fixity

    @@default_checksum_types = [:sha1, :sha256]

    # @return [Array<Symbol>] The list of checksum types to be used when generating fixity data
    def self.default_checksum_types
      @@default_checksum_types
    end

    # @param [Array<Symbol>] types The list of checksum types to be used when generating fixity data
    # @return [Void] Set the list of checksum types to be used when generating fixity data
    def self.default_checksum_types=(*types)
      @@default_checksum_types = Fixity.validate_checksum_types(*types)
    end

    @@valid_checksum_types = [
      ChecksumType.new(:md5, 32, ['MD5']),
      ChecksumType.new(:sha1, 40, ['SHA-1', 'SHA1']),
      ChecksumType.new(:sha256, 64, ['SHA-256', 'SHA256']),
      ChecksumType.new(:sha384, 96, ['SHA-384', 'SHA384']),
      ChecksumType.new(:sha512, 128, ['SHA-512', 'SHA512'])
    ]

    # @return [Array<ChecksumType>] The list of allowed ChecksumType structs containing the type's properties
    def self.valid_checksum_types
      @@valid_checksum_types
    end

    # @return [Array<Symbol>] The list of allowed checksum types
    def self.valid_checksum_ids
      @@valid_checksum_types.map(&:id)
    end

    # @param [Array<Symbol>] types The list of checksum types being specified by the caller
    # @return [Array<Symbol>] The (flattened) list of specified checksum types after being checked for validity
    # @raise [RuntimeError] if any of the specified types is not one of {valid_checksum_ids}
    def self.validate_checksum_types(*types)
      checksum_types = types.flatten
      invalid_types = checksum_types - valid_checksum_ids
      raise "Invalid digest type specified: #{invalid_types.inspect}" unless invalid_types.empty?
      checksum_types
    end

    # @param [Array<Symbol>] checksum_types The list of checksum types being specified by the caller
    # @return [Hash<Symbol,Digest::Class>] The digest implementation objects that will generate the checksums,
    #   keyed by checksum type
    # @raise [RuntimeError] if a checksum type has no digest implementation
    def self.get_digesters(checksum_types=@@default_checksum_types)
      checksum_types.each_with_object({}) do |checksum_type, digesters|
        digesters[checksum_type] =
          case checksum_type
          when :md5    then Digest::MD5.new
          when :sha1   then Digest::SHA1.new
          when :sha256 then Digest::SHA2.new(256)
          when :sha384 then Digest::SHA2.new(384)
          when :sha512 then Digest::SHA2.new(512)
          else raise "Unrecognized checksum type: #{checksum_type}"
          end
      end
    end

    # @param pathname [Pathname] The location of the file to be digested
    # @param [Pathname] base_pathname The base directory from which relative paths (file IDS) will be derived
    # @param [Array<Symbol>] checksum_types The list of checksum types being specified by the caller (or default list)
    # @return [FileFixity] Generate a FileFixity instance containing fixity properties measured from a physical file
    def self.fixity_from_file(pathname, base_pathname, checksum_types=@@default_checksum_types)
      file_fixity = FileFixity.new
      file_fixity.file_id = pathname.relative_path_from(base_pathname).to_s
      file_fixity.bytes = pathname.size
      digesters = Fixity.get_digesters(checksum_types)
      # binary mode: digests must be computed over the raw bytes, without newline or encoding translation
      pathname.open("rb") do |stream|
        while (buffer = stream.read(8192))
          digesters.each_value { |digest| digest.update(buffer) }
        end
      end
      digesters.each { |checksum_type, digest| file_fixity.checksums[checksum_type] = digest.hexdigest }
      file_fixity
    end

    # @param [Pathname] base_pathname The directory path used as the base for deriving relative paths (file IDs)
    # @param [Array<Pathname>,nil] path_list The list of pathnames for files whose fixity will be generated
    #   (if nil, everything found under base_pathname); entries that are not regular files are ignored
    # @param [Array<Symbol>] checksum_types The list of checksum types being specified by the caller (or default list)
    # @return [Hash<String,FileFixity>] A hash containing file ids and fixity data derived from the actual files
    def self.generate_checksums(base_pathname, path_list, checksum_types=@@default_checksum_types)
      path_list = base_pathname.find if path_list.nil?
      path_list.select(&:file?).each_with_object({}) do |file, file_fixity_hash|
        file_fixity = Fixity.fixity_from_file(file, base_pathname, checksum_types)
        file_fixity_hash[file_fixity.file_id] = file_fixity
      end
    end

    # @param [Integer] length The length of the checksum value in hex format
    # @return [ChecksumType,nil] The ChecksumType struct that contains the properties of the matching checksum type,
    #   or nil if no known type has that length
    def self.type_for_length(length)
      @@valid_checksum_types.find { |type| type.hex_length == length }
    end

    # @param [Object] file_id The filename or relative path of the file from its base directory
    # @param [Array<String>] checksum_values The digest values of the file (the type of each is inferred from its length)
    # @return [FileFixity] Generate a FileFixity instance containing fixity properties supplied by the caller
    # @raise [RuntimeError] if the length of a digest value does not match any known checksum type
    def self.fixity_from_checksum_values(file_id, checksum_values)
      file_fixity = FileFixity.new
      file_fixity.file_id = file_id
      checksum_values.each do |digest|
        checksum_type = Fixity.type_for_length(digest.length)
        raise "Unrecognized checksum length (#{digest.length}) for value: #{digest}" unless checksum_type
        file_fixity.checksums[checksum_type.id] = digest
      end
      file_fixity
    end

    # @param [Hash<String,FileFixity>] file_fixity_hash A hash containing file ids and fixity data derived from the manifest files
    # @return [Hash<String,Hash<Symbol,String>>] A hash containing file ids and checksum data derived from the file_fixity_hash
    def self.file_checksum_hash(file_fixity_hash)
      file_fixity_hash.values.each_with_object({}) do |file, checksum_hash|
        checksum_hash[file.file_id] = file.checksums
      end
    end

    # @param [Symbol,String] checksum_type The type of checksum digest to be generated
    # @param [Pathname,String] file_pathname The location of the file to digest
    # @return [String] The operating system shell command that will generate the checksum digest value
    # @note NOTE(review): file_pathname is interpolated unquoted, so a path containing spaces or
    #   shell metacharacters will not survive the shell -- confirm callers only pass safe paths.
    def self.openssl_digest_command(checksum_type, file_pathname)
      "openssl dgst -#{checksum_type} #{file_pathname}"
    end

    # @param [Symbol,String] checksum_type The type of checksum digest to be generated
    # @param [Pathname,String] file_pathname The location of the file to digest
    # @return [String] The checksum digest value for the file (the last alphanumeric token of the command output)
    def self.openssl_digest(checksum_type, file_pathname)
      command = openssl_digest_command(checksum_type, file_pathname)
      stdout = OperatingSystem.execute(command)
      stdout.scan(/[A-Za-z0-9]+/).last
    end

  end

end
@@ -0,0 +1,33 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # A wrapper class around the systemu gem that is used for shelling out to the operating system
7
+ # and executing a command
8
+ #
9
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
10
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
11
class OperatingSystem

  # Executes a system command in a subprocess.
  # The method will return stdout from the command if execution was successful.
  # The method will raise an exception if execution fails.
  # The exception's message will contain the explanation of the failure.
  # @param [String] command the command to be executed
  # @return [String] stdout from the command if execution was successful
  # @raise [RuntimeError] if the command exits with a non-zero status or cannot be launched
  def self.execute(command)
    status, stdout, stderr = systemu(command)
    raise stderr if status.exitstatus != 0
    stdout
  rescue => e
    # stderr is still nil when systemu itself raised before returning;
    # fall back to the exception message so the real cause is reported
    detail = stderr.nil? ? e.message : stderr
    msg = "Command failed to execute: [#{command}] caused by <STDERR = #{detail.split($/).join('; ')}>"
    msg << " STDOUT = #{stdout.split($/).join('; ')}" if stdout && stdout.length > 0
    raise msg
  end

end
32
+
33
+ end
@@ -0,0 +1,160 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
# Needed for Shellwords.escape, which the command builders below use.
require 'shellwords'

module Archive

  # A tar archive file containing a set of digital object files.
  # Instances hold the paths and options for one archive and build the `tar`
  # command lines that create, list, or extract it; the commands are run
  # through OperatingSystem.execute.
  #
  # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class Tarfile

    # @return [Symbol,String] create archive of the specified format
    #   * gnu = GNU tar 1.13.x format
    #   * posix = POSIX 1003.1-2001 (pax) format
    attr_accessor :format

    # @return [Boolean] Follow symlinks and archive the files they point to
    attr_accessor :dereference

    # @return [Boolean] Verify that files were copied faithfully
    attr_accessor :verify

    # @return [Boolean] Create/list/extract multi-volume archive (not yet implemented)
    attr_accessor :multi_volume

    # @return [Pathname,nil] the full path of the tar archive file to be created or extracted from
    attr_reader :tarfile_fullpath

    # @return [Pathname,nil] The directory that is the basis of relative paths
    attr_reader :source_basepath

    # @param [Hash<Symbol,Object>] options Key,Value pairs specifying initial values of attributes
    # @return [Tarfile] Initialize a new Tarfile object
    def initialize(options=nil)
      # set defaults
      @format = :posix
      @dereference = true
      @verify = false
      @multi_volume = false
      # override defaults through the attribute writers, so path options are normalized too
      (options || {}).each { |key, value| send("#{key}=", value) }
    end

    # @return [Pathname] The full path of the ancestor dir in which the tar file resides
    # @raise [RuntimeError] if the base path has not been set
    def tarfile_basepath
      raise "Tarfile basepath is nil" unless @tarfile_basepath
      @tarfile_basepath
    end

    # @param [Pathname,String] basepath The full path of the ancestor dir in which the tar file resides
    # @return [Void] Set the full path of the ancestor dir in which the tar file resides
    # @raise [RuntimeError] if basepath is nil
    def tarfile_basepath=(basepath)
      raise "No pathname specified" unless basepath
      @tarfile_basepath = Pathname(basepath).expand_path
    end

    # @param [Pathname,String] fullpath The full path of tar file
    # @return [Void] Sets the full path of tar file
    def tarfile_fullpath=(fullpath)
      @tarfile_fullpath = Pathname(fullpath).expand_path
    end

    # @return [String] The id (path relative to basepath) of the tar file
    # @raise [RuntimeError] if the tar file's full path or base path has not been set
    def tarfile_relative_path
      raise "Tarfile fullpath is nil" unless @tarfile_fullpath
      # go through the validating getter so a missing base path yields a clear error
      @tarfile_fullpath.relative_path_from(tarfile_basepath).to_s
    end

    # @return [Pathname] The full path of the source file or directory being archived
    # @raise [RuntimeError] if the source path has not been set
    def source_fullpath
      raise "Source pathname is nil" unless @source_pathname
      @source_pathname
    end

    # @param [Pathname,String] source The full path of the source file or directory being archived
    # @return [Void] Set the full path of the source file or directory being archived
    # @raise [RuntimeError] if source is nil
    def source_fullpath=(source)
      raise "No pathname specified" unless source
      @source_pathname = Pathname(source).expand_path
    end

    # @param [Pathname,String] base The directory that is the basis of relative paths
    # @return [Void] Set the base path of the source file or directory being archived
    # @raise [RuntimeError] if base is nil
    def source_basepath=(base)
      raise "No pathname specified" unless base
      @source_basepath = Pathname(base).expand_path
    end

    # @return [Pathname] The relative path from the source base directory to the source directory
    def source_relative_path
      source_fullpath.relative_path_from(source_basepath)
    end

    # Builds the `tar --create` command line. When a source base path is set,
    # tar is told to change into it and the source is named relative to it;
    # otherwise the source is named by its full path.
    # @return [String] The shell command string to be used to create the tarfile
    # @raise [RuntimeError] if the tar file path or the source path has not been set
    def create_cmd
      command = "tar --create --file=#{escaped_tarfile_fullpath} --format=#{@format} "
      command << "--dereference " if @dereference
      command << "--verify " if @verify
      if source_basepath
        command << "--directory=#{single_quoted(source_basepath)} "
        command << Shellwords.escape(source_relative_path.to_s)
      else
        # no base directory to be relative to, so name the source by its full path
        command << Shellwords.escape(source_fullpath.to_s)
      end
      command
    end

    # @return [Tarfile] Shell out to the operating system and create the tar archive file
    def create_tarfile
      OperatingSystem.execute(create_cmd)
      self
    end

    # @return [String] The shell command that will list the tarfile's contents
    # @raise [RuntimeError] if the tar file path has not been set
    def list_cmd
      "tar --list --file=#{escaped_tarfile_fullpath} "
    end

    # @return [String] The list of the tarfile's contents
    def list_tarfile
      OperatingSystem.execute(list_cmd)
    end

    # @return [Pathname] The location of the directory into which the tarfile should be extracted
    # @raise [RuntimeError] if the target path has not been set
    def target_pathname
      raise "Target pathname is nil" unless @target_pathname
      @target_pathname
    end

    # @param [Pathname,String] target The location of the directory into which the tarfile should be extracted
    # @return [Void] Set the location of the directory into which the tarfile should be extracted
    # @raise [RuntimeError] if target is nil
    def target_pathname=(target)
      raise "No target pathname specified" unless target
      @target_pathname = Pathname(target).expand_path
    end

    # @return [String] The shell command that will extract the tarfile's contents
    # @raise [RuntimeError] if the tar file path or the target directory has not been set
    def extract_cmd
      command = "tar --extract --file=#{escaped_tarfile_fullpath} "
      # target_pathname raises when unset, so a target directory is always required here
      command << "--directory=#{single_quoted(target_pathname)} "
      command
    end

    # @return [String] Shell out to the operating system and extract the tar archive file
    def extract_tarfile
      OperatingSystem.execute(extract_cmd)
    end

    private

    # @return [String] The tar file's full path, escaped for use as one shell word
    # @raise [RuntimeError] if the tar file path has not been set
    def escaped_tarfile_fullpath
      raise "Tarfile fullpath is nil" unless @tarfile_fullpath
      Shellwords.escape(@tarfile_fullpath.to_s)
    end

    # @param [Pathname,String] path The path to quote
    # @return [String] The path wrapped in single quotes, with embedded single quotes escaped
    def single_quoted(path)
      "'" + path.to_s.gsub("'") { "'\\''" } + "'"
    end

  end

end
data/lib/libdir.rb ADDED
@@ -0,0 +1,3 @@
1
# Make sure the directory containing this file is on Ruby's load path,
# adding it to the front only if it is not already present.
lib_root = File.expand_path(File.dirname(__FILE__))
$LOAD_PATH.include?(lib_root) || $LOAD_PATH.unshift(lib_root)
3
+
metadata ADDED
@@ -0,0 +1,194 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: archive-utils
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Darren Weber
8
+ - Richard Anderson
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-10-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: json_pure
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.8'
28
+ - !ruby/object:Gem::Dependency
29
+ name: systemu
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '2.6'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '2.6'
42
+ - !ruby/object:Gem::Dependency
43
+ name: pry
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rake
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: '10'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '10'
70
+ - !ruby/object:Gem::Dependency
71
+ name: awesome_print
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '1'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '1'
84
+ - !ruby/object:Gem::Dependency
85
+ name: equivalent-xml
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '0.5'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '0.5'
98
+ - !ruby/object:Gem::Dependency
99
+ name: fakeweb
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - "~>"
103
+ - !ruby/object:Gem::Version
104
+ version: '1'
105
+ type: :development
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - "~>"
110
+ - !ruby/object:Gem::Version
111
+ version: '1'
112
+ - !ruby/object:Gem::Dependency
113
+ name: rspec
114
+ requirement: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - "~>"
117
+ - !ruby/object:Gem::Version
118
+ version: '2.0'
119
+ type: :development
120
+ prerelease: false
121
+ version_requirements: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - "~>"
124
+ - !ruby/object:Gem::Version
125
+ version: '2.0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: simplecov
128
+ requirement: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - "~>"
131
+ - !ruby/object:Gem::Version
132
+ version: '0.7'
133
+ type: :development
134
+ prerelease: false
135
+ version_requirements: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: '0.7'
140
+ - !ruby/object:Gem::Dependency
141
+ name: yard
142
+ requirement: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - "~>"
145
+ - !ruby/object:Gem::Version
146
+ version: '0.8'
147
+ type: :development
148
+ prerelease: false
149
+ version_requirements: !ruby/object:Gem::Requirement
150
+ requirements:
151
+ - - "~>"
152
+ - !ruby/object:Gem::Version
153
+ version: '0.8'
154
+ description: Contains classes to archive and retrieve digital object version content
155
+ and metadata
156
+ email:
157
+ - darren.weber@stanford.edu
158
+ executables: []
159
+ extensions: []
160
+ extra_rdoc_files: []
161
+ files:
162
+ - lib/archive-utils.rb
163
+ - lib/archive/bagit_bag.rb
164
+ - lib/archive/file_fixity.rb
165
+ - lib/archive/fixity.rb
166
+ - lib/archive/operating_system.rb
167
+ - lib/archive/tarfile.rb
168
+ - lib/libdir.rb
169
+ homepage: https://github.com/sul-dlss/archive-utils
170
+ licenses:
171
+ - Apache-2.0
172
+ metadata: {}
173
+ post_install_message:
174
+ rdoc_options: []
175
+ require_paths:
176
+ - lib
177
+ required_ruby_version: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - ">="
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
182
+ required_rubygems_version: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: 2.2.1
187
+ requirements: []
188
+ rubyforge_project:
189
+ rubygems_version: 2.4.2
190
+ signing_key:
191
+ specification_version: 4
192
+ summary: Ruby utilities for data archival (BagIt, Fixity, Tarfile).
193
+ test_files: []
194
+ has_rdoc: