archive-utils 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 52a10a6b4f10f5b140e6c47e9d88dd25d4407610
4
+ data.tar.gz: 374996e4a353ff7876c4717bb5b73757bae24420
5
+ SHA512:
6
+ metadata.gz: 6f60a79a35a425ed109f633a0ed1b3b066c97b806c00e928f33d17fd448b018be18f4f08d1c46c9f91b2f031aa0defadb53f58f622a4502d57a524be55ad6219
7
+ data.tar.gz: 3498dfacc552e52d3db4d0910b56eaa0da8ceac2b47059e15976744f0e2c18c167c82657894a1cffaba35789b35435f8c6251f5d65f0436a52c860b56d9f9186
@@ -0,0 +1,23 @@
1
require 'rubygems'
require 'bundler/setup'
Bundler.setup
require 'digest'
require 'fileutils'
require 'find'
require 'json/pure'
require 'pathname'
require 'shellwords'
require 'systemu'
9
+
10
+ # Should remove these dependencies from sdr-archive
11
+ #require 'moab_stanford'
12
+ #require 'rest-client'
13
+
14
+ module Archive
15
+ end
16
+
17
+ require 'archive/bagit_bag'
18
+ require 'archive/file_fixity'
19
+ require 'archive/fixity'
20
+ require 'archive/operating_system'
21
+ require 'archive/tarfile'
22
+ include Archive
23
+
@@ -0,0 +1,353 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # A BagIt bag contains a structured copy of a digital object for storage, transfer, or archive
7
+ # @see https://tools.ietf.org/html/draft-kunze-bagit-10
8
+ # This class can be used to create, parse, or validate a bag instance
9
+ #
10
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
11
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
12
class BagitBag

  # Create a brand-new bag on disk.
  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [BagitBag] A new bag whose home and payload folders exist and whose bagit.txt file has been written
  def self.create_bag(pathname)
    bag = BagitBag.new
    bag.bag_pathname = pathname
    bag.payload_pathname.mkpath
    bag.write_bagit_txt
    bag
  end

  # Open a bag that already exists on disk.
  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [BagitBag] A bag object pointing at the existing bag home directory
  # @raise [RuntimeError] if the home directory or its bagit.txt file does not exist
  def self.open_bag(pathname)
    bag = BagitBag.new
    bag.bag_pathname = pathname
    raise "No bag found at #{bag.bag_pathname}" unless bag.bag_pathname.exist?
    bagit_txt = bag.bag_pathname.join("bagit.txt")
    raise "No bagit.txt file found at #{bagit_txt}" unless bagit_txt.exist?
    bag
  end

  # @return [Pathname] The location of the bag home directory
  attr_reader :bag_pathname

  # @param [Pathname,String] pathname The location of the bag home directory
  # @return [Void] Set the location of the bag home directory
  def bag_pathname=(pathname)
    @bag_pathname = Pathname(pathname)
  end

  # @return [Pathname] The location of the bag data directory
  def payload_pathname
    bag_pathname.join('data')
  end

  # @return [Pathname] Generate the bagit.txt tag file
  def write_bagit_txt
    bagit_txt = bag_pathname.join("bagit.txt")
    bagit_txt.open('w') do |f|
      f.puts "Tag-File-Character-Encoding: UTF-8"
      f.puts "BagIt-Version: 0.97"
    end
    bagit_txt
  end

  # @return [Hash<String,String>] A hash containing the properties documented in the bagit.txt tagfile
  def read_bagit_txt
    read_tag_file(bag_pathname.join("bagit.txt"))
  end

  # @return [Array<Symbol>] The list of checksum types to be used when generating fixity data
  def bag_checksum_types
    @bag_checksum_types ||= Fixity.default_checksum_types
  end

  # @param [Object] types The list of checksum types to be used when generating fixity data
  # @return [Void] Set the list of checksum types to be used when generating fixity data
  def bag_checksum_types=(*types)
    @bag_checksum_types = Fixity.validate_checksum_types(*types)
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname,String] source_dir The source location of the directory whose contents are to be bagged
  # @return [Pathname] Generate file_fixity_hash and send it to #add_files_to_payload
  def add_dir_to_payload(link_mode, source_dir)
    source_dir = Pathname(source_dir)
    file_fixity_hash = Fixity.generate_checksums(source_dir, source_dir.find, bag_checksum_types)
    add_files_to_payload(link_mode, source_dir, file_fixity_hash)
    payload_pathname
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname,String] source_basepath The source location of the directory whose contents are to be ingested
  # @param [Hash<String,FileFixity>] file_fixity_hash The list of files (with fixity data) to be added to the payload
  # @return [Pathname] Copy or link the files specified in the file_fixity_hash to the payload directory,
  #   then update the payload manifest files
  def add_files_to_payload(link_mode, source_basepath, file_fixity_hash)
    source_basepath = Pathname(source_basepath)
    file_fixity_hash.each_key do |file_id|
      source_pathname = source_basepath.join(file_id)
      target_pathname = payload_pathname.join(file_id)
      copy_file(link_mode, source_pathname, target_pathname)
    end
    write_manifest_checksums('manifest', add_data_prefix(file_fixity_hash))
    payload_pathname
  end

  # @param [Hash<String,FileFixity>] file_fixity_hash key is file_id, values are Fixity objects containing checksums
  # @return [Hash<String,FileFixity>] A new hash whose keys and file_ids are prefixed with 'data/'.
  #   The fixity objects passed in are left untouched (shallow copies are prefixed instead),
  #   so calling this twice with the same input does not produce 'data/data/...' ids.
  def add_data_prefix(file_fixity_hash)
    file_fixity_hash.values.each_with_object({}) do |fixity, prefixed_hash|
      prefixed = fixity.dup
      prefixed.file_id = "data/#{fixity.file_id}"
      prefixed_hash[prefixed.file_id] = prefixed
    end
  end

  # @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
  # @param [Pathname] source_pathname The source location of the file to be ingested
  # @param [Pathname] target_pathname The full path at which the file will be placed
  # @return [Pathname] link or copy the specified file from source location to the target location
  # @raise [RuntimeError] if link_mode is not one of :copy, :link, :symlink (nil is treated as :copy)
  def copy_file(link_mode, source_pathname, target_pathname)
    target_pathname.parent.mkpath
    case link_mode
    when :copy, nil
      FileUtils.copy(source_pathname.to_s, target_pathname.to_s) # automatically dereferences symlinks
    when :link
      FileUtils.link(source_pathname.to_s, target_pathname.to_s) #, :force => true (false is default)
    when :symlink
      FileUtils.symlink(source_pathname.to_s, target_pathname.to_s) #, :force => true (false is default)
    else
      raise "Invalid link_mode: #{link_mode}, expected one of [:copy,:link,:symlink]"
    end
    target_pathname
  end

  # @param [String] tarfile_id The name of the tar file to be created inside the payload directory
  # @param [Pathname,String] source_fullpath The location of the directory whose content will be tarred
  # @param [Pathname,String] source_basepath The location of the directory to change to before doing the tar create
  # @return [Tarfile] Create a tar archive of a directory into the payload directory,
  #   then record the checksums of the tar file in the payload manifests
  def add_payload_tarfile(tarfile_id, source_fullpath, source_basepath)
    tarfile = Tarfile.new
    tarfile.source_basepath = Pathname(source_basepath)
    tarfile.source_fullpath = Pathname(source_fullpath)
    tarfile.tarfile_basepath = payload_pathname
    tarfile.tarfile_fullpath = payload_pathname.join(tarfile_id.to_s)
    tarfile.create_tarfile
    file_fixity_hash = Fixity.generate_checksums(bag_pathname, [tarfile.tarfile_fullpath], bag_checksum_types)
    write_manifest_checksums('manifest', file_fixity_hash)
    tarfile
  end

  # @return [Pathname] Generate the bag-info.txt tag file to record the payload size
  def write_bag_info_txt
    payload_size = bag_payload_size
    bag_info_txt = bag_pathname.join("bag-info.txt")
    bag_info_txt.open('w') do |f|
      f.puts "External-Identifier: #{bag_pathname.basename}"
      f.puts "Payload-Oxum: #{payload_size[:bytes]}.#{payload_size[:files]}"
      f.puts "Bag-Size: #{bag_size_human(payload_size[:bytes])}"
    end
    bag_info_txt
  end

  # @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes, and the number of files,
  #   derived from the payload directory contents
  def bag_payload_size
    payload_pathname.find.select(&:file?).each_with_object({bytes: 0, files: 0}) do |file, totals|
      totals[:bytes] += file.size
      totals[:files] += 1
    end
  end

  # @param [Integer] bytes The total number of bytes in the payload
  # @return [String] Human-readable rendition of the total payload size.
  #   Values below 1024 are printed as whole bytes; larger values use two decimals and cap at TB.
  def bag_size_human(bytes)
    units = %w[B KB MB GB TB]
    count = 0
    size = bytes
    while size >= 1024 && count < units.size - 1
      size /= 1024.0
      count += 1
    end
    return sprintf("%d B", size) if count == 0
    sprintf("%.2f %s", size, units[count])
  end

  # @return [Hash<String,String>] A hash containing the properties documented in the bag-info.txt tagfile
  def read_bag_info_txt
    read_tag_file(bag_pathname.join("bag-info.txt"))
  end

  # @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes, and the number of files,
  #   derived from the Payload-Oxum property
  # @raise [RuntimeError] if bag-info.txt has no Payload-Oxum property
  def info_payload_size
    oxum = read_bag_info_txt['Payload-Oxum']
    raise "Payload-Oxum not found in #{bag_pathname.join('bag-info.txt')}" unless oxum
    bytes, files = oxum.split('.')
    {:bytes => bytes.to_i, :files => files.to_i}
  end

  # @return [Boolean] Compare the actual measured payload size against the value recorded in bag-info.txt
  # @raise [RuntimeError] if the recorded and measured sizes differ
  def verify_payload_size
    info_size = info_payload_size
    bag_size = bag_payload_size
    if info_size != bag_size
      raise "Failed payload size verification! Expected: #{info_size}, Found: #{bag_size}"
    end
    true
  end

  # @return [Hash<String,FileFixity>] create hash containing ids and checksums for all files in the bag's root directory
  def generate_tagfile_checksums
    # get list of all files in the bag home dir, except those starting with 'tagmanifest'
    tagfiles = bag_pathname.children.reject { |file| file.basename.to_s.start_with?('tagmanifest') }
    # generate checksums, using bag home dir as the base directory for file ids (per bagit spec)
    Fixity.generate_checksums(bag_pathname, tagfiles, bag_checksum_types)
  end

  # @return [Hash<String,FileFixity>] create hash containing ids and checksums for all files in the bag's payload
  def generate_payload_checksums
    # get list of all files in the data directory
    path_list = payload_pathname.find
    # generate checksums, but use bag home dir as the base directory for file ids (per bagit spec)
    Fixity.generate_checksums(bag_pathname, path_list, bag_checksum_types)
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be updated
  # @param [Hash<String,FileFixity>] file_fixity_hash A hash containing file ids and fixity data
  # @param [String] open_mode The file open mode (default is 'a')
  # @return [Hash<Symbol,Pathname>] Update each of the manifests with data from the file_fixity_hash
  def write_manifest_checksums(manifest_type, file_fixity_hash, open_mode='a')
    bag_checksum_types.each_with_object({}) do |checksum_type, manifests|
      manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
      # block form guarantees the handle is closed even if a fixity lookup raises
      manifest_pathname.open(open_mode) do |manifest_file|
        file_fixity_hash.each_value do |fixity|
          checksum = fixity.get_checksum(checksum_type)
          manifest_file.puts("#{checksum} #{fixity.file_id}") if checksum
        end
      end
      manifests[checksum_type] = manifest_pathname
    end
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
  # @return [Hash<String,FileFixity>] A hash containing file ids and fixity data derived from the manifest files.
  #   As a side effect, every checksum type for which a manifest file exists is added to #bag_checksum_types
  def read_manifest_files(manifest_type)
    file_fixity_hash = {}
    checksum_type_list = []
    Fixity.valid_checksum_ids.each do |checksum_type|
      manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
      next unless manifest_pathname.file?
      checksum_type_list << checksum_type
      manifest_pathname.readlines.each do |line|
        # strip (non-bang) is safe for a last line without a newline; chomp! would return nil there
        line = line.strip
        next if line.empty?
        # separator is whitespace, optionally followed by the '*' binary-mode marker
        checksum, file_id = line.split(/[\s*]+/, 2)
        file_fixity = (file_fixity_hash[file_id] ||= FileFixity.new(file_id: file_id))
        file_fixity.set_checksum(checksum_type, checksum)
      end
    end
    self.bag_checksum_types = bag_checksum_types | checksum_type_list
    file_fixity_hash
  end

  # @return [Boolean] Compare fixity data from the tag manifest files against the values measured by digesting the files
  def verify_tagfile_manifests
    manifest_type = 'tagmanifest'
    manifest_fixity_hash = read_manifest_files(manifest_type)
    bag_fixity_hash = generate_tagfile_checksums
    verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  end

  # @return [Boolean] Compare fixity data from the payload manifest files against the values measured by digesting the files
  def verify_payload_manifests
    manifest_type = 'manifest'
    manifest_fixity_hash = read_manifest_files(manifest_type)
    bag_fixity_hash = generate_payload_checksums
    verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  end

  # @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
  # @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
  # @return [Boolean] true when the manifest data and the measured data agree
  # @raise [RuntimeError] listing the differences when they do not agree
  def verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
    diff = manifest_diff(manifest_fixity_hash, bag_fixity_hash)
    if diff.size > 0
      raise "Failed #{manifest_type} verification! Differences: \n#{diff.inspect}"
    end
    true
  end

  # @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
  # @return [Hash] A report of the differences between the fixity data from the manifest files
  #   against the values measured by digesting the files
  def manifest_diff(manifest_fixity_hash, bag_fixity_hash)
    diff = {}
    (manifest_fixity_hash.keys | bag_fixity_hash.keys).each do |file_id|
      # a file missing on either side is represented by a fixity with no checksums
      manifest_fixity = manifest_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
      bag_fixity = bag_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
      if manifest_fixity != bag_fixity
        diff[file_id] = manifest_fixity.diff(bag_fixity, 'manifest', 'bag')
      end
    end
    diff
  end

  # @return [Boolean] Validate the bag containing the digital object
  def verify_bag
    verify_bag_structure
    verify_tagfile_manifests
    verify_payload_size
    verify_payload_manifests
    true
  end

  # @return [Boolean] Test the existence of expected files, return true if files exist, raise exception if not
  # @note The sha256 manifests are required regardless of the configured #bag_checksum_types
  def verify_bag_structure
    required_files = ['data', 'bagit.txt', 'bag-info.txt', 'manifest-sha256.txt', 'tagmanifest-sha256.txt']
    required_files.each { |filename| verify_pathname(bag_pathname.join(filename)) }
    true
  end

  # @param [Pathname] pathname The file whose existence should be verified
  # @return [Boolean] Test the existence of the specified path. Return true if file exists, raise exception if not
  def verify_pathname(pathname)
    raise "#{pathname.basename} not found at #{pathname}" unless pathname.exist?
    true
  end

  private

  # @param [Pathname] tag_pathname The location of a "Key: value" style tag file
  # @return [Hash<String,String>] The properties found in the file; lines without a colon are ignored
  def read_tag_file(tag_pathname)
    tag_pathname.readlines.each_with_object({}) do |line, properties|
      key, value = line.strip.split(':', 2)
      properties[key.strip] = value.strip if value
    end
  end

end
351
+
352
+
353
+ end
@@ -0,0 +1,98 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # The fixity properties of a file, used to determine file content equivalence.
7
+ # Placing this data in a class by itself facilitates using the MD5, SHA1, etc checksums (and optionally the file size)
8
+ # as a single key when doing comparisons against other file instances. The design assumes that this file fixity
9
+ # is sufficiently unique to act as a comparator for determining file equality or verifying checksum manifests.
10
+ #
11
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
12
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
13
class FileFixity

  # @param [Hash<Symbol,Object>] options Key,Value pairs specifying initial values of attributes.
  #   Each key must name a public attribute writer of this class (file_id, bytes, checksums).
  def initialize(options=nil)
    @checksums = {}
    options = {} if options.nil?
    options.each do |key, value|
      # public_send keeps option keys from reaching private methods
      public_send "#{key}=", value
    end
  end

  # @return [String] The name of the file, relative to its base directory
  #   (for payload files, path relative to the data folder. For tag files, path relative to the bag home folder)
  attr_accessor :file_id

  # @return [Integer] The size of the file in bytes
  attr_accessor :bytes

  # @return [Hash<Symbol,String>] The MD5, SHA1, SHA256, etc checksum values of the file
  attr_accessor :checksums

  # @param [Symbol,String] type The type of checksum (e.g. :md5, :sha1, :sha256)
  # @return [String,nil] The value of the file digest, or nil if that type has not been recorded
  def get_checksum(type)
    checksums[type.to_s.downcase.to_sym]
  end

  # @param type [Symbol,String] The type of checksum
  # @param value [String] value of the file digest
  # @return [void] Set the value for the specified checksum type in the checksum hash
  def set_checksum(type, value)
    checksum_type = type.to_s.downcase.to_sym
    Fixity.validate_checksum_types(checksum_type)
    checksums[checksum_type] = value
  end

  # @param other [FileFixity] The other file fixity being compared to this fixity
  # @return [Boolean] Returns true if self and other share at least one checksum type and
  #   agree on every checksum type they share. Returns false (instead of raising) when
  #   other is nil or does not carry checksums.
  def eql?(other)
    return false unless other.respond_to?(:checksums) && other.checksums
    matching_checksum_types = checksums.keys & other.checksums.keys
    return false if matching_checksum_types.empty?
    matching_checksum_types.all? { |type| checksums[type] == other.checksums[type] }
  end

  # (see #eql?)
  def ==(other)
    eql?(other)
  end

  # @return [Integer] Compute a hash-code from the file_id.
  # @note The hash and eql? methods override the methods inherited from Object. See
  #   * {http://www.paulbutcher.com/2007/10/navigating-the-equality-maze/}
  #   * {http://techbot.me/2011/05/ruby-basics-equality-operators-ruby/}
  #   Also overridden is {#==} so that equality tests in other contexts will also return the expected result.
  # @note NOTE(review): this value is derived from file_id while #eql? compares checksums, so two
  #   instances that are eql? but carry different file_ids will not collide as Hash keys. Left as-is
  #   because callers may depend on the current value; confirm the intended key semantics.
  def hash
    [file_id].hash
  end

  # @param [FileFixity] other The other FileFixity object being compared to this one
  # @param [String] left The label to use for values from this base FileFixity object
  # @param [String] right The label to use for values from the other FileFixity object
  # @return [Hash<Symbol,Hash<String,String>>,nil] details of the checksum differences between fixity objects,
  #   or nil when there are none
  def diff(other, left='base', right='other')
    # compare the shared types; if nothing is shared, report every type known to either side
    compared_types = checksums.keys & other.checksums.keys
    compared_types = checksums.keys | other.checksums.keys if compared_types.empty?
    diff_hash = compared_types.each_with_object({}) do |type, differences|
      base_checksum = checksums[type]
      other_checksum = other.checksums[type]
      if base_checksum != other_checksum
        differences[type] = {left => base_checksum, right => other_checksum}
      end
    end
    diff_hash.empty? ? nil : diff_hash
  end

end
97
+
98
+ end
@@ -0,0 +1,155 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # A Struct to hold properties of a given checksum digest type
7
ChecksumType = Struct.new(:id, :hex_length, :names)

# A helper class that facilitates the generation and processing of checksums
#
# @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
#   All rights reserved. See {file:LICENSE.rdoc} for details.
class Fixity

  @@default_checksum_types = [:sha1, :sha256]

  # @return [Array<Symbol>] The list of checksum types to be used when generating fixity data
  def self.default_checksum_types
    @@default_checksum_types
  end

  # @param [Array<Symbol>] types The list of checksum types to be used when generating fixity data
  # @return [Void] Set the list of checksum types to be used when generating fixity data
  # @raise [RuntimeError] if any of the types is not a valid checksum id
  def self.default_checksum_types=(*types)
    @@default_checksum_types = Fixity.validate_checksum_types(*types)
  end

  @@valid_checksum_types = [
    ChecksumType.new(:md5, 32, ['MD5']),
    ChecksumType.new(:sha1, 40, ['SHA-1', 'SHA1']),
    ChecksumType.new(:sha256, 64, ['SHA-256', 'SHA256']),
    ChecksumType.new(:sha384, 96, ['SHA-384', 'SHA384']),
    ChecksumType.new(:sha512, 128, ['SHA-512', 'SHA512'])
  ]

  # @return [Array<ChecksumType>] The list of allowed ChecksumType structs containing the type's properties
  def self.valid_checksum_types
    @@valid_checksum_types
  end

  # @return [Array<Symbol>] The list of allowed checksum types
  def self.valid_checksum_ids
    @@valid_checksum_types.map(&:id)
  end

  # @param [Array<Symbol>] types The list of checksum types being specified by the caller
  #   (may be given as separate arguments, as one array, or nested)
  # @return [Array<Symbol>] The flattened list of specified checksum types after being checked for validity
  # @raise [RuntimeError] naming every type that is not in #valid_checksum_ids
  def self.validate_checksum_types(*types)
    checksum_types = types.flatten
    invalid_types = checksum_types - valid_checksum_ids
    raise "Invalid digest type specified: #{invalid_types.inspect}" unless invalid_types.empty?
    checksum_types
  end

  # @param [Array<Symbol>] checksum_types The list of checksum types being specified by the caller
  # @return [Hash<Symbol,Digest::Class>] The digest implementation objects that will generate the checksums,
  #   keyed by checksum type
  # @raise [RuntimeError] if a checksum type has no digest implementation
  def self.get_digesters(checksum_types=@@default_checksum_types)
    checksum_types.each_with_object({}) do |checksum_type, digesters|
      digesters[checksum_type] =
        case checksum_type
        when :md5 then Digest::MD5.new
        when :sha1 then Digest::SHA1.new
        when :sha256 then Digest::SHA2.new(256)
        when :sha384 then Digest::SHA2.new(384)
        when :sha512 then Digest::SHA2.new(512)
        else raise "Unrecognized checksum type: #{checksum_type}"
        end
    end
  end

  # @param pathname [Pathname,String] The location of the file to be digested
  # @param [Pathname,String] base_pathname The base directory from which relative paths (file IDS) will be derived
  # @param [Array<Symbol>] checksum_types The list of checksum types being specified by the caller (or default list)
  # @return [FileFixity] Generate a FileFixity instance containing fixity properties measured from a physical file
  def self.fixity_from_file(pathname, base_pathname, checksum_types=@@default_checksum_types)
    pathname = Pathname(pathname)
    file_fixity = FileFixity.new
    file_fixity.file_id = pathname.relative_path_from(Pathname(base_pathname)).to_s
    file_fixity.bytes = pathname.size
    digesters = Fixity.get_digesters(checksum_types)
    # binary mode: digests must be computed over the raw bytes, with no newline/encoding translation
    pathname.open("rb") do |stream|
      while (buffer = stream.read(8192))
        digesters.each_value { |digest| digest.update(buffer) }
      end
    end
    digesters.each { |checksum_type, digest| file_fixity.checksums[checksum_type] = digest.hexdigest }
    file_fixity
  end

  # @param [Pathname,String] base_pathname The directory path used as the base for deriving relative paths (file IDs)
  # @param [Enumerable<Pathname,String>,nil] path_list The list of pathnames for files whose fixity will be generated.
  #   When nil, everything found under base_pathname is used. Entries that are not regular files are ignored.
  # @param [Array<Symbol>] checksum_types The list of checksum types being specified by the caller (or default list)
  # @return [Hash<String,FileFixity>] A hash containing file ids and fixity data derived from the actual files
  def self.generate_checksums(base_pathname, path_list, checksum_types=@@default_checksum_types)
    base_pathname = Pathname(base_pathname)
    path_list = base_pathname.find if path_list.nil?
    files = path_list.map { |path| Pathname(path) }.select(&:file?)
    files.each_with_object({}) do |file, file_fixity_hash|
      file_fixity = Fixity.fixity_from_file(file, base_pathname, checksum_types)
      file_fixity_hash[file_fixity.file_id] = file_fixity
    end
  end

  # @param [Integer] length The length of the checksum value in hex format
  # @return [ChecksumType,nil] The ChecksumType struct that contains the properties of the matching checksum type,
  #   or nil if no known type has that length
  def self.type_for_length(length)
    @@valid_checksum_types.find { |type| type.hex_length == length }
  end

  # @param [String] file_id The filename or relative path of the file from its base directory
  # @param [Array<String>] checksum_values The hex digest values of the file; each type is inferred from its length
  # @return [FileFixity] Generate a FileFixity instance containing fixity properties supplied by the caller
  # @raise [RuntimeError] if a digest's length does not match any known checksum type
  def self.fixity_from_checksum_values(file_id, checksum_values)
    file_fixity = FileFixity.new
    file_fixity.file_id = file_id
    checksum_values.each do |digest|
      checksum_type = Fixity.type_for_length(digest.length)
      raise "Unrecognized checksum length (#{digest.length}) for #{file_id}: #{digest}" if checksum_type.nil?
      file_fixity.checksums[checksum_type.id] = digest
    end
    file_fixity
  end

  # @param [Hash<String,FileFixity>] file_fixity_hash A hash containing file ids and fixity data derived from the manifest files
  # @return [Hash<String,Hash<Symbol,String>>] A hash containing file ids and checksum data derived from the file_fixity_hash
  def self.file_checksum_hash(file_fixity_hash)
    file_fixity_hash.values.each_with_object({}) do |file, checksum_hash|
      checksum_hash[file.file_id] = file.checksums
    end
  end

  # @param [Symbol,String] checksum_type The type of checksum digest to be generated
  # @param [Pathname,String] file_pathname The location of the file to digest
  # @return [String] The operating system shell command that will generate the checksum digest value.
  #   The path is shell-escaped so that names containing spaces or shell metacharacters
  #   are passed to openssl as a single argument; ordinary paths are emitted unchanged.
  def self.openssl_digest_command(checksum_type, file_pathname)
    "openssl dgst -#{checksum_type} #{Shellwords.escape(file_pathname.to_s)}"
  end

  # @param [Symbol,String] checksum_type The type of checksum digest to be generated
  # @param [Pathname,String] file_pathname The location of the file to digest
  # @return [String] The checksum digest value for the file (the last alphanumeric token of the command output)
  def self.openssl_digest(checksum_type, file_pathname)
    command = openssl_digest_command(checksum_type, file_pathname)
    stdout = OperatingSystem.execute(command)
    stdout.scan(/[A-Za-z0-9]+/).last
  end

end
154
+
155
+ end
@@ -0,0 +1,33 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # A wrapper class around the systemu gem that is used for shelling out to the operating system
7
+ # and executing a command
8
+ #
9
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
10
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
11
class OperatingSystem

  # Executes a system command in a subprocess.
  # The method will return stdout from the command if execution was successful.
  # The method will raise an exception if execution fails, either because the command
  # exited with a non-zero status or because it could not be started at all.
  # The exception's message will contain the explanation of the failure.
  # @param [String] command the command to be executed
  # @return [String] stdout from the command if execution was successful
  # @raise [RuntimeError] describing the command, its stderr (or the launch error) and any stdout
  def self.execute(command)
    # pre-declare so the rescue clause can tell "command ran and failed" from "command never ran"
    stdout = stderr = nil
    status, stdout, stderr = systemu(command)
    raise stderr.to_s unless status.exitstatus == 0
    stdout
  rescue => e
    # when systemu itself raised, stderr is still nil; report the underlying error instead
    error_text = stderr.nil? ? e.message : stderr
    msg = "Command failed to execute: [#{command}] caused by <STDERR = #{error_text.split($/).join('; ')}>"
    msg += " STDOUT = #{stdout.split($/).join('; ')}" if stdout && !stdout.empty?
    raise msg
  end

end
32
+
33
+ end
@@ -0,0 +1,160 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'archive-utils'
3
+
4
+ module Archive
5
+
6
+ # A tar archive file containing a set of digital object files
7
+ #
8
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
9
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
10
class Tarfile

  # Stdlib helper used to escape paths placed into the generated shell commands
  require 'shellwords'

  # @return [String] create archive of the specified format
  #   * gnu = GNU tar 1.13.x format
  #   * posix = POSIX 1003.1-2001 (pax) format
  attr_accessor :format

  # @return [Boolean] Follow symlinks and archive the files they point to
  attr_accessor :dereference

  # @return [Boolean] Verify that files were copied faithfully
  attr_accessor :verify

  # @return [Boolean] Create/list/extract multi-volume archive (not yet implemented)
  attr_accessor :multi_volume

  # @param [Hash<Symbol,Object>] options Key,Value pairs specifying initial values of attributes
  # @return [Tarfile] Initialize a new Tarfile object
  def initialize(options=nil)
    # set defaults
    @format = :posix
    @dereference = true
    @verify = false
    @multi_volume = false
    # override defaults through the public setters only (public_send never reaches private helpers)
    (options || {}).each do |key, value|
      public_send("#{key}=", value)
    end
  end

  # @return [Pathname] The full path of the ancestor dir in which the tar file resides
  # @raise [RuntimeError] if the base path has not been set
  def tarfile_basepath
    raise "Tarfile basepath is nil" unless @tarfile_basepath
    @tarfile_basepath
  end

  # @param [Pathname,String] basepath The full path of the ancestor dir in which the tar file resides
  # @return [Void] Set the full path of the ancestor dir in which the tar file resides
  def tarfile_basepath=(basepath)
    raise "No pathname specified" unless basepath
    @tarfile_basepath = Pathname(basepath).expand_path
  end

  # @return [Pathname,nil] the full path of the tar archive file to be created or extracted from
  def tarfile_fullpath
    @tarfile_fullpath
  end

  # @param [Pathname,String] fullpath The full path of tar file
  # @return [Void] Sets the full path of tar file
  def tarfile_fullpath=(fullpath)
    # same nil check as the other path setters, instead of a TypeError from Pathname(nil)
    raise "No pathname specified" unless fullpath
    @tarfile_fullpath = Pathname(fullpath).expand_path
  end

  # @return [String] The id (path relative to basepath) of the tar file
  # @raise [RuntimeError] if either the tar file path or its base path has not been set
  def tarfile_relative_path
    raise "Tarfile pathname is nil" unless @tarfile_fullpath
    @tarfile_fullpath.relative_path_from(tarfile_basepath).to_s
  end

  # @return [Pathname] The full path of the source file or directory being archived
  # @raise [RuntimeError] if the source path has not been set
  def source_fullpath
    raise "Source pathname is nil" unless @source_pathname
    @source_pathname
  end

  # @param [Pathname,String] source The full path of the source file or directory being archived
  # @return [Void] Set the full path of the source file or directory being archived
  def source_fullpath=(source)
    raise "No pathname specified" unless source
    @source_pathname = Pathname(source).expand_path
  end

  # @return [Pathname,nil] The directory that is the basis of relative paths
  def source_basepath
    @source_basepath
  end

  # @param [Pathname,String] base The directory that is the basis of relative paths
  # @return [Void] Set the base path of the source file or directory being archived
  def source_basepath=(base)
    raise "No pathname specified" unless base
    @source_basepath = Pathname(base).expand_path
  end

  # @return [Pathname] The relative path from the source base directory to the source directory
  # @raise [RuntimeError] if the source path or the source base path has not been set
  def source_relative_path
    raise "Source basepath is nil" unless source_basepath
    source_fullpath.relative_path_from(source_basepath)
  end

  # @return [String] The shell command string to be used to create the tarfile
  # @raise [RuntimeError] if the tar file path or the source path has not been set
  def create_cmd
    command = "tar --create --file=#{escaped_tarfile_path} --format=#{@format} "
    command << "--dereference " if @dereference
    command << "--verify " if @verify
    if source_basepath
      # archive the source under its path relative to the base directory
      command << "--directory=#{single_quoted(source_basepath)} "
      command << Shellwords.escape(source_relative_path.to_s)
    else
      # no base directory configured: a relative path cannot be computed, so use the full path
      command << Shellwords.escape(source_fullpath.to_s)
    end
    command
  end

  # @return [Tarfile] Shell out to the operating system and create the tar archive file
  def create_tarfile
    OperatingSystem.execute(create_cmd)
    self
  end

  # @return [String] The shell command that will list the tarfile's contents
  # @raise [RuntimeError] if the tar file path has not been set
  def list_cmd
    "tar --list --file=#{escaped_tarfile_path} "
  end

  # @return [String] The list of the tarfile's contents
  def list_tarfile
    OperatingSystem.execute(list_cmd)
  end

  # @return [Pathname] The location of the directory into which the tarfile should be extracted
  # @raise [RuntimeError] if the target directory has not been set
  def target_pathname
    raise "Target pathname is nil" unless @target_pathname
    @target_pathname
  end

  # @param [Pathname,String] target The location of the directory into which the tarfile should be extracted
  # @return [Void] Set the location of the directory into which the tarfile should be extracted
  def target_pathname=(target)
    raise "No target pathname specified" unless target
    @target_pathname = Pathname(target).expand_path
  end

  # @return [String] The shell command that will extract the tarfile's contents
  # @raise [RuntimeError] if the tar file path or the target directory has not been set
  def extract_cmd
    command = "tar --extract --file=#{escaped_tarfile_path} "
    # target_pathname raises when unset, so a target directory is always present here
    command << "--directory=#{single_quoted(target_pathname)} "
    command
  end

  # @return [String] Shell out to the operating system and extract the tar archive file
  def extract_tarfile
    OperatingSystem.execute(extract_cmd)
  end

  private

  # @return [String] The tar file's full path, escaped for use in a shell command
  #   (paths made up of ordinary characters are returned unchanged)
  # @raise [RuntimeError] if the tar file path has not been set
  def escaped_tarfile_path
    raise "Tarfile pathname is nil" unless tarfile_fullpath
    Shellwords.escape(tarfile_fullpath.to_s)
  end

  # @param [Pathname,String] path The path to be quoted
  # @return [String] The path wrapped in single quotes, with embedded single quotes escaped,
  #   so the --directory options keep the quoted form the commands have always used
  def single_quoted(path)
    "'" + path.to_s.gsub("'") { "'\\''" } + "'"
  end

end
159
+
160
+ end
data/lib/libdir.rb ADDED
@@ -0,0 +1,3 @@
1
# Put the directory containing this file at the front of the load path (once),
# so that the library's files can be required by name
libdir = File.expand_path(File.dirname(__FILE__))
$LOAD_PATH.unshift(libdir) unless $LOAD_PATH.include?(libdir)
3
+
metadata ADDED
@@ -0,0 +1,194 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: archive-utils
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Darren Weber
8
+ - Richard Anderson
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-10-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: json_pure
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.8'
28
+ - !ruby/object:Gem::Dependency
29
+ name: systemu
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '2.6'
35
+ type: :runtime
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '2.6'
42
+ - !ruby/object:Gem::Dependency
43
+ name: pry
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0'
49
+ type: :development
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ - !ruby/object:Gem::Dependency
57
+ name: rake
58
+ requirement: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - "~>"
61
+ - !ruby/object:Gem::Version
62
+ version: '10'
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '10'
70
+ - !ruby/object:Gem::Dependency
71
+ name: awesome_print
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '1'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '1'
84
+ - !ruby/object:Gem::Dependency
85
+ name: equivalent-xml
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '0.5'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '0.5'
98
+ - !ruby/object:Gem::Dependency
99
+ name: fakeweb
100
+ requirement: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - "~>"
103
+ - !ruby/object:Gem::Version
104
+ version: '1'
105
+ type: :development
106
+ prerelease: false
107
+ version_requirements: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - "~>"
110
+ - !ruby/object:Gem::Version
111
+ version: '1'
112
+ - !ruby/object:Gem::Dependency
113
+ name: rspec
114
+ requirement: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - "~>"
117
+ - !ruby/object:Gem::Version
118
+ version: '2.0'
119
+ type: :development
120
+ prerelease: false
121
+ version_requirements: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - "~>"
124
+ - !ruby/object:Gem::Version
125
+ version: '2.0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: simplecov
128
+ requirement: !ruby/object:Gem::Requirement
129
+ requirements:
130
+ - - "~>"
131
+ - !ruby/object:Gem::Version
132
+ version: '0.7'
133
+ type: :development
134
+ prerelease: false
135
+ version_requirements: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: '0.7'
140
+ - !ruby/object:Gem::Dependency
141
+ name: yard
142
+ requirement: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - "~>"
145
+ - !ruby/object:Gem::Version
146
+ version: '0.8'
147
+ type: :development
148
+ prerelease: false
149
+ version_requirements: !ruby/object:Gem::Requirement
150
+ requirements:
151
+ - - "~>"
152
+ - !ruby/object:Gem::Version
153
+ version: '0.8'
154
+ description: Contains classes to archive and retrieve digital object version content
155
+ and metadata
156
+ email:
157
+ - darren.weber@stanford.edu
158
+ executables: []
159
+ extensions: []
160
+ extra_rdoc_files: []
161
+ files:
162
+ - lib/archive-utils.rb
163
+ - lib/archive/bagit_bag.rb
164
+ - lib/archive/file_fixity.rb
165
+ - lib/archive/fixity.rb
166
+ - lib/archive/operating_system.rb
167
+ - lib/archive/tarfile.rb
168
+ - lib/libdir.rb
169
+ homepage: https://github.com/sul-dlss/archive-utils
170
+ licenses:
171
+ - Apache-2.0
172
+ metadata: {}
173
+ post_install_message:
174
+ rdoc_options: []
175
+ require_paths:
176
+ - lib
177
+ required_ruby_version: !ruby/object:Gem::Requirement
178
+ requirements:
179
+ - - ">="
180
+ - !ruby/object:Gem::Version
181
+ version: '0'
182
+ required_rubygems_version: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: 2.2.1
187
+ requirements: []
188
+ rubyforge_project:
189
+ rubygems_version: 2.4.2
190
+ signing_key:
191
+ specification_version: 4
192
+ summary: Ruby utilities for data archival (BagIt, Fixity, Tarfile).
193
+ test_files: []
194
+ has_rdoc: