sdr-replication 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e99a0814e4383ec6287dec7df41825d786e65919
4
+ data.tar.gz: 4890c2758dd820f22ce8aa9015621e1b33dec9ae
5
+ SHA512:
6
+ metadata.gz: deb3400e53fcbdf16cf8263ffe1deed70be83889633b4fecfa0de58e41a076e7f502de61d22f079b09ff6b81ddb29586bb6e898782f22e45010ecfd46d93828d
7
+ data.tar.gz: a0533d23addcc264e7aeca97f985bdad1d70e8081230e09ab5337a03314b058374541b1587f47a5f89dd2f558115677190d49b320b892a77e7031bb15ac9c3ff
data/lib/libdir.rb ADDED
@@ -0,0 +1,3 @@
1
# Make the directory holding this file the first entry of the load path,
# unless it is already listed there.
lib_root = File.expand_path(File.dirname(__FILE__))
unless $LOAD_PATH.include?(lib_root)
  $LOAD_PATH.unshift(lib_root)
end
3
+
@@ -0,0 +1,110 @@
1
require 'json'
require 'rubygems'
require 'rest-client'
3
+
4
+ module Replication
5
+
6
+ # A wrapper class based on {RestClient} used to interface with the Archive Catalog service.
7
+ # <br>
8
+ # <br>
9
+ # The default RestClient behavior is:
10
+ # * for results code between 200 and 207 a RestClient::Response will be returned
11
+ # * for results code 301, 302 or 307 the redirection will be followed if the request is a get or a head
12
+ # * for result code 303 the redirection will be followed and the request transformed into a get
13
+ # * for other cases a RestClient::Exception holding the Response will be raised
14
+ #
15
+ # But we are using a technique that forces RestClient to always provide the response
16
+ # <br>
17
+ # <br>
18
+ # RestClient::Response has these instance methods (some inherited from AbstractResponse):
19
+ # * args
20
+ # * body
21
+ # * code (e.g. 204)
22
+ # * description (e.g. "204 No Content | 0 bytes")
23
+ # * headers
24
+ # * net_http_res
25
+ #
26
+ # @see https://github.com/rest-client/rest-client
27
+ # @see http://rubydoc.info/gems/rest-client/1.6.7/frames
28
class ArchiveCatalog

  @root_uri = 'http://localhost:3000'
  @timeout = 120

  # @see https://www.google.com/search?q="class+<<+self"+"attr_accessor"
  class << self

    # @return [String] The base or home URL of the Archive Catalog web service
    attr_accessor :root_uri

    # @return [Integer] seconds to wait for a response or to open a connection. Value nil disables the timeout.
    attr_accessor :timeout

    # @return [RestClient::Resource] The base resource to be used for requests, built from
    #   the current root_uri; the timeout value is applied to both :open_timeout and :timeout
    def root_resource
      RestClient::Resource.new(@root_uri, {:open_timeout => @timeout, :timeout => @timeout})
    end

    # Get the item record from the specified table for the specified primary key.
    # @param [String] table name of the database table
    # @param [String] id primary key for the item in the database table
    # @return [Hash] the row (in key,value hash) from the specified table for the specified identifier.
    #   Response body contains the item data in JSON format, which is converted to a hash.
    # @raise [RuntimeError] carrying the response description when the response code is not 200
    # @see http://tools.ietf.org/html/rfc2616#page-53
    def get_item(table, id)
      headers = {:accept => 'application/json'}
      # The block makes RestClient hand back the response instead of raising RestClient::Exception
      reply = root_resource["#{table}/#{id}.json"].get(headers) { |res, _request, _result| res }
      JSON.parse(require_status(reply, '200').body)
    end

    # Retrieve an existing database record or add a new one using the data provided.
    # @param [String] table name of the database table
    # @param [Hash] hash the item data to be added to the database table
    # @return [Hash] result containing the item data as if a GET were performed.
    #   The HTTP response code for success is 201 (Created).
    # @raise [RuntimeError] carrying the response description when the response code is not 201
    # @see http://en.wikipedia.org/wiki/POST_(HTTP)
    # @see http://tools.ietf.org/html/rfc2616#page-54
    def find_or_create_item(table, hash)
      payload = hash.to_json
      headers = {:content_type => :json, :accept => :json}
      # The block makes RestClient hand back the response instead of raising RestClient::Exception
      reply = root_resource["#{table}.json"].post(payload, headers) { |res, _request, _result| res }
      JSON.parse(require_status(reply, '201').body)
    end

    # Update the database columns for the specified item using the hash data.
    # @param [String] table name of the database table
    # @param [String] id primary key for the item in the database table
    # @param [Hash] hash the item data to be updated in the database table
    # @return [Boolean] true if the HTTP response code is 204, per specification for PATCH or PUT request types.
    #   Response body is empty, per same specification.
    # @raise [RuntimeError] carrying the response description when the response code is not 204
    # @see https://tools.ietf.org/html/rfc5789
    # @see http://stackoverflow.com/questions/797834/should-a-restful-put-operation-return-something/827045#827045
    def update_item(table, id, hash)
      payload = hash.to_json
      headers = {:content_type => :json}
      # The block makes RestClient hand back the response instead of raising RestClient::Exception
      reply = root_resource["#{table}/#{id}.json"].patch(payload, headers) { |res, _request, _result| res }
      require_status(reply, '204')
      true
    end

    private

    # Shared status check for all three request methods.
    # @param [#code,#description] reply the response returned by the request
    # @param [String] expected_code the only status code (as a string) treated as success
    # @return [Object] the same reply, when its code matches
    # @raise [RuntimeError] with the reply's description when the code differs
    def require_status(reply, expected_code)
      raise reply.description unless reply.code.to_s == expected_code
      reply
    end

  end

end
109
+
110
+ end
@@ -0,0 +1,337 @@
1
+ require File.join(File.dirname(__FILE__),'../libdir')
2
+ require 'sdr_replication'
3
+
4
+ module Replication
5
+
6
+ # A BagIt bag contains a structured copy of a digital object for storage, transfer, or replication
7
+ # @see https://tools.ietf.org/html/draft-kunze-bagit-10
8
+ # This class can be used to create, parse, or validate a bag instance
9
+ #
10
+ # @note Copyright (c) 2014 by The Board of Trustees of the Leland Stanford Junior University.
11
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
12
+ class BagitBag
13
+
14
# Build a new bag at the given location.
# @param [Pathname,String] pathname The location of the bag home directory
# @return [BagitBag] A new bag whose payload folder (and therefore home folder) has been
#   created and whose bagit.txt file has been written
def self.create_bag(pathname)
  BagitBag.new.tap do |new_bag|
    new_bag.bag_pathname = pathname
    new_bag.payload_pathname.mkpath
    new_bag.write_bagit_txt
  end
end
23
+
24
# Attach to a bag that already exists on disk. Nothing is created or written.
# @param [Pathname,String] pathname The location of the bag home directory
# @return [BagitBag] A bag object pointing at the existing bag home directory
# @raise [RuntimeError] if the home directory or its bagit.txt file does not exist
def self.open_bag(pathname)
  bag = BagitBag.new
  bag.bag_pathname = pathname
  home = bag.bag_pathname
  raise "No bag found at #{home}" unless home.exist?
  bagit_txt = home.join("bagit.txt")
  raise "No bagit.txt file found at #{bagit_txt}" unless bagit_txt.exist?
  bag
end
34
+
35
# @return [Pathname] The location of the bag home directory (nil until it has been assigned)
def bag_pathname
  @bag_pathname
end
39
+
40
# Set the location of the bag home directory.
# @param [Pathname,String] pathname The location of the bag home directory
# @return [Void] The value is stored as a Pathname
def bag_pathname=(pathname)
  @bag_pathname = Pathname(pathname)
end
45
+
46
# @return [Pathname] The location of the bag data directory ('data' under the bag home)
def payload_pathname
  bag_pathname + 'data'
end
50
+
51
# Generate the bagit.txt tag file in the bag home directory, replacing any existing one.
# @return [Pathname] The location of the bagit.txt file that was written
def write_bagit_txt
  bagit_txt = bag_pathname.join("bagit.txt")
  bagit_txt.open('w') do |f|
    # The BagIt specification lists the version declaration first and the
    # encoding declaration second, so the lines are emitted in that order.
    f.puts "BagIt-Version: 0.97"
    f.puts "Tag-File-Character-Encoding: UTF-8"
  end
  bagit_txt
end
60
+
61
# Parse the bagit.txt tag file into its "key: value" properties.
# Lines without a colon (including blank lines) are ignored.
# @return [Hash<String,String>] A hash containing the properties documented in the bagit.txt tagfile
def read_bagit_txt
  properties = {}
  bag_pathname.join("bagit.txt").readlines.each do |raw_line|
    # strip (non-bang) always returns a String; the bang variants return nil when
    # nothing is removed, e.g. for a final line that has no trailing newline
    key, value = raw_line.strip.split(':', 2)
    properties[key.strip] = value.strip if value
  end
  properties
end
72
+
73
# @return [Array<Symbol>] The list of checksum types to be used when generating fixity data.
#   Falls back to (and remembers) Fixity.default_checksum_types when none has been set.
def bag_checksum_types
  @bag_checksum_types = Fixity.default_checksum_types unless @bag_checksum_types
  @bag_checksum_types
end
77
+
78
# Set the list of checksum types to be used when generating fixity data.
# @param [Object] types The list of checksum types; passed through Fixity.validate_checksum_types
# @return [Void] The validated list is stored for later fixity generation
def bag_checksum_types=(*types)
  validated = Fixity.validate_checksum_types(*types)
  @bag_checksum_types = validated
end
83
+
84
# Add every file found by the fixity generator under source_dir to the payload.
# @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
# @param [Pathname] source_dir The source location of the directory whose contents are to be ingested
# @return [Pathname] The payload directory, after generating the file_fixity_hash
#   and sending it to #add_payload_files
def add_payload_dir(link_mode, source_dir)
  checksums_by_file = Fixity.generate_checksums(source_dir, nil, bag_checksum_types)
  add_payload_files(link_mode, source_dir, checksums_by_file)
  payload_pathname
end
92
+
93
# Copy or link the files named in the file_fixity_hash into the payload directory,
# then append their checksums to the payload manifest files.
# @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the files to the payload directory
# @param [Pathname,String] source_basepath The source location of the directory whose contents are to be ingested
# @param [Hash<String,FileFixity>] file_fixity_hash The list of files (with fixity data) to be added to the payload;
#   each key is a file id that is joined to both the source and the payload directory
# @return [Pathname] The payload directory
def add_payload_files(link_mode, source_basepath, file_fixity_hash)
  # Coerce so that a String works as well as a Pathname, as #add_payload_tarfile already allows
  source_root = Pathname(source_basepath)
  payload_root = payload_pathname
  file_fixity_hash.each_key do |file_id|
    copy_file(link_mode, source_root.join(file_id), payload_root.join(file_id))
  end
  write_manifest_checksums('manifest', file_fixity_hash)
  payload_root
end
107
+
108
# Link or copy one file from its source location to its target location,
# creating the target's parent directories first.
# @param [Symbol] link_mode Specifies whether to :copy, :link, or :symlink the file (nil means :copy)
# @param [Pathname,String] source_pathname The source location of the file to be ingested
# @param [Pathname,String] target_pathname The full path (not just the directory) at which to place the file
# @return [Pathname] The target location
# @raise [RuntimeError] if link_mode is not one of the supported values
def copy_file(link_mode, source_pathname, target_pathname)
  source = Pathname(source_pathname)
  target = Pathname(target_pathname)
  target.parent.mkpath
  case link_mode
  when :copy, nil
    FileUtils.copy(source.to_s, target.to_s) # automatically dereferences symlinks
  when :link
    FileUtils.link(source.to_s, target.to_s) #, :force => true (false is default)
  when :symlink
    # A relative link target is resolved against the directory holding the link, not the
    # current working directory, so a relative source would produce a dangling link.
    FileUtils.symlink(source.expand_path.to_s, target.to_s) #, :force => true (false is default)
  else
    raise "Invalid link_mode: #{link_mode}, expected one of [:copy,:link,:symlink]"
  end
  target
end
126
+
127
# Create a tar archive of a directory inside the payload directory, then record the
# archive's checksums in the payload manifests.
# @param [Object] tarfile_id The name (converted with to_s) of the tar archive within the payload directory
# @param [Pathname,String] source_fullpath The location of the directory whose content will be tarred
# @param [Pathname,String] source_basepath The location of the directory to change to before doing the tar create
# @return [Tarfile] The Tarfile object that was configured and used to create the archive
def add_payload_tarfile(tarfile_id, source_fullpath, source_basepath)
  archive = Tarfile.new
  archive.source_basepath = Pathname(source_basepath)
  archive.source_fullpath = Pathname(source_fullpath)
  archive.tarfile_basepath = payload_pathname
  archive.tarfile_fullpath = payload_pathname.join(tarfile_id.to_s)
  archive.create_tarfile
  archive_fixity = Fixity.generate_checksums(
    archive.tarfile_basepath, [archive.tarfile_fullpath], bag_checksum_types
  )
  write_manifest_checksums('manifest', archive_fixity)
  archive
end
142
+
143
# Generate the bag-info.txt tag file, recording the bag's identifier (the basename of the
# bag home directory) and the measured payload size.
# @return [Pathname] The location of the bag-info.txt file that was written
def write_bag_info_txt
  size = bag_payload_size
  bag_info_txt = bag_pathname.join("bag-info.txt")
  bag_info_txt.open('w') do |f|
    f.puts([
      "External-Identifier: #{bag_pathname.basename}",
      "Payload-Oxum: #{size[:bytes]}.#{size[:files]}",
      "Bag-Size: #{bag_size_human(size[:bytes])}"
    ])
  end
  bag_info_txt
end
154
+
155
# Measure the payload by walking the payload directory tree.
# @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes (:bytes), and the
#   number of files (:files), derived from the payload directory contents
def bag_payload_size
  totals = {bytes: 0, files: 0}
  payload_pathname.find do |entry|
    next unless entry.file? # directories are visited but not counted
    totals[:bytes] += entry.size
    totals[:files] += 1
  end
  totals
end
164
+
165
# Express a byte count using the largest unit (up to TB) that keeps the number at or above 1.
# @param [Integer] bytes The total number of bytes in the payload
# @return [String] Human-readable rendition of the total payload size: whole bytes as "N B",
#   anything larger with two decimals, e.g. "1.50 KB"
def bag_size_human(bytes)
  units = %w[B KB MB GB TB]
  exponent = 0
  scaled = bytes
  # Stop dividing at TB, the largest unit in the table
  until scaled < 1024 || exponent >= 4
    scaled /= 1024.0
    exponent += 1
  end
  return sprintf("%d B", scaled) if exponent == 0
  sprintf("%.2f %s", scaled, units[exponent])
end
180
+
181
# Parse the bag-info.txt tag file into its "key: value" properties.
# Lines without a colon (including blank lines) are ignored.
# @return [Hash<String,String>] A hash containing the properties documented in the bag-info.txt tagfile
def read_bag_info_txt
  properties = {}
  bag_pathname.join("bag-info.txt").readlines.each do |raw_line|
    # strip (non-bang) always returns a String; the bang variants return nil when
    # nothing is removed, e.g. for a final line that has no trailing newline
    key, value = raw_line.strip.split(':', 2)
    properties[key.strip] = value.strip if value
  end
  properties
end
192
+
193
# Read the recorded payload size from bag-info.txt.
# @return [Hash<Symbol,Integer>] A hash containing the payload size in bytes (:bytes), and the
#   number of files (:files), derived from the Payload-Oxum property ("bytes.files")
# @raise [RuntimeError] if bag-info.txt has no Payload-Oxum property
def info_payload_size
  oxum = read_bag_info_txt['Payload-Oxum']
  # Fail with an explicit message rather than a NoMethodError on nil
  raise "Payload-Oxum not found in bag-info.txt" if oxum.nil?
  bytes, files = oxum.split('.')
  {:bytes => bytes.to_i, :files => files.to_i}
end
201
+
202
# Compare the actual measured payload size against the value recorded in bag-info.txt.
# @return [Boolean] true when both sizes are equal
# @raise [RuntimeError] when the recorded and measured sizes differ
def verify_payload_size
  info_size = info_payload_size
  bag_size = bag_payload_size
  return true if info_size == bag_size
  raise "Failed payload size verification! Expected: #{info_size}, Found: #{bag_size}"
end
211
+
212
# Digest the tag files that sit directly in the bag's root directory.
# Tag manifest files themselves (names starting with 'tagmanifest') are excluded, and so
# are non-file children such as the payload directory.
# NOTE(review): whether Fixity.generate_checksums already skips directories cannot be seen
# from here; filtering them out first is correct either way.
# @return [Hash<String,FileFixity>] hash containing ids and checksums for the files in the bag's root directory
def generate_tagfile_checksums
  tagfiles = bag_pathname.children.select do |child|
    child.file? && !child.basename.to_s.start_with?('tagmanifest')
  end
  Fixity.generate_checksums(bag_pathname, tagfiles, bag_checksum_types)
end
217
+
218
# Digest the payload; nil is passed as the file list and the choice of files is left to
# Fixity.generate_checksums.
# @return [Hash<String,FileFixity>] hash containing ids and checksums for all files in the bag's payload
def generate_payload_checksums
  payload_root = payload_pathname
  Fixity.generate_checksums(payload_root, nil, bag_checksum_types)
end
222
+
223
# Update one manifest file per checksum type with data from the file_fixity_hash.
# Each line written is "<checksum> <file_id>"; a file with no checksum of a given type
# is left out of that type's manifest.
# @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be updated
# @param [Hash<String,FileFixity>] file_fixity_hash A hash containing file ids and fixity data
# @param [String] open_mode The file open mode (default is 'a')
# @return [Hash<Symbol,Pathname>] The manifest file written for each checksum type
def write_manifest_checksums(manifest_type, file_fixity_hash, open_mode='a')
  manifests = {}
  bag_checksum_types.each do |checksum_type|
    manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
    # Block form closes the file even if a fixity lookup raises part-way through
    manifest_pathname.open(open_mode) do |manifest_file|
      file_fixity_hash.each_value do |fixity|
        checksum = fixity.get_checksum(checksum_type)
        manifest_file.puts("#{checksum} #{fixity.file_id}") if checksum
      end
    end
    manifests[checksum_type] = manifest_pathname
  end
  manifests
end
241
+
242
# Read every manifest file of the given type that exists in the bag home directory, one
# candidate per checksum id known to Fixity, and merge the entries per file id.
# The checksum types found are also added to this bag's checksum types.
# @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest') to be read
# @return [Hash<String,FileFixity>] A hash containing file ids and fixity data derived from the manifest files
def read_manifest_files(manifest_type)
  file_fixity_hash = {}
  checksum_type_list = []
  Fixity.valid_checksum_ids.each do |checksum_type|
    manifest_pathname = bag_pathname.join("#{manifest_type}-#{checksum_type}.txt")
    next unless manifest_pathname.file?
    checksum_type_list << checksum_type
    manifest_pathname.readlines.each do |raw_line|
      # strip (non-bang) always returns a String; the bang variants return nil when
      # nothing is removed, e.g. for a final line that has no trailing newline
      line = raw_line.strip
      next if line.empty? # a blank line would otherwise create an entry with a nil file id
      # Checksum and file id are separated by whitespace and/or '*'
      checksum, file_id = line.split(/[\s*]+/, 2)
      file_fixity = (file_fixity_hash[file_id] ||= FileFixity.new(file_id: file_id))
      file_fixity.set_checksum(checksum_type, checksum)
    end
  end
  self.bag_checksum_types = bag_checksum_types | checksum_type_list
  file_fixity_hash
end
263
+
264
# Compare fixity data from the tag manifest files against the values measured by digesting the files.
# @return [Boolean] the result of #verify_manifests for the 'tagmanifest' files
def verify_tagfile_manifests
  # The manifests are read before the files are digested, as in the original ordering
  recorded = read_manifest_files('tagmanifest')
  measured = generate_tagfile_checksums
  verify_manifests('tagmanifest', recorded, measured)
end
271
+
272
# Compare fixity data from the payload manifest files against the values measured by digesting the files.
# @return [Boolean] the result of #verify_manifests for the 'manifest' files
def verify_payload_manifests
  # The manifests are read before the files are digested, as in the original ordering
  recorded = read_manifest_files('manifest')
  measured = generate_payload_checksums
  verify_manifests('manifest', recorded, measured)
end
279
+
280
# Compare fixity data from the manifest files against the values measured by digesting the files.
# @param [String] manifest_type The type of manifest file ('manifest' or 'tagmanifest'), used in the error message
# @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
# @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
# @return [Boolean] true if #manifest_diff reports no differences
# @raise [RuntimeError] listing the differences when there are any
def verify_manifests(manifest_type, manifest_fixity_hash, bag_fixity_hash)
  differences = manifest_diff(manifest_fixity_hash, bag_fixity_hash)
  unless differences.size == 0
    raise "Failed #{manifest_type} verification! Differences: \n#{differences.inspect}"
  end
  true
end
292
+
293
# Report the differences between the fixity data from the manifest files and the values
# measured by digesting the files. A file id present on only one side is compared
# against a fresh FileFixity carrying just that id.
# @param [Hash<String,FileFixity>] manifest_fixity_hash A hash containing file ids and fixity data derived from the manifest files
# @param [Hash<String,FileFixity>] bag_fixity_hash A hash containing file ids and fixity data derived from the actual files
# @return [Hash] For each differing file id, the result of FileFixity#diff labelled 'manifest' and 'bag'
def manifest_diff(manifest_fixity_hash, bag_fixity_hash)
  all_ids = manifest_fixity_hash.keys | bag_fixity_hash.keys
  all_ids.each_with_object({}) do |file_id, report|
    recorded = manifest_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
    measured = bag_fixity_hash[file_id] || FileFixity.new(file_id: file_id)
    report[file_id] = recorded.diff(measured, 'manifest', 'bag') if recorded != measured
  end
end
308
+
309
# Validate the bag containing the digital object by running each check in turn:
# structure, tag manifests, payload size, payload manifests.
# @return [Boolean] true once every check has run without raising
def verify_bag
  checks = [:verify_bag_structure, :verify_tagfile_manifests, :verify_payload_size, :verify_payload_manifests]
  checks.each { |check| send(check) }
  true
end
317
+
318
# Test the existence of the expected entries in the bag home directory: the payload
# directory, bagit.txt, bag-info.txt, and the sha256 payload and tag manifests.
# @return [Boolean] true if all of them exist
# @raise [RuntimeError] from #verify_pathname for the first entry that is missing
def verify_bag_structure
  required_files = ['data','bagit.txt','bag-info.txt','manifest-sha256.txt','tagmanifest-sha256.txt']
  required_files.each { |filename| verify_pathname(bag_pathname.join(filename)) }
  true
end
325
+
326
# Test the existence of the specified path.
# @param [Pathname] pathname The file whose existence should be verified
# @return [Boolean] true if the path exists
# @raise [RuntimeError] naming the missing entry and its expected location if it does not exist
def verify_pathname(pathname)
  return true if pathname.exist?
  raise "#{pathname.basename} not found at #{pathname}"
end
332
+
333
+
334
+ end
335
+
336
+
337
+ end