moab-versioning 4.4.0 → 5.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,328 +0,0 @@
1
# frozen_string_literal: true

module Moab
  # Given a deposit bag, ensures the contents valid for becoming a StorageObjectVersion
  # this is a Shameless Green implementation, combining code from:
  #  - sdr-preservation-core/lib/sdr_ingest/validate_bag <-- old preservation robots
  #  - archive-utils/lib/bagit_bag <-- gem only used by sdr-preservation-robots
  #  - archive-utils/lib/file_fixity
  #  - archive-utils/lib/fixity
  # this code adds duplication to this gem (see github issue #119);
  # for example, computing checksums is done
  #  - deposit_bag_validator
  #  - file_signature
  class DepositBagValidator
    BAG_DIR_NOT_FOUND = :bag_dir_not_found
    CHECKSUM_MISMATCH = :checksum_mismatch
    CHECKSUM_TYPE_UNRECOGNIZED = :checksum_type_unrecognized
    INVALID_VERSION_XXX_XML = :invalid_versionXxx_xml
    PAYLOAD_SIZE_MISMATCH = :payload_size_mismatch
    REQUIRED_FILE_NOT_FOUND = :required_file_not_found
    VERSION_MISMATCH_TO_MOAB = :version_mismatch_to_moab
    VERSION_MISSING_FROM_FILE = :version_missing_from_file

    # message templates interpolated with an err_data hash via String#% (hash form);
    # every %{placeholder} must match a key supplied by the caller of single_error_hash
    ERROR_CODE_TO_MESSAGES = {
      BAG_DIR_NOT_FOUND => "Deposit bag directory %{bag_dir} does not exist",
      CHECKSUM_MISMATCH => "Failed %{manifest_type} verification. Differences: \n%{diffs}",
      # placeholder fixed to %{filename}: callers pass err_data with a :filename key
      CHECKSUM_TYPE_UNRECOGNIZED => "Checksum type unrecognized: %{checksum_type}; file: %{filename}",
      INVALID_VERSION_XXX_XML => "Unable to parse %{file_pathname}: %{err_info}",
      PAYLOAD_SIZE_MISMATCH => "Failed payload size verification. Expected: %{bag_info_sizes}; found: %{generated_sizes}",
      REQUIRED_FILE_NOT_FOUND => "Deposit bag required file %{file_pathname} not found",
      VERSION_MISMATCH_TO_MOAB => "Version mismatch in %{file_pathname}: Moab expected %{new_version}; found %{file_version}",
      VERSION_MISSING_FROM_FILE => "Version xml file %{version_file} missing data at %{xpath} containing version id"
    }.freeze

    REQUIRED_MANIFEST_CHECKSUM_TYPE = 'sha256'
    RECOGNIZED_CHECKSUM_ALGORITHMS = %i[md5 sha1 sha256 sha384 sha512].freeze

    TAGMANIFEST = 'tagmanifest'
    MANIFEST = 'manifest'
    DATA_DIR_BASENAME = 'data'
    BAG_INFO_TXT_BASENAME = 'bag-info.txt'
    VERSION_ADDITIONS_BASENAME = 'versionAdditions.xml'
    VERSION_INVENTORY_BASENAME = 'versionInventory.xml'
    VERSION_METADATA_PATH = "#{DATA_DIR_BASENAME}/metadata/versionMetadata.xml"

    REQUIRED_BAG_FILES = [
      DATA_DIR_BASENAME,
      'bagit.txt',
      BAG_INFO_TXT_BASENAME,
      "#{MANIFEST}-#{REQUIRED_MANIFEST_CHECKSUM_TYPE}.txt",
      "#{TAGMANIFEST}-#{REQUIRED_MANIFEST_CHECKSUM_TYPE}.txt",
      VERSION_ADDITIONS_BASENAME,
      VERSION_INVENTORY_BASENAME,
      VERSION_METADATA_PATH
    ].freeze

    attr_reader :deposit_bag_pathname, :expected_new_version, :result_array

    # @param storage_object [Object] must respond to #deposit_bag_pathname and
    #   #current_version_id (the deposit bag is expected to be current version + 1)
    def initialize(storage_object)
      @deposit_bag_pathname = storage_object.deposit_bag_pathname
      @expected_new_version = storage_object.current_version_id + 1
      @result_array = []
    end

    # returns Array of tiny error hashes, allowing multiple occurrences of a single error code
    def validation_errors
      return [single_error_hash(BAG_DIR_NOT_FOUND, bag_dir: deposit_bag_pathname)] unless deposit_bag_pathname.exist?
      return result_array unless required_bag_files_exist?

      verify_version
      verify_tagmanifests
      verify_payload_size
      verify_payload_manifests
      result_array # attr that accumulates any errors encountered along the way
    end

    private

    def bag_dir_exists?
      deposit_bag_pathname.exist?
    end

    # assumes this is called when result_array is empty, as subsequent checks will use these required files
    def required_bag_files_exist?
      REQUIRED_BAG_FILES.each do |filename|
        pathname = deposit_bag_pathname.join(filename)
        result_array << single_error_hash(REQUIRED_FILE_NOT_FOUND, file_pathname: pathname) unless pathname.exist?
      end
      result_array.empty?
    end

    # checks that versionMetadata.xml, versionAdditions.xml and versionInventory.xml
    # all carry the version number the moab expects next
    def verify_version
      version_md_pathname = deposit_bag_pathname.join(VERSION_METADATA_PATH)
      version_from_file = last_version_id_from_version_md_xml(version_md_pathname)
      verify_version_from_xml_file(version_md_pathname, version_from_file) if version_from_file

      version_additions_pathname = deposit_bag_pathname.join(VERSION_ADDITIONS_BASENAME)
      version_from_file = version_id_from_version_manifest_xml(version_additions_pathname)
      verify_version_from_xml_file(version_additions_pathname, version_from_file) if version_from_file

      version_inventory_pathname = deposit_bag_pathname.join(VERSION_INVENTORY_BASENAME)
      version_from_file = version_id_from_version_manifest_xml(version_inventory_pathname)
      verify_version_from_xml_file(version_inventory_pathname, version_from_file) if version_from_file
    end

    def last_version_id_from_version_md_xml(version_md_pathname)
      last_version_id_from_xml(version_md_pathname, '/versionMetadata/version/@versionId')
    end

    def version_id_from_version_manifest_xml(version_manifest_xml_pathname)
      last_version_id_from_xml(version_manifest_xml_pathname, '/fileInventory/@versionId')
    end

    # @return [Integer, nil] the last version id found at xpath, or nil (recording an
    #   error in result_array) when the data is missing or the file fails strict parsing
    def last_version_id_from_xml(pathname, xpath)
      # File.read instead of block-less File.open: no file handle left open after parsing
      doc = Nokogiri::XML(File.read(pathname.to_s), &:strict)
      version_id = doc.xpath(xpath).last.text unless doc.xpath(xpath).empty?
      return version_id.to_i if version_id

      err_data = {
        version_file: pathname,
        xpath: xpath
      }
      result_array << single_error_hash(VERSION_MISSING_FROM_FILE, err_data) unless version_id
      nil
    rescue StandardError => e
      err_data = {
        file_pathname: pathname,
        err_info: "#{e}\n#{e.backtrace}"
      }
      result_array << single_error_hash(INVALID_VERSION_XXX_XML, err_data)
      nil
    end

    def verify_version_from_xml_file(file_pathname, found)
      return if found == expected_new_version

      err_data = {
        file_pathname: file_pathname,
        new_version: expected_new_version,
        file_version: found
      }
      result_array << single_error_hash(VERSION_MISMATCH_TO_MOAB, err_data)
    end

    # adds to result_array if tagmanifest checksums don't match generated checksums
    def verify_tagmanifests
      tagmanifests_checksums_hash = checksums_hash_from_manifest_files(TAGMANIFEST)
      types_to_generate = checksum_types_from_manifest_checksums_hash(tagmanifests_checksums_hash)
      generated_checksums_hash = generate_tagmanifest_checksums_hash(types_to_generate)
      verify_manifest_checksums(TAGMANIFEST, tagmanifests_checksums_hash, generated_checksums_hash)
    end

    # adds to result_array if manifest checksums don't match generated checksums
    def verify_payload_manifests
      manifests_checksums_hash = checksums_hash_from_manifest_files(MANIFEST)
      types_to_generate = checksum_types_from_manifest_checksums_hash(manifests_checksums_hash)
      generated_checksums_hash = generate_payload_checksums(types_to_generate)
      verify_manifest_checksums(MANIFEST, manifests_checksums_hash, generated_checksums_hash)
    end

    # construct hash based on manifest_type-alg.txt files in bag home dir
    # key: file_name, relative to base_path, value: hash of checksum alg => checksum value
    def checksums_hash_from_manifest_files(manifest_type)
      checksums_hash = {}
      deposit_bag_pathname.children.each do |child_pathname|
        next unless child_pathname.file?

        match_result = child_pathname.basename.to_s.match("^#{manifest_type}-(.*).txt")
        next unless match_result

        checksum_type = match_result.captures.first.to_sym
        if RECOGNIZED_CHECKSUM_ALGORITHMS.include?(checksum_type)
          child_pathname.readlines.each do |line|
            # non-destructive chomp/strip: the chained `line.chomp!.strip!` raised
            # NoMethodError when a line had no trailing newline (chomp! returns nil)
            line = line.chomp.strip
            checksum, file_name = line.split(/[\s*]+/, 2)
            file_checksums = checksums_hash[file_name] || {}
            file_checksums[checksum_type] = checksum
            checksums_hash[file_name] = file_checksums
          end
        else
          result_array << single_error_hash(CHECKSUM_TYPE_UNRECOGNIZED, checksum_type: checksum_type, filename: child_pathname)
        end
      end
      checksums_hash
    end

    # generate hash of checksums by file name for bag home dir files
    def generate_tagmanifest_checksums_hash(types_to_generate)
      # all names in the bag home dir except those starting with 'tagmanifest'
      home_dir_pathnames = deposit_bag_pathname.children.reject { |file| file.basename.to_s.start_with?(TAGMANIFEST) }
      hash_with_full_pathnames = generate_checksums_hash(home_dir_pathnames, types_to_generate)
      # return hash keys as basenames only
      hash_with_full_pathnames.map { |k, v| [Pathname.new(k).basename.to_s, v] }.to_h
    end

    # generate hash of checksums by file name for bag data dir files
    def generate_payload_checksums(types_to_generate)
      data_pathnames = deposit_bag_pathname.join(DATA_DIR_BASENAME).find
      hash_with_full_pathnames = generate_checksums_hash(data_pathnames, types_to_generate)
      # return hash keys beginning with 'data/'
      hash_with_full_pathnames.map { |k, v| [Pathname.new(k).relative_path_from(deposit_bag_pathname).to_s, v] }.to_h
    end

    def generate_checksums_hash(pathnames, types_to_generate)
      file_checksums_hash = {}
      pathnames.each do |pathname|
        file_checksums_hash[pathname.to_s] = generated_checksums(pathname, types_to_generate) if pathname.file?
      end
      file_checksums_hash
    end

    # stream the file through every requested digest in 8k chunks (constant memory)
    def generated_checksums(pathname, types_to_generate)
      my_digester_hash = digester_hash(types_to_generate)
      pathname.open('r') do |stream|
        while (buffer = stream.read(8192))
          my_digester_hash.each_value { |digest| digest.update(buffer) }
        end
      end
      file_checksums = {}
      my_digester_hash.each do |checksum_type, digest|
        file_checksums[checksum_type] = digest.hexdigest
      end
      file_checksums
    end

    # @return [Hash<Symbol => Digest>] a fresh digest object per recognized checksum type;
    #   unrecognized types are reported in result_array and skipped.
    # NOTE(review): DEFAULT_CHECKSUM_TYPES is defined elsewhere in the Moab namespace.
    def digester_hash(types_to_generate = DEFAULT_CHECKSUM_TYPES)
      types_to_generate.each_with_object({}) do |checksum_type, digester_hash|
        case checksum_type
        when :md5
          digester_hash[checksum_type] = Digest::MD5.new
        when :sha1
          digester_hash[checksum_type] = Digest::SHA1.new
        when :sha256
          digester_hash[checksum_type] = Digest::SHA2.new(256)
        when :sha384
          # bug fix: these two branches wrote to `digesters`, an undefined local
          # (NameError); the accumulator block variable is `digester_hash`
          digester_hash[checksum_type] = Digest::SHA2.new(384)
        when :sha512
          digester_hash[checksum_type] = Digest::SHA2.new(512)
        else
          result_array << single_error_hash(CHECKSUM_TYPE_UNRECOGNIZED, checksum_type: checksum_type, filename: nil)
        end
        digester_hash
      end
    end

    def verify_manifest_checksums(manifest_type, manifests_checksum_hash, generated_checksum_hash)
      diff_hash = {}
      # NOTE: this is intentionally | instead of || (Array union of the key sets)
      (manifests_checksum_hash.keys | generated_checksum_hash.keys).each do |file_name|
        manifest_checksums = manifests_checksum_hash[file_name] || {}
        generated_checksums = generated_checksum_hash[file_name] || {}
        if manifest_checksums != generated_checksums
          cdh = checksums_diff_hash(manifest_checksums, generated_checksums, manifest_type, 'generated')
          diff_hash[file_name] = cdh if cdh
        end
      end
      return if diff_hash.empty?

      err_data = {
        manifest_type: manifest_type,
        diffs: diff_hash
      }
      result_array << single_error_hash(CHECKSUM_MISMATCH, err_data)
    end

    # @return [Hash, nil] per-checksum-type { left_label => val, right_label => val }
    #   for each type whose values differ, or nil when nothing differs
    def checksums_diff_hash(left_checksums, right_checksums, left_label, right_label)
      diff_hash = {}
      # NOTE: these are intentionally & and | (Array intersection / union of key sets)
      checksum_types_to_compare = (left_checksums.keys & right_checksums.keys)
      checksum_types_to_compare = (left_checksums.keys | right_checksums.keys) if checksum_types_to_compare.empty?
      checksum_types_to_compare.each do |type|
        left_checksum = left_checksums[type]
        right_checksum = right_checksums[type]
        diff_hash[type] = { left_label => left_checksum, right_label => right_checksum } if left_checksum != right_checksum
      end
      diff_hash.empty? ? nil : diff_hash
    end

    def verify_payload_size
      sizes_from_bag_info_file = bag_info_payload_size
      generated_sizes = generated_payload_size
      return if sizes_from_bag_info_file == generated_sizes

      err_data = {
        bag_info_sizes: sizes_from_bag_info_file,
        generated_sizes: generated_sizes
      }
      result_array << single_error_hash(PAYLOAD_SIZE_MISMATCH, err_data)
    end

    # parse the Payload-Oxum line ("bytes.files") of bag-info.txt into { bytes:, files: }
    # NOTE(review): when no Payload-Oxum line exists, this falls through and returns
    # the readlines Array (the value of #each), which verify_payload_size then reports
    # as a size mismatch; preserved as-is so reported errors don't change
    def bag_info_payload_size
      bag_info_txt_pathname = deposit_bag_pathname.join(BAG_INFO_TXT_BASENAME)
      bag_info_txt_pathname.readlines.each do |line|
        # non-destructive chomp/strip (see checksums_hash_from_manifest_files), and
        # skip malformed/blank lines where split yields no key (old code crashed here)
        line = line.chomp.strip
        key, value = line.split(':', 2)
        next unless key

        if key.strip == 'Payload-Oxum'
          num_bytes, num_files = value.strip.split('.') if value
          return { bytes: num_bytes.to_i, files: num_files.to_i }
        end
      end
    end

    # @return [Hash] { bytes:, files: } totals computed from the files under data/
    def generated_payload_size
      payload_pathname = deposit_bag_pathname.join(DATA_DIR_BASENAME)
      payload_pathname.find.select(&:file?).each_with_object(bytes: 0, files: 0) do |file, hash|
        hash[:bytes] += file.size
        hash[:files] += 1
      end
    end

    # checksums_hash: { fname => {:md5=>"xxx", :sha1=>"yyy"}, fname => ... }
    def checksum_types_from_manifest_checksums_hash(checksums_hash)
      types = []
      checksums_hash.each_value { |v| v.each_key { |k| types << k unless types.include?(k) } }
      types
    end

    def single_error_hash(error_code, err_data_hash)
      { error_code => error_code_msg(error_code, err_data_hash) }
    end

    def error_code_msg(error_code, err_data_hash)
      ERROR_CODE_TO_MESSAGES[error_code] % err_data_hash
    end
  end
end
@@ -1,34 +0,0 @@
1
# frozen_string_literal: true

module Moab
  # The descriptive information about a digital object's collection of versions
  #
  # ====Data Model
  # * <b>{VersionMetadata} = descriptive information about a digital object's versions</b>
  #   * {VersionMetadataEntry} [1..*] = attributes of a digital object version
  #     * {VersionMetadataEvent} [1..*] = object version lifecycle events with timestamps
  #
  # @example {include:file:spec/fixtures/data/jq937jp0017/v3/metadata/versionMetadata.xml}
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved. See {file:LICENSE.rdoc} for details.
  class VersionMetadata < Serializer::Manifest
    include HappyMapper

    # The name of the XML element used to serialize this objects data
    tag 'versionMetadata'

    # (see Serializable#initialize)
    # @versions is set to an empty Array before super so the has_many collection
    # is never nil, even when opts supplies no versions
    def initialize(opts = {})
      @versions = []
      super(opts)
    end

    # @attribute
    # @return [String] The digital object identifier
    attribute :digital_object_id, String, :tag => 'objectId'

    # @attribute
    # @return [Array<VersionMetadataEntry>] An array of version metadata entries, one per version
    has_many :versions, VersionMetadataEntry, :tag => 'version'
  end
end
@@ -1,42 +0,0 @@
1
# frozen_string_literal: true

module Moab
  # A container element to record object version lifecycle events with timestamps
  #
  # ====Data Model
  # * {VersionMetadata} = descriptive information about a digital object's versions
  #   * {VersionMetadataEntry} [1..*] = attributes of a digital object version
  #     * <b>{VersionMetadataEvent} [1..*] = object version lifecycle events with timestamps</b>
  #
  # @see VersionMetadata
  # @see VersionMetadataEntry
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved. See {file:LICENSE.rdoc} for details.
  class VersionMetadataEvent < Serializer::Serializable
    include HappyMapper

    # The name of the XML element used to serialize this objects data
    tag 'event'

    # (see Serializable#initialize)
    def initialize(opts = {})
      super(opts)
    end

    # @attribute
    # @return [String] The type of event
    attribute :type, String

    # @attribute
    # @return [String] The date and time of an event
    attribute :datetime, String

    # Custom writer overriding the accessor generated by the attribute macro above:
    # stores the value normalized through Moab::UtcTime.input
    def datetime=(event_datetime)
      @datetime = Moab::UtcTime.input(event_datetime)
    end

    # Custom reader overriding the generated accessor:
    # returns the stored value formatted by Moab::UtcTime.output
    def datetime
      Moab::UtcTime.output(@datetime)
    end
  end
end
@@ -1,30 +0,0 @@
1
# frozen_string_literal: true

module Stanford
  # Utility Class for extracting content or other information from a Fedora Instance
  #
  # ====Data Model
  # * {DorMetadata} = utility methods for interfacing with Stanford metadata files (esp contentMetadata)
  #   * {ContentInventory} [1..1] = utilities for transforming contentMetadata to versionInventory and doing comparisons
  #   * <b>{ActiveFedoraObject} [1..*] = utility for extracting content or other information from a Fedora Instance</b>
  #
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved. See {file:LICENSE.rdoc} for details.
  class ActiveFedoraObject
    # @param fedora_object [Object] The Active Fedora representation of the Fedora Object
    # @return [Stanford::ActiveFedoraObject] Create a new wrapper around the given Fedora object
    def initialize(fedora_object)
      @fedora_object = fedora_object
    end

    # @return [Object] The Active Fedora representation of the Fedora Object
    attr_accessor :fedora_object

    # @api external
    # @param ds_id [String] The datastream identifier
    # @return [String] The content of the specified datastream
    def get_datastream_content(ds_id)
      @fedora_object.datastreams[ds_id].content
    end
  end
end
@@ -1,43 +0,0 @@
1
# frozen_string_literal: true

module Stanford
  # Stanford-specific utility methods for interfacing with DOR metadata files
  #
  # ====Data Model
  # * <b>{DorMetadata} = utility methods for interfacing with Stanford metadata files (esp contentMetadata)</b>
  #   * {ContentInventory} [1..1] = utilities for transforming contentMetadata to versionInventory and doing comparisons
  #   * {ActiveFedoraObject} [1..*] = utility for extracting content or other information from a Fedora Instance
  #
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved. See {file:LICENSE.rdoc} for details.
  class DorMetadata
    # @return [String] The digital object identifier (druid)
    attr_accessor :digital_object_id

    # @return [Integer] \@versionId = The ordinal version number
    attr_accessor :version_id

    # @param digital_object_id [String] The digital object identifier
    # @param version_id [Integer] The ordinal version number
    # @return [Stanford::DorMetadata]
    def initialize(digital_object_id, version_id = nil)
      @digital_object_id = digital_object_id
      @version_id = version_id
    end

    # Build a version FileInventory from a directory on disk: a 'preserve' content
    # group derived from the directory's contentMetadata.xml, plus a 'metadata'
    # group generated from the directory's own files.
    # @api internal
    # @param directory [String] The location of the directory to be inventoried
    # @param version_id (see #initialize)
    # @return [FileInventory] Inventory of the files under the specified directory
    def inventory_from_directory(directory, version_id = nil)
      version_id ||= @version_id
      inventory = Moab::FileInventory.new(type: 'version', digital_object_id: @digital_object_id, version_id: version_id)
      cm_xml = File.read(File.join(directory, 'contentMetadata.xml'))
      inventory.groups << Stanford::ContentInventory.new.group_from_cm(cm_xml, 'preserve')
      inventory.groups << Moab::FileGroup.new(group_id: 'metadata').group_from_directory(directory)
      inventory
    end
  end
end
@@ -1,38 +0,0 @@
1
# frozen_string_literal: true

require 'find'

module Stanford
  ##
  # methods for dealing with a directory which stores Moab objects
  class MoabStorageDirectory
    DRUID_TREE_REGEXP = '[[:lower:]]{2}/\\d{3}/[[:lower:]]{2}/\\d{4}'
    DRUID_REGEXP = '[[:lower:]]{2}\\d{3}[[:lower:]]{2}\\d{4}'

    # Walk storage_dir and yield [druid, full_path, match_data] for every
    # directory whose path matches the druid-tree moab layout. Files are
    # skipped outright, and matched moab directories are not descended into.
    def self.find_moab_paths(storage_dir)
      Find.find(storage_dir) do |path|
        # only directories can be moab roots; prune files immediately
        Find.prune unless File.directory?(path)
        match_data = storage_dir_regexp(storage_dir).match(path)
        next unless match_data

        yield match_data[1], path, match_data
        Find.prune # a moab's interior holds nothing we need here
      end
    end

    # @return [Array<String>] druids of every moab found under storage_dir
    def self.list_moab_druids(storage_dir)
      [].tap do |druids|
        find_moab_paths(storage_dir) { |druid, _path, _path_match_data| druids << druid }
      end
    end

    private_class_method def self.storage_dir_regexps
      @storage_dir_regexps ||= {}
    end

    # caching the compiled regexp per storage_dir makes repeated crawls many
    # times faster (e.g. went from ~2200 s to crawl disk11, down to ~300 s)
    private_class_method def self.storage_dir_regexp(storage_dir)
      storage_dir_regexps[storage_dir] ||= Regexp.new("^#{storage_dir}/#{DRUID_TREE_REGEXP}/(#{DRUID_REGEXP})$")
    end
  end
end