moab-versioning 4.4.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,328 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Moab
  # Given a deposit bag, ensures the contents are valid for becoming a StorageObjectVersion.
  # This is a Shameless Green implementation, combining code from:
  #  - sdr-preservation-core/lib/sdr_ingest/validate_bag  <-- old preservation robots
  #  - archive-utils/lib/bagit_bag  <-- gem only used by sdr-preservation-robots
  #  - archive-utils/lib/file_fixity
  #  - archive-utils/lib/fixity
  # This code adds duplication to this gem (see github issue #119);
  # for example, computing checksums is done in
  #  - deposit_bag_validator
  #  - file_signature
  class DepositBagValidator
    # Error codes; each is a key of ERROR_CODE_TO_MESSAGES.
    BAG_DIR_NOT_FOUND = :bag_dir_not_found
    CHECKSUM_MISMATCH = :checksum_mismatch
    CHECKSUM_TYPE_UNRECOGNIZED = :checksum_type_unrecognized
    INVALID_VERSION_XXX_XML = :invalid_versionXxx_xml
    PAYLOAD_SIZE_MISMATCH = :payload_size_mismatch
    REQUIRED_FILE_NOT_FOUND = :required_file_not_found
    VERSION_MISMATCH_TO_MOAB = :version_mismatch_to_moab
    VERSION_MISSING_FROM_FILE = :version_missing_from_file

    # Format-string templates (filled in with String#%) for each error code.
    ERROR_CODE_TO_MESSAGES = {
      BAG_DIR_NOT_FOUND => "Deposit bag directory %{bag_dir} does not exist",
      CHECKSUM_MISMATCH => "Failed %{manifest_type} verification. Differences: \n%{diffs}",
      CHECKSUM_TYPE_UNRECOGNIZED => "Checksum type unrecognized: %{checksum_type}; file: %{filename}",
      INVALID_VERSION_XXX_XML => "Unable to parse %{file_pathname}: %{err_info}",
      PAYLOAD_SIZE_MISMATCH => "Failed payload size verification. Expected: %{bag_info_sizes}; found: %{generated_sizes}",
      REQUIRED_FILE_NOT_FOUND => "Deposit bag required file %{file_pathname} not found",
      VERSION_MISMATCH_TO_MOAB => "Version mismatch in %{file_pathname}: Moab expected %{new_version}; found %{file_version}",
      VERSION_MISSING_FROM_FILE => "Version xml file %{version_file} missing data at %{xpath} containing version id"
    }.freeze

    REQUIRED_MANIFEST_CHECKSUM_TYPE = 'sha256'
    RECOGNIZED_CHECKSUM_ALGORITHMS = %i[md5 sha1 sha256 sha384 sha512].freeze

    TAGMANIFEST = 'tagmanifest'
    MANIFEST = 'manifest'
    DATA_DIR_BASENAME = 'data'
    BAG_INFO_TXT_BASENAME = 'bag-info.txt'
    VERSION_ADDITIONS_BASENAME = 'versionAdditions.xml'
    VERSION_INVENTORY_BASENAME = 'versionInventory.xml'
    VERSION_METADATA_PATH = "#{DATA_DIR_BASENAME}/metadata/versionMetadata.xml"

    # Paths (relative to the bag directory) that must exist before any other check runs.
    REQUIRED_BAG_FILES = [
      DATA_DIR_BASENAME,
      'bagit.txt',
      BAG_INFO_TXT_BASENAME,
      "#{MANIFEST}-#{REQUIRED_MANIFEST_CHECKSUM_TYPE}.txt",
      "#{TAGMANIFEST}-#{REQUIRED_MANIFEST_CHECKSUM_TYPE}.txt",
      VERSION_ADDITIONS_BASENAME,
      VERSION_INVENTORY_BASENAME,
      VERSION_METADATA_PATH
    ].freeze

    attr_reader :deposit_bag_pathname, :expected_new_version, :result_array

    # @param storage_object [#deposit_bag_pathname, #current_version_id] the object the bag is deposited for;
    #   the bag is expected to carry version current_version_id + 1
    def initialize(storage_object)
      @deposit_bag_pathname = storage_object.deposit_bag_pathname
      @expected_new_version = storage_object.current_version_id + 1
      @result_array = []
    end

    # Runs every check against the deposit bag.
    # Stops early if the bag directory or any required file is missing, because the later checks read those files.
    # Errors accumulate in result_array, which is never reset, so a second call appends to the first call's results.
    # @return [Array<Hash>] tiny error hashes ({ error_code => message }), allowing multiple occurrences of a single error code
    def validation_errors
      return [single_error_hash(BAG_DIR_NOT_FOUND, bag_dir: deposit_bag_pathname)] unless bag_dir_exists?
      return result_array unless required_bag_files_exist?

      verify_version
      verify_tagmanifests
      verify_payload_size
      verify_payload_manifests
      result_array # attr that accumulates any errors encountered along the way
    end

    private

    def bag_dir_exists?
      deposit_bag_pathname.exist?
    end

    # Records a REQUIRED_FILE_NOT_FOUND error for each missing required path.
    # Assumes this is called when result_array is empty, as subsequent checks will use these required files.
    # @return [Boolean] true when result_array is empty after the check
    def required_bag_files_exist?
      REQUIRED_BAG_FILES.each do |filename|
        pathname = deposit_bag_pathname.join(filename)
        result_array << single_error_hash(REQUIRED_FILE_NOT_FOUND, file_pathname: pathname) unless pathname.exist?
      end
      result_array.empty?
    end

    # Compares the version id found in versionMetadata.xml, versionAdditions.xml and versionInventory.xml
    # (in that order) against expected_new_version.
    def verify_version
      version_md_pathname = deposit_bag_pathname.join(VERSION_METADATA_PATH)
      version_from_file = last_version_id_from_version_md_xml(version_md_pathname)
      verify_version_from_xml_file(version_md_pathname, version_from_file) if version_from_file

      [VERSION_ADDITIONS_BASENAME, VERSION_INVENTORY_BASENAME].each do |manifest_basename|
        manifest_pathname = deposit_bag_pathname.join(manifest_basename)
        version_from_file = version_id_from_version_manifest_xml(manifest_pathname)
        verify_version_from_xml_file(manifest_pathname, version_from_file) if version_from_file
      end
    end

    def last_version_id_from_version_md_xml(version_md_pathname)
      last_version_id_from_xml(version_md_pathname, '/versionMetadata/version/@versionId')
    end

    def version_id_from_version_manifest_xml(version_manifest_xml_pathname)
      last_version_id_from_xml(version_manifest_xml_pathname, '/fileInventory/@versionId')
    end

    # Strictly parses the xml file and reads the last node matching xpath as an Integer.
    # Adds VERSION_MISSING_FROM_FILE when nothing matches the xpath, or INVALID_VERSION_XXX_XML
    # when reading/parsing raises.
    # @return [Integer, nil] the version id, or nil when an error was recorded
    def last_version_id_from_xml(pathname, xpath)
      # block form of File.open closes the handle even when strict parsing raises
      doc = File.open(pathname.to_s) { |file| Nokogiri::XML(file, &:strict) }
      version_nodes = doc.xpath(xpath)
      return version_nodes.last.text.to_i unless version_nodes.empty?

      err_data = {
        version_file: pathname,
        xpath: xpath
      }
      result_array << single_error_hash(VERSION_MISSING_FROM_FILE, err_data)
      nil
    rescue StandardError => e
      err_data = {
        file_pathname: pathname,
        err_info: "#{e}\n#{e.backtrace}"
      }
      result_array << single_error_hash(INVALID_VERSION_XXX_XML, err_data)
      nil
    end

    # Adds VERSION_MISMATCH_TO_MOAB to result_array unless found equals expected_new_version.
    def verify_version_from_xml_file(file_pathname, found)
      return if found == expected_new_version

      err_data = {
        file_pathname: file_pathname,
        new_version: expected_new_version,
        file_version: found
      }
      result_array << single_error_hash(VERSION_MISMATCH_TO_MOAB, err_data)
    end

    # adds to result_array if tagmanifest checksums don't match generated checksums
    def verify_tagmanifests
      tagmanifests_checksums_hash = checksums_hash_from_manifest_files(TAGMANIFEST)
      types_to_generate = checksum_types_from_manifest_checksums_hash(tagmanifests_checksums_hash)
      generated_checksums_hash = generate_tagmanifest_checksums_hash(types_to_generate)
      verify_manifest_checksums(TAGMANIFEST, tagmanifests_checksums_hash, generated_checksums_hash)
    end

    # adds to result_array if manifest checksums don't match generated checksums
    def verify_payload_manifests
      manifests_checksums_hash = checksums_hash_from_manifest_files(MANIFEST)
      types_to_generate = checksum_types_from_manifest_checksums_hash(manifests_checksums_hash)
      generated_checksums_hash = generate_payload_checksums(types_to_generate)
      verify_manifest_checksums(MANIFEST, manifests_checksums_hash, generated_checksums_hash)
    end

    # Construct hash based on manifest_type-alg.txt files in bag home dir.
    # Adds CHECKSUM_TYPE_UNRECOGNIZED for a manifest file whose algorithm is not in RECOGNIZED_CHECKSUM_ALGORITHMS.
    # @param manifest_type [String] MANIFEST or TAGMANIFEST
    # @return [Hash] key: file_name, relative to base_path, value: hash of checksum alg => checksum value
    def checksums_hash_from_manifest_files(manifest_type)
      # literal dot and end anchor so only "<manifest_type>-<alg>.txt" itself is treated as a manifest
      manifest_name_regexp = /\A#{Regexp.escape(manifest_type)}-(.*)\.txt\z/
      checksums_hash = {}
      deposit_bag_pathname.children.each do |child_pathname|
        next unless child_pathname.file?

        match_result = manifest_name_regexp.match(child_pathname.basename.to_s)
        next unless match_result

        checksum_type = match_result.captures.first.to_sym
        unless RECOGNIZED_CHECKSUM_ALGORITHMS.include?(checksum_type)
          result_array << single_error_hash(CHECKSUM_TYPE_UNRECOGNIZED, checksum_type: checksum_type, filename: child_pathname)
          next
        end

        child_pathname.readlines.each do |line|
          # non-bang strip: chomp!/strip! return nil when nothing changes (e.g. last line without a newline)
          line = line.strip
          next if line.empty? # a blank line carries no checksum entry

          checksum, file_name = line.split(/[\s*]+/, 2)
          file_checksums = checksums_hash[file_name] || {}
          file_checksums[checksum_type] = checksum
          checksums_hash[file_name] = file_checksums
        end
      end
      checksums_hash
    end

    # generate hash of checksums by file name for bag home dir files
    def generate_tagmanifest_checksums_hash(types_to_generate)
      # all names in the bag home dir except those starting with 'tagmanifest'
      home_dir_pathnames = deposit_bag_pathname.children.reject { |file| file.basename.to_s.start_with?(TAGMANIFEST) }
      hash_with_full_pathnames = generate_checksums_hash(home_dir_pathnames, types_to_generate)
      # return hash keys as basenames only
      hash_with_full_pathnames.map { |k, v| [Pathname.new(k).basename.to_s, v] }.to_h
    end

    # generate hash of checksums by file name for bag data dir files
    def generate_payload_checksums(types_to_generate)
      data_pathnames = deposit_bag_pathname.join(DATA_DIR_BASENAME).find
      hash_with_full_pathnames = generate_checksums_hash(data_pathnames, types_to_generate)
      # return hash keys beginning with 'data/'
      hash_with_full_pathnames.map { |k, v| [Pathname.new(k).relative_path_from(deposit_bag_pathname).to_s, v] }.to_h
    end

    # @param pathnames [Enumerable<Pathname>] candidates; anything that is not a regular file is skipped
    # @return [Hash] full path String => { checksum type => hex digest }
    def generate_checksums_hash(pathnames, types_to_generate)
      file_checksums_hash = {}
      pathnames.each do |pathname|
        file_checksums_hash[pathname.to_s] = generated_checksums(pathname, types_to_generate) if pathname.file?
      end
      file_checksums_hash
    end

    # Reads the file once, in 8192 byte chunks, feeding every requested digester.
    # @return [Hash] checksum type => hex digest
    def generated_checksums(pathname, types_to_generate)
      my_digester_hash = digester_hash(types_to_generate)
      pathname.open('r') do |stream|
        while (buffer = stream.read(8192))
          my_digester_hash.each_value { |digest| digest.update(buffer) }
        end
      end
      my_digester_hash.each_with_object({}) do |(checksum_type, digest), file_checksums|
        file_checksums[checksum_type] = digest.hexdigest
      end
    end

    # Builds a fresh digester per requested checksum type; an unknown type adds CHECKSUM_TYPE_UNRECOGNIZED
    # to result_array and gets no digester.
    # NOTE(review): DEFAULT_CHECKSUM_TYPES is not defined in this class -- confirm it is defined elsewhere
    #   in the gem; every caller in this class passes the types explicitly.
    # @return [Hash] checksum type => Digest instance
    def digester_hash(types_to_generate = DEFAULT_CHECKSUM_TYPES)
      types_to_generate.each_with_object({}) do |checksum_type, digesters|
        case checksum_type
        when :md5
          digesters[checksum_type] = Digest::MD5.new
        when :sha1
          digesters[checksum_type] = Digest::SHA1.new
        when :sha256
          digesters[checksum_type] = Digest::SHA2.new(256)
        when :sha384
          digesters[checksum_type] = Digest::SHA2.new(384)
        when :sha512
          digesters[checksum_type] = Digest::SHA2.new(512)
        else
          result_array << single_error_hash(CHECKSUM_TYPE_UNRECOGNIZED, checksum_type: checksum_type, filename: nil)
        end
      end
    end

    # Adds a single CHECKSUM_MISMATCH error (listing every differing file) when the manifest
    # checksums and the generated checksums disagree, including files present on only one side.
    def verify_manifest_checksums(manifest_type, manifests_checksum_hash, generated_checksum_hash)
      diff_hash = {}
      # NOTE: this is intentionally | instead of ||
      (manifests_checksum_hash.keys | generated_checksum_hash.keys).each do |file_name|
        manifest_checksums = manifests_checksum_hash[file_name] || {}
        checksums_from_file = generated_checksum_hash[file_name] || {}
        next if manifest_checksums == checksums_from_file

        cdh = checksums_diff_hash(manifest_checksums, checksums_from_file, manifest_type, 'generated')
        diff_hash[file_name] = cdh if cdh
      end
      return if diff_hash.empty?

      err_data = {
        manifest_type: manifest_type,
        diffs: diff_hash
      }
      result_array << single_error_hash(CHECKSUM_MISMATCH, err_data)
    end

    # Compares the checksum types both sides have in common; if they share none, compares every type either side has.
    # @return [Hash, nil] checksum type => { left_label => value, right_label => value } for differing types; nil if none differ
    def checksums_diff_hash(left_checksums, right_checksums, left_label, right_label)
      diff_hash = {}
      # NOTE: these are intentionally & and | instead of && and ||
      checksum_types_to_compare = (left_checksums.keys & right_checksums.keys)
      checksum_types_to_compare = (left_checksums.keys | right_checksums.keys) if checksum_types_to_compare.empty?
      checksum_types_to_compare.each do |type|
        left_checksum = left_checksums[type]
        right_checksum = right_checksums[type]
        diff_hash[type] = { left_label => left_checksum, right_label => right_checksum } if left_checksum != right_checksum
      end
      diff_hash.empty? ? nil : diff_hash
    end

    # Adds PAYLOAD_SIZE_MISMATCH when the Payload-Oxum from bag-info.txt differs from what is found in the data dir.
    def verify_payload_size
      sizes_from_bag_info_file = bag_info_payload_size
      generated_sizes = generated_payload_size
      return if sizes_from_bag_info_file == generated_sizes

      err_data = {
        bag_info_sizes: sizes_from_bag_info_file,
        generated_sizes: generated_sizes
      }
      result_array << single_error_hash(PAYLOAD_SIZE_MISMATCH, err_data)
    end

    # Reads the first 'Payload-Oxum: <bytes>.<files>' line of bag-info.txt.
    # @return [Hash, nil] { bytes: Integer, files: Integer }, or nil when the file has no Payload-Oxum line
    def bag_info_payload_size
      bag_info_txt_pathname = deposit_bag_pathname.join(BAG_INFO_TXT_BASENAME)
      bag_info_txt_pathname.readlines.each do |line|
        # non-bang strip: chomp!/strip! return nil when nothing changes (e.g. last line without a newline)
        key, value = line.strip.split(':', 2)
        next unless key && key.strip == 'Payload-Oxum' # key is nil for a blank line

        num_bytes, num_files = value.strip.split('.') if value
        return { bytes: num_bytes.to_i, files: num_files.to_i }
      end
      nil
    end

    # @return [Hash] { bytes: total size, files: count } over every regular file under the data dir
    def generated_payload_size
      payload_pathname = deposit_bag_pathname.join(DATA_DIR_BASENAME)
      payload_pathname.find.select(&:file?).each_with_object(bytes: 0, files: 0) do |file, hash|
        hash[:bytes] += file.size
        hash[:files] += 1
      end
    end

    # checksums_hash: { fname => {:md5=>"xxx", :sha1=>"yyy"}, fname => ... }
    # @return [Array<Symbol>] each checksum type used in the hash, once, in first-seen order
    def checksum_types_from_manifest_checksums_hash(checksums_hash)
      types = []
      checksums_hash.each_value { |v| v.each_key { |k| types << k unless types.include?(k) } }
      types
    end

    # @return [Hash] { error_code => formatted message }
    def single_error_hash(error_code, err_data_hash)
      { error_code => error_code_msg(error_code, err_data_hash) }
    end

    def error_code_msg(error_code, err_data_hash)
      ERROR_CODE_TO_MESSAGES[error_code] % err_data_hash
    end
  end
end
@@ -1,34 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Moab
  # Descriptive information about the set of versions belonging to one digital object.
  #
  # ====Data Model
  # * <b>{VersionMetadata} = descriptive information about a digital object's versions</b>
  #   * {VersionMetadataEntry} [1..*] = attributes of a digital object version
  #     * {VersionMetadataEvent} [1..*] = object version lifecycle events with timestamps
  #
  # @example {include:file:spec/fixtures/data/jq937jp0017/v3/metadata/versionMetadata.xml}
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class VersionMetadata < Serializer::Manifest
    include HappyMapper

    # XML element name this object's data is serialized under
    tag 'versionMetadata'

    # Starts with an empty versions list, then hands opts to the superclass initializer.
    # (see Serializable#initialize)
    def initialize(opts = {})
      @versions = []
      super
    end

    # @attribute
    # @return [String] The digital object identifier (xml name: objectId)
    attribute :digital_object_id, String, tag: 'objectId'

    # @attribute
    # @return [Array<VersionMetadataEntry>] The version metadata entries, one per version (xml name: version)
    has_many :versions, VersionMetadataEntry, tag: 'version'
  end
end
@@ -1,42 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Moab
  # Container element recording a timestamped lifecycle event of an object version.
  #
  # ====Data Model
  # * {VersionMetadata} = descriptive information about a digital object's versions
  #   * {VersionMetadataEntry} [1..*] = attributes of a digital object version
  #     * <b>{VersionMetadataEvent} [1..*] = object version lifecycle events with timestamps</b>
  #
  # @see VersionMetadata
  # @see VersionMetadataEntry
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class VersionMetadataEvent < Serializer::Serializable
    include HappyMapper

    # XML element name this object's data is serialized under
    tag 'event'

    # Hands opts straight to the superclass initializer.
    # (see Serializable#initialize)
    def initialize(opts = {})
      super
    end

    # @attribute
    # @return [String] The type of event
    attribute :type, String

    # @attribute
    # @return [String] The date and time of an event
    attribute :datetime, String

    # Stores the event time after passing it through Moab::UtcTime.input.
    def datetime=(event_datetime)
      normalized_datetime = Moab::UtcTime.input(event_datetime)
      @datetime = normalized_datetime
    end

    # @return the stored event time, passed through Moab::UtcTime.output
    def datetime
      stored_datetime = @datetime
      Moab::UtcTime.output(stored_datetime)
    end
  end
end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Stanford
  # Utility class for pulling content or other information out of a Fedora instance
  #
  # ====Data Model
  # * {DorMetadata} = utility methods for interfacing with Stanford metadata files (esp contentMetadata)
  #   * {ContentInventory} [1..1] = utilities for transforming contentMetadata to versionInventory and doing comparisons
  #   * <b>{ActiveFedoraObject} [1..*] = utility for extracting content or other information from a Fedora Instance</b>
  #
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class ActiveFedoraObject
    # @return [Object] The Active Fedora representation of the Fedora Object
    attr_accessor :fedora_object

    # @param fedora_object [Object] The Active Fedora representation of the Fedora Object
    # @return [Stanford::ActiveFedoraObject] A wrapper holding the given Fedora object
    def initialize(fedora_object)
      @fedora_object = fedora_object
    end

    # Looks the datastream up by id on the wrapped object and returns its content.
    # @api external
    # @param ds_id [String] The datastream identifier
    # @return [String] The content of the specified datastream
    def get_datastream_content(ds_id)
      datastream = @fedora_object.datastreams[ds_id]
      datastream.content
    end
  end
end
@@ -1,43 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
module Stanford
  # Stanford-specific utility methods for interfacing with DOR metadata files
  #
  # ====Data Model
  # * <b>{DorMetadata} = utility methods for interfacing with Stanford metadata files (esp contentMetadata)</b>
  #   * {ContentInventory} [1..1] = utilities for transforming contentMetadata to versionInventory and doing comparisons
  #   * {ActiveFedoraObject} [1..*] = utility for extracting content or other information from a Fedora Instance
  #
  # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
  #   All rights reserved.  See {file:LICENSE.rdoc} for details.
  class DorMetadata
    # @return [String] The digital object identifier (druid)
    attr_accessor :digital_object_id

    # @return [Integer] \@versionId = The ordinal version number
    attr_accessor :version_id

    # @param digital_object_id [String] The digital object identifier
    # @param version_id [Integer] The ordinal version number
    # @return [Stanford::DorMetadata]
    def initialize(digital_object_id, version_id = nil)
      @digital_object_id = digital_object_id
      @version_id = version_id
    end

    # Builds a 'version' type inventory with two groups, in this order: the group derived from
    # the directory's contentMetadata.xml, then a 'metadata' group built from the directory itself.
    # @api internal
    # @param directory [String] The location of the directory to be inventoried
    # @param version_id (see #initialize) Falls back to this object's version_id when nil
    # @return [FileInventory] Inventory of the files under the specified directory
    def inventory_from_directory(directory, version_id = nil)
      version_id ||= @version_id
      version_inventory = Moab::FileInventory.new(type: 'version', digital_object_id: @digital_object_id, version_id: version_id)
      # File.read rather than IO.read: IO.read treats a path starting with '|' as a command to run
      content_metadata = File.read(File.join(directory, 'contentMetadata.xml'))
      content_group = Stanford::ContentInventory.new.group_from_cm(content_metadata, 'preserve')
      version_inventory.groups << content_group
      metadata_group = Moab::FileGroup.new(group_id: 'metadata').group_from_directory(directory)
      version_inventory.groups << metadata_group
      version_inventory
    end
  end
end
@@ -1,38 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'find'
4
-
5
module Stanford
  ##
  # methods for dealing with a directory which stores Moab objects
  class MoabStorageDirectory
    # Regexp source fragments: a druid tree path (e.g. bj/102/hs/9687) and a bare druid (e.g. bj102hs9687).
    DRUID_TREE_REGEXP = '[[:lower:]]{2}/\\d{3}/[[:lower:]]{2}/\\d{4}'
    DRUID_REGEXP = '[[:lower:]]{2}\\d{3}[[:lower:]]{2}\\d{4}'

    # Walks storage_dir and yields once for every directory shaped like
    # <storage_dir>/<druid tree>/<druid>, without descending into it.
    # @param storage_dir [String] the directory to crawl
    # @yield [druid, path, path_match_data] the druid, the full path, and the MatchData object
    def self.find_moab_paths(storage_dir)
      moab_path_regexp = storage_dir_regexp(storage_dir) # same for every path, so look it up once
      Find.find(storage_dir) do |path|
        Find.prune unless File.directory?(path) # don't bother with a matching on files, we only care about directories
        path_match_data = moab_path_regexp.match(path)
        next unless path_match_data

        yield path_match_data[1], path, path_match_data # yield the druid, the full path, and the MatchData object
        Find.prune # we don't care about what's in the moab dir, we just want the paths that look like moabs
      end
    end

    # @param storage_dir [String] the directory to crawl
    # @return [Array<String>] the druid of every moab found by find_moab_paths, in crawl order
    def self.list_moab_druids(storage_dir)
      druids = []
      find_moab_paths(storage_dir) { |druid, _path, _path_match_data| druids << druid }
      druids
    end

    private_class_method def self.storage_dir_regexps
      @storage_dir_regexps ||= {}
    end

    # this regexp caching makes things many times faster (e.g. went from ~2200 s to crawl disk11, down to ~300 s)
    private_class_method def self.storage_dir_regexp(storage_dir)
      storage_dir_regexps[storage_dir] ||= begin
        # Find yields children as File.join(storage_dir, name), so a trailing slash never shows up in a child path.
        # Escaping keeps regexp metacharacters in the directory name literal, and keeps capture group 1 the druid.
        escaped_base = Regexp.escape(storage_dir.to_s.chomp('/'))
        Regexp.new("\\A#{escaped_base}/#{DRUID_TREE_REGEXP}/(#{DRUID_REGEXP})\\z")
      end
    end
  end
end