ocfl-tools 0.9.14

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
module OcflTools
  # General-purpose helpers shared across OcflTools: version-name conversion,
  # deep copying, file digest generation and checksum comparison.
  module Utils
    # Maps each digest name accepted by {generate_file_digest} to the name of
    # the implementing class under the Digest namespace. The class is looked
    # up at call time, so this file can be loaded before 'digest' is required.
    DIGEST_CLASS_NAMES = {
      'md5' => :MD5,
      'sha1' => :SHA1,
      'sha256' => :SHA256,
      'sha512' => :SHA512
    }.freeze
    private_constant :DIGEST_CLASS_NAMES

    # Converts an [Integer] version to its [String] directory name, e.g. v0001.
    # Adjust the configured version_format to suit local needs.
    # @param [Integer] version the version number to format.
    # @return [String] the version formatted with OcflTools.config.version_format.
    def self.version_int_to_string(version)
      OcflTools.config.version_format % version.to_i
    end

    # Converts a [String] version name to an [Integer].
    # OCFL spec requires string versions to start with 'v', so the text after
    # the first 'v' is taken and converted to an integer.
    # @param [String] version_name string to convert to an integer.
    # @return [Integer] the version as an integer; 0 if nothing follows a 'v'.
    def self.version_string_to_int(version_name)
      version_name.split('v')[1].to_i
    end

    # Makes a deep (not shallow) copy of an object, mostly used for hashes.
    # When state is copied from a prior version, the copy must not remain
    # mutable through that prior version's hash, so the object is serialized
    # and re-loaded.
    # @param [Object] o object to make a deep copy of.
    # @return [Object] a new object with no links to the previous one.
    def self.deep_copy(o)
      Marshal.load(Marshal.dump(o))
    end

    # Given a fully-resolvable file path, calculate and return its digest.
    # The file is opened as a plain file in binary mode and streamed through
    # the digest, so large files are not read into memory in one piece.
    # @param [String] file fully-resolvable filesystem path to a file.
    # @param [String] digest algorithm name: 'md5', 'sha1', 'sha256' or 'sha512'.
    # @return [String] hex checksum of the file using the requested algorithm.
    # @raise [RuntimeError] if the digest name is not one of the supported values.
    def self.generate_file_digest(file, digest)
      class_name = DIGEST_CLASS_NAMES[digest]
      raise 'Unknown digest type!' if class_name.nil?

      # Digest::Class.file uses File.open, so a path starting with '|' is
      # treated as a filename rather than a command to run (unlike Kernel#open).
      Digest.const_get(class_name).file(file).hexdigest
    end

    # Compares checksums computed from disk against those recorded in an inventory
    # and records the outcome in a results object.
    # @param [Hash] disk_checksums first hash of [ filepath => digest ] to compare.
    # @param [Hash] inventory_checksums second hash of [ filepath => digest ] to compare.
    # @param [OcflTools::OcflResults] results optional results instance to put results into.
    # @param [String] context label passed through to every recorded result.
    # @return [OcflTools::OcflResults] the results instance that was given or created.
    # @raise [RuntimeError] if results is not an OcflTools::OcflResults.
    def self.compare_hash_checksums(disk_checksums:,
                                    inventory_checksums:,
                                    results: OcflTools::OcflResults.new,
                                    context: 'verify_checksums')
      raise 'You need to give me a results instance!' unless results.is_a?(OcflTools::OcflResults)

      # If everything is perfect, the two hashes are identical and we are done.
      if inventory_checksums == disk_checksums
        results.ok('O200', context, 'All digests successfully verified.')
        return results
      end

      # Otherwise work out what differs: files only on disk, files only in the
      # inventory, and files present in both whose digests disagree.
      (disk_checksums.keys - inventory_checksums.keys).each do |missing|
        results.error('E111', context, "#{missing} found on disk but missing from inventory.json.")
      end

      (inventory_checksums.keys - disk_checksums.keys).each do |missing|
        results.error('E111', context, "#{missing} in inventory but not found on disk.")
      end

      inventory_checksums.each do |file, digest|
        # A mismatch can only be reported for files present in both hashes.
        next unless disk_checksums.key?(file)
        next if disk_checksums[file] == digest

        results.error('E111', context, "#{file} digest in inventory does not match digest computed from disk")
      end
      results
    end
  end
end
@@ -0,0 +1,195 @@
1
+ # frozen_string_literal: true
2
+
3
module OcflTools
  module Utils
    # Filesystem helpers for locating version directories, content files and
    # inventory files beneath an OCFL object root. None of these methods change
    # the process working directory.
    module Files
      # Name of a version directory: 'v' followed by digits (zero-padded or not).
      VERSION_DIR_PATTERN = /\Av\d+\z/.freeze

      # Given a directory, return a list of all files (no dirs or special files) found beneath it.
      # @param [Pathname] directory full file path to directory to search.
      # @return [Array] of file paths, relative to the given directory, found in it and all its sub-directories.
      def self.get_dir_files(directory)
        # Don't crash out if the requested dir doesn't exist: there are simply no files in it.
        return [] unless Dir.exist?(directory)

        root = directory.to_s
        # Glob relative to root instead of chdir'ing into it, so the caller's
        # working directory is left untouched.
        Dir.glob('**/*', base: root).select { |entry| File.file?(File.join(root, entry)) }
      end

      # Given an object root and a version, return the files on disk in the appropriate content dir.
      # @param [String] object_root_dir path to the object root.
      # @param [Integer] version the version number to list files for.
      # @return [Array] of filepaths, each prefixed with the object root, version and content directory.
      def self.get_version_dir_files(object_root_dir, version)
        version_name = get_version_format(object_root_dir) % version.to_i
        # contentDirectory comes from the most recent inventory (or its default value).
        inventory = get_latest_inventory(object_root_dir)
        content_directory = OcflTools::Utils::Inventory.get_contentDirectory(inventory)
        content_path = "#{object_root_dir}/#{version_name}/#{content_directory}"
        expand_filepaths(get_dir_files(content_path), content_path)
      end

      # Given an object root and two versions, get the files on disk for that range of versions (inclusive).
      # The two versions may be given in either order.
      # @param [String] object_root_dir path to the object root.
      # @param [Integer] version1 one end of the version range.
      # @param [Integer] version2 the other end of the version range.
      # @return [Array] of filepaths for every version in the range; empty if none of them hold files.
      def self.get_versions_dir_files(object_root_dir, version1, version2)
        lowest, highest = [version1, version2].minmax
        (lowest..highest).flat_map { |version| get_version_dir_files(object_root_dir, version) }
      end

      # Given an object root directory, deduce and return the version directories by inspecting disk.
      # Only directories named 'v' plus a positive integer count as version directories.
      # @param [String] object_root_dir path to the object root.
      # @return [Array] of version directory names, ordered by version number (so v2 precedes v10).
      # @raise [RuntimeError] if the root does not exist, has no directories, or has no version directories.
      def self.get_version_directories(object_root_dir)
        raise 'Directory does not exist!' unless Dir.exist?(object_root_dir)

        root = object_root_dir.to_s
        child_dirs = Dir.glob('*', base: root).select { |entry| File.directory?(File.join(root, entry)) }
        raise "No directories found in #{object_root_dir}!" if child_dirs.empty?

        version_directories = child_dirs.select do |name|
          name.match?(VERSION_DIR_PATTERN) && name[1..-1].to_i.positive?
        end
        raise 'No version directories found!' if version_directories.empty?

        # Sort numerically; the name is a tie-breaker for e.g. 'v1' vs 'v01'.
        version_directories.sort_by { |name| [name[1..-1].to_i, name] }
      end

      # Given an object_root_directory, deduce the format used to describe version directories.
      # The lowest-sorting version directory must be version 1; the number of
      # digits in its name decides between 'v%d' and a zero-padded 'v%0Nd'.
      # @param [String] object_root_dir path to the object root.
      # @return [String] a format string such as 'v%d' or 'v%04d'.
      # @raise [RuntimeError] if the root does not exist, holds no version
      #   directories, or its first version directory is not version 1.
      def self.get_version_format(object_root_dir)
        raise 'Directory does not exist!' unless Dir.exist?(object_root_dir)

        root = object_root_dir.to_s
        version_dirs = Dir.glob('v*', base: root).select do |name|
          name.match?(VERSION_DIR_PATTERN) && File.directory?(File.join(root, name))
        end
        raise "No version directories found in #{object_root_dir}!" if version_dirs.empty?

        first_version = version_dirs.min # lowest-sorting name should be the first version.
        digits = first_version[1..-1]    # drop the leading 'v'.
        unless digits.to_i == 1
          raise "#{object_root_dir}/#{first_version} is not the first version directory!"
        end

        digits.length == 1 ? 'v%d' : "v%0#{digits.length}d"
      end

      # Given a [Hash] of digests and [ filepaths ], flip & expand to unique Filepath => digest.
      # @param [Hash] digest_hash hash of digest => [ filepaths ].
      # @return [Hash] of filepath => digest, built from a deep copy of the input.
      # @raise [RuntimeError] if digest_hash is not a Hash.
      def self.invert_and_expand(digest_hash)
        raise 'This only works on Hashes, buck-o' unless digest_hash.is_a?(Hash)

        OcflTools::Utils.deep_copy(digest_hash).each_with_object({}) do |(digest, filepaths), inverted|
          filepaths.each { |filepath| inverted[filepath] = digest }
        end
      end

      # Given a hash of digest => [ Filepaths ], invert and expand, then prepend a string to all filepaths.
      # @param [Hash] digest_hash hash of digest => [ filepaths ].
      # @param [String] prepend_string path prefix to put in front of each filepath.
      # @return [Hash] of prefixed filepath => digest.
      # @raise [RuntimeError] if digest_hash is not a Hash.
      def self.invert_and_expand_and_prepend(digest_hash, prepend_string)
        raise 'This only works on Hashes, buck-o' unless digest_hash.is_a?(Hash)

        invert_and_expand(digest_hash).each_with_object({}) do |(file, digest), prefixed|
          prefixed[expand_filepaths(file, prepend_string).first] = digest
        end
      end

      # Given an array of files and a digestAlgorithm, create digests and return results in a [Hash].
      # @param [Array, String] files one filepath or an array of filepaths.
      # @param [String] digestAlgorithm name of the digest algorithm to use.
      # @return [Hash] of filepath => digest.
      def self.create_digests(files, digestAlgorithm)
        # Array() lets a single file be handled the same way as a list.
        Array(files).each_with_object({}) do |file, digests|
          digests[file] = OcflTools::Utils.generate_file_digest(file, digestAlgorithm)
        end
      end

      # Given an array of (relative to object root) filepaths, expand to fully-resolvable filesystem paths.
      # If the object_root_dir is already at the front of a filepath, that path is returned unchanged
      # rather than being prefixed again.
      # @param [Array, String] files one filepath or an array of filepaths.
      # @param [String] object_root_dir path to put in front of each filepath.
      # @return [Array] of expanded filepaths, one per input path and in the same order.
      def self.expand_filepaths(files, object_root_dir)
        root = object_root_dir.to_s
        Array(files).map do |file|
          path = file.to_s
          # Plain prefix comparison: the root is a path, not a regex pattern.
          path.start_with?(root) ? path : "#{root}/#{path}"
        end
      end

      # Given an object root dir, get the most recent inventory file.
      # Tries the most recent version dir first, then the object root, then
      # the remaining version dirs from newest to oldest.
      # @param [String] object_root_dir path to the object root.
      # @return [String] path of the first inventory.json found.
      # @raise [RuntimeError] if no inventory.json exists in any of those locations.
      def self.get_latest_inventory(object_root_dir)
        newest, *older = get_version_directories(object_root_dir).reverse
        candidates = ["#{object_root_dir}/#{newest}/inventory.json", "#{object_root_dir}/inventory.json"]
        candidates.concat(older.map { |version| "#{object_root_dir}/#{version}/inventory.json" })

        found = candidates.find { |candidate| File.exist?(candidate) }
        raise "No inventory file found in #{object_root_dir}!" if found.nil?

        found
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
module OcflTools
  module Utils
    # A module of convenience methods for reading information from an OCFL inventory.json file.
    # Every method here loads the inventory with {OcflTools::OcflInventory}#from_file and
    # reads the requested attribute from the resulting object.
    module Inventory
      # The inventory header keys that {get_value} will return a value for.
      HEADER_KEYS = %w[contentDirectory digestAlgorithm head type id].freeze

      # Given an inventory file and a key to search for, return the value at that key.
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @param [String] key the JSON key in the inventory file that you want to return a value for.
      # @return [String, nil] the value of the requested key, or nil if not found.
      # @raise [RuntimeError] if key is not one of {HEADER_KEYS}.
      def self.get_value(inventory_file, key)
        raise "#{key} is not a valid OCFL inventory header key" unless HEADER_KEYS.include?(key)

        # Each permitted key is also the name of the reader on the inventory,
        # and key has just been checked against that fixed list.
        OcflTools::OcflInventory.new.from_file(inventory_file).public_send(key)
      end

      # Given an inventory file, return the value of contentDirectory IF FOUND, or 'content' if contentDirectory is not set.
      # It explicitly does NOT use the config.content_directory setting for this check.
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @return [String] the value of contentDirectory in the inventory, if found, or the default value of 'content'.
      def self.get_contentDirectory(inventory_file)
        content_directory = get_value(inventory_file, 'contentDirectory')
        content_directory.nil? ? 'content' : content_directory
      end

      # Given an inventory file, return the name of the digest algorithm used (e.g. 'sha512').
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @return [String] the string value describing the digest algorithm used in this inventory.
      # @raise [RuntimeError] if the inventory has no digestAlgorithm value.
      def self.get_digestAlgorithm(inventory_file)
        digest_algorithm = get_value(inventory_file, 'digestAlgorithm')
        # A missing digestAlgorithm is actually against the OCFL spec.
        raise "Unable to find value for digestAlgorithm in #{inventory_file}" if digest_algorithm.nil?

        digest_algorithm
      end

      # Given an inventory file, return the fixity block (if it exists) or nil.
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @return [Hash, nil] the fixity block from the provided inventory.json, or nil if the inventory does not contain a fixity block.
      def self.get_fixity(inventory_file)
        load_fixity(inventory_file)
      end

      # Given an inventory file, return [Array] of the digest types found in the fixity block, or nil.
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @return [Array, nil] an array of [String] values, with each value being a digest type found in the fixity block, e.g. 'sha1', 'md5', etc, or nil if no fixity block is found.
      def self.get_fixity_digestAlgorithms(inventory_file)
        fixity = load_fixity(inventory_file)
        fixity.nil? ? nil : fixity.keys
      end

      # Given an inventory file and a digestAlgorithm, return [Hash] of digests and [ filepaths ], or nil.
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @param [String] digestAlgorithm the algorithm used in the fixity block that you want digests for.
      # @return [Hash, nil] a hash of digests and filepaths from the fixity block for the given digest type, or nil if the inventory.json does not contain a fixity block.
      def self.get_fixity_digests(inventory_file, digestAlgorithm)
        fixity = load_fixity(inventory_file)
        fixity.nil? ? nil : fixity[digestAlgorithm]
      end

      # Loads the inventory and returns its fixity block, or nil when the block
      # is missing or empty. Shared by the three fixity readers above.
      # @param [Pathname] inventory_file fully-qualified path to a valid OCFL inventory.json.
      # @return [Hash, nil] the non-empty fixity block, or nil.
      def self.load_fixity(inventory_file)
        fixity = OcflTools::OcflInventory.new.from_file(inventory_file).fixity
        fixity unless fixity.nil? || fixity.empty?
      end
      private_class_method :load_fixity

      # TODO: Given an inventory & version, return files from that version.

      # TODO: Given an inventory and 2 versions, return all files for range of versions.
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
Gem::Specification.new do |spec|
  # Identity. The version is read from the VERSION file in the current directory.
  spec.name = 'ocfl-tools'
  spec.version = File.read('VERSION').strip
  spec.licenses = ['Apache-2.0']
  spec.homepage = 'https://github.com/sul-dlss-labs/OCFL-Tools'

  # Authorship.
  spec.authors = ['Julian M. Morley']
  spec.email = ['jmorley@stanford.edu']

  # Summary and description currently carry the same text.
  spec.summary = 'Tools to create, manipulate and write Oxford Common File Layout (OCFL) preservation objects.'
  spec.description = 'Tools to create, manipulate and write Oxford Common File Layout (OCFL) preservation objects.'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.require_paths = ['lib']
  # spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
  # spec.executables = spec.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  # spec.test_files = spec.files.grep(%r{^spec/})
  # spec.metadata["yard.run"] = "yri" # use "yard" to build full HTML docs.

  # Runtime dependencies.
  spec.add_runtime_dependency 'anyway_config', '~> 1.0'
  spec.add_runtime_dependency 'fileutils', '~> 1.3'
  spec.add_runtime_dependency 'json', '~> 2.2', '>= 2.2.0'

  # Development dependencies; pry-byebug is left out when the CI environment variable is set.
  spec.add_development_dependency 'pry-byebug' unless ENV['CI']
  %w[rake rspec rubocop rubocop-rspec].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end
end