ocfl-tools 0.9.14

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,13 @@
# frozen_string_literal: true

# Rake entry point for the gem: loads the Bundler gem tasks and defines
# the RSpec and RuboCop tasks, then wires them into two aggregate tasks.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'rubocop/rake_task'

RSpec::Core::RakeTask.new(:spec)
RuboCop::RakeTask.new

# A bare `rake` runs the specs only.
task default: [:spec]

# `rake all` runs the specs and then RuboCop.
task all: [:spec, :rubocop]
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.9.14
@@ -0,0 +1,56 @@
# A simple example to demonstrate the relationship between logical content in an OCFL object
# and the fully-resolved path to those binaries on the local storage system.
#
# Options:
#   -d, --dir DIRECTORY      (required) a directory containing an OCFL object
#   -v, --version VERSION    (optional) version number to report on; defaults to the object's head

require 'ocfl-tools'
require 'optparse'

options = {}

parser = OptionParser.new do |cli|
  cli.on('-d DIRECTORY', '--dir DIRECTORY', 'A directory containing an OCFL object') do |dir|
    raise "#{dir} is not a valid directory path." unless Dir.exist?(dir)

    options[:object_root] = dir
  end

  cli.on('-v VERSION', '--version VERSION', 'An optional version number') do |ver|
    options[:version] = ver.to_i
  end
end

parser.parse(ARGV)

# The directory option is mandatory; bail out if it was never supplied.
object_root = options[:object_root]
raise OptionParser::MissingArgument if object_root.nil?

# The inventory we're working on might not conform to the site default version format.
# Inspect the object root to determine what version format we should use, and use it.
OcflTools.config.version_format = OcflTools::Utils::Files.get_version_format(object_root)

# Load the most recent inventory found in the object root into an inventory object.
inventory_file = OcflTools::Utils::Files.get_latest_inventory(object_root)
ocfl_object = OcflTools::OcflInventory.new.from_file(inventory_file)

# Use the requested version when one was given, otherwise fall back to the object's head.
version = options[:version] || OcflTools::Utils.version_string_to_int(ocfl_object.head)

local_files = ocfl_object.get_files(version)

# Prepend the object root path to each content path to get fully-resolvable files.
local_files.each_pair do |logical_path, content_path|
  local_files[logical_path] = object_root + '/' + content_path
end

# Output a pretty result, for demo purposes.
local_files.each_pair do |logical_path, resolved_path|
  puts " #{logical_path} => #{resolved_path}"
end
@@ -0,0 +1,23 @@
# Usage: ruby ./validate_object.rb -d /path/to/directory/to/check
#
# Validates the OCFL object found at the given directory and prints the results.
require 'ocfl-tools'
require 'optparse'

options = {}

parser = OptionParser.new do |cli|
  cli.on('-d DIRECTORY', '--dir DIRECTORY', 'A directory containing an OCFL object') do |dir|
    raise "#{dir} is not a valid directory path." unless Dir.exist?(dir)

    options[:object_root] = dir
  end
end

parser.parse(ARGV)

# The directory option is mandatory; bail out if it was never supplied.
object_root = options[:object_root]
raise OptionParser::MissingArgument if object_root.nil?

validator = OcflTools::OcflValidator.new(object_root)
validator.validate_ocfl_object_root.print
data/lib/ocfl-tools.rb ADDED
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ # OcflTools is a module that provides a distinctive namespace for classes that create,
4
+ # maintain and read Oxford Common File Layout preservation objects.
5
+ #
6
+ # ====Data Model
7
+ #
8
+ # * <b>{OcflObject} = an object that models the internal data structures of an OCFL manifest.</b>
9
+ # * {OcflInventory} = An I/O interface for {OcflObject} allowing the reading and creation of OCFL inventory.json files.
10
+ #
11
+ # @note Copyright (c) 2019 by The Board of Trustees of the Leland Stanford Junior University.
12
+
13
+ require 'ocfl_tools'
14
+ require 'json'
15
+ require 'anyway'
16
+ require 'fileutils'
17
+ require 'digest'
18
+ require 'time' # for iso8601 checking.
19
+ require 'uri' # for, well, uri testing.
data/lib/ocfl_tools.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ module OcflTools
4
+ require 'ocfl_tools/ocfl_object'
5
+ require 'ocfl_tools/ocfl_inventory'
6
+ require 'ocfl_tools/ocfl_verify'
7
+ require 'ocfl_tools/ocfl_deposit'
8
+ require 'ocfl_tools/ocfl_validator'
9
+ require 'ocfl_tools/ocfl_results'
10
+ require 'ocfl_tools/ocfl_delta'
11
+ require 'ocfl_tools/ocfl_actions'
12
+ require 'ocfl_tools/ocfl_errors'
13
+ require 'ocfl_tools/config'
14
+ require 'ocfl_tools/utils'
15
+ require 'ocfl_tools/utils_file'
16
+ require 'ocfl_tools/utils_inventory'
17
+ end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'anyway'
4
+
5
module OcflTools
  # Site-wide configuration settings for OCFL-Tools, using the 'anyway' gem.
  #
  # Settings and their default values:
  #   version_format:    'v%04d'
  #   content_type:      'https://ocfl.io/1.0/spec/#inventory'
  #   content_directory: 'content'
  #   digest_algorithm:  'sha512'
  #   fixity_algorithms: ['md5', 'sha1', 'sha256'] (site-specific allowable fixity algorithms)
  #   ocfl_version:      '1.0'
  class Config < Anyway::Config
    attr_config(
      version_format: 'v%04d',
      content_type: 'https://ocfl.io/1.0/spec/#inventory',
      content_directory: 'content',
      digest_algorithm: 'sha512',
      fixity_algorithms: %w[md5 sha1 sha256],
      ocfl_version: '1.0'
    )
  end

  class << self
    # Returns the module-wide Config instance, creating it on first use.
    # @return [OcflTools::Config]
    def config
      @config ||= Config.new
    end
  end
end
@@ -0,0 +1,146 @@
1
+ # frozen_string_literal: true
2
+
3
module OcflTools
  # Class for collating manifest actions, both for delta reporting and staging new versions.
  #
  # Actions are held in a Hash keyed by action name ('update_manifest', 'add', 'update',
  # 'copy', 'move', 'delete' and, once used, 'fixity'). Each file action maps a digest to
  # an Array of unique filepaths; 'fixity' maps an algorithm name to a Hash of
  # digest => fixity digest.
  class OcflActions
    def initialize
      @my_actions = {}
      %w[update_manifest add update copy move delete].each do |action|
        @my_actions[action] = {}
      end
    end

    # Convenience method for obtaining a hash of recorded actions.
    # Action categories with nothing recorded are pruned from the stored hash first.
    # @return [Hash] of actions stored in this instance.
    def actions
      # Don't return empty keys. Hash#delete_if returns the (pruned) receiver.
      @my_actions.delete_if { |_action, recorded| recorded == {} }
    end

    # Convenience method for obtaining a hash of recorded actions; same as {#actions}.
    # @return [Hash] of actions stored in this instance.
    def all
      actions
    end

    # Creates an 'update_manifest' entry in the actions hash.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under 'update_manifest'.
    def update_manifest(digest, filepath)
      record('update_manifest', digest, filepath)
    end

    # Creates an 'add' entry in the actions hash.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under 'add'.
    def add(digest, filepath)
      record('add', digest, filepath)
    end

    # Creates an 'update' entry in the actions hash.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under 'update'.
    def update(digest, filepath)
      record('update', digest, filepath)
    end

    # Creates a 'copy' entry in the actions hash.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under 'copy'.
    def copy(digest, filepath)
      record('copy', digest, filepath)
    end

    # Creates a 'move' entry in the actions hash.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under 'move'.
    def move(digest, filepath)
      record('move', digest, filepath)
    end

    # Creates a 'delete' entry in the actions hash.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under 'delete'.
    def delete(digest, filepath)
      record('delete', digest, filepath)
    end

    # Associates an additional fixity digest with a digest. The first value recorded for a
    # given algorithm and digest wins; later calls for the same pair do not overwrite it.
    # @param [String] digest of the filepath that is getting additional fixity values.
    # @param [String] fixity_algorithm of the fixity digest being added (e.g. 'md5', 'sha1').
    # @param [String] fixity_digest to associate with this digest.
    # @return [String] the fixity digest stored for this algorithm and digest.
    def fixity(digest, fixity_algorithm, fixity_digest)
      # Only create the 'fixity' key (and the per-algorithm hash) if used.
      by_algorithm = ((@my_actions['fixity'] ||= {})[fixity_algorithm] ||= {})
      # Only add unique fixity digests.
      by_algorithm[digest] = fixity_digest unless by_algorithm.key?(digest)
      by_algorithm[digest]
    end

    private

    # Records filepath against digest under the named action, ignoring duplicates.
    # The action category is (re)created on demand because {#actions} and {#all} remove
    # empty categories from the stored hash; without this, recording after reading
    # would fail on a missing category.
    # @param [String] action name of the category to record under.
    # @param [String] digest of the filepath being recorded.
    # @param [Pathname] filepath of file to record.
    # @return [Array] of unique filepaths recorded for this digest under the action.
    def record(action, digest, filepath)
      filepaths = ((@my_actions[action] ||= {})[digest] ||= [])
      filepaths << filepath unless filepaths.include?(filepath)
      filepaths
    end
  end
end
@@ -0,0 +1,250 @@
1
+ # frozen_string_literal: true
2
+
3
module OcflTools
  # Given an inventory, show changes from previous versions.
  # OcflDelta takes in an OCFL Inventory object and creates a delta hash containing
  # the actions performed to assemble the requested version.
  class OcflDelta
    # @return [Hash] deltas computed so far, keyed by version string (e.g. 'v0002').
    attr_reader :delta

    # @param [Object] ocfl_object an inventory-like object. It must define the instance
    #   variables @id, @head, @manifest, @versions and @fixity and respond to get_state,
    #   version_id_list and get_digest; versions, manifest and contentDirectory are also
    #   called on it by this class.
    # @raise [RuntimeError] if one of the checked instance variables or methods is missing.
    def initialize(ocfl_object)
      # Duck sanity check.
      ['@id', '@head', '@manifest', '@versions', '@fixity'].each do |var|
        next if ocfl_object.instance_variable_defined?(var)

        raise "Object #{ocfl_object} does not have instance var #{var} defined"
      end

      %w[get_state version_id_list get_digest].each do |mthd|
        next if ocfl_object.respond_to?(mthd)

        raise "Object #{ocfl_object} does not respond to #{mthd}"
      end

      @ocfl_object = ocfl_object
      @delta = {}
      # We need to get version format, for final report-out. Assume that the ocfl_object versions are
      # formatted correctly (starting with a 'v'). We can't trust the site config setting
      # for this, as there's no guarantee the inventory we are reading in was created at this site.
      first_version = @ocfl_object.versions.keys.min # should get us 'v0001' or 'v1'
      sliced_version = first_version.split('v')[1] # cut the leading 'v' from the string.
      # A length of 1 for the first version implies 'v1' (no zero padding).
      @version_format = sliced_version.length == 1 ? 'v%d' : "v%0#{sliced_version.length}d"
    end

    # Generates a complete delta hash for all versions of this object.
    # @return [Hash] of version string => actions hash, for every version in version_id_list.
    def all
      @ocfl_object.version_id_list.each do |version|
        get_version_delta(version)
      end
      @delta
    end

    # Given a version, get the delta from the previous version.
    # @param [Integer] version of object to get deltas for.
    # @return [Hash] of actions applied to previous version to create current version.
    # @raise [RuntimeError] if version is not 1 and is not listed in the object's version_id_list.
    def previous(version)
      return get_first_version_delta if version == 1

      # Verify the version exists before trying to diff it.
      unless @ocfl_object.version_id_list.include?(version)
        raise "Version #{version} not found in #{@ocfl_object}!"
      end

      get_version_delta(version)
    end

    private

    # Computes, stores and returns the delta between version and the version before it.
    # Any previously stored delta for that version is replaced.
    # @param [Integer] version to compare against its predecessor.
    # @return [Hash] of recorded actions for this version.
    def get_version_delta(version)
      return get_first_version_delta unless version > 1

      # State hashes are digest => [filepaths]; the inverted forms are used here
      # as filepath => digest lookups.
      current_digests = @ocfl_object.get_state(version)
      current_files = OcflTools::Utils::Files.invert_and_expand(current_digests)

      previous_digests = @ocfl_object.get_state(version - 1)
      previous_files = OcflTools::Utils::Files.invert_and_expand(previous_digests)

      missing_digests = pick(previous_digests, previous_digests.keys - current_digests.keys)
      missing_files = pick(previous_files, previous_files.keys - current_files.keys)
      new_digests = pick(current_digests, current_digests.keys - previous_digests.keys)
      new_files = pick(current_files, current_files.keys - previous_files.keys)
      # Digests may not have changed, but their filepaths can!
      unchanged_digests = pick(current_digests, current_digests.keys - new_digests.keys)

      actions = OcflTools::OcflActions.new
      record_new_digests(actions, version, new_digests, new_files, current_files)
      record_unchanged_digests(actions, unchanged_digests, previous_digests, missing_files, new_files)
      record_missing_digests(actions, missing_digests, current_files)

      @delta[@version_format % version.to_i] = actions.all
    end

    # Returns a new hash holding only the given keys of source, in the order given.
    def pick(source, keys)
      keys.each_with_object({}) { |key, picked| picked[key] = source[key] }
    end

    # Records ADD (new digest at a new filepath) and UPDATE (new digest at an existing
    # filepath) actions, plus the matching manifest updates.
    def record_new_digests(actions, version, new_digests, new_files, current_files)
      new_digests.each do |digest, filepaths|
        filepaths.each do |file|
          if new_files.key?(file) && new_files[file] == digest
            # 1. New digest, new file: it's an ADD (and therefore not also an UPDATE).
            actions.add(digest, file)
            update_manifest_action(digest, version, actions)
          elsif current_files.key?(file) && current_files[file] == digest
            # 2. New digest, existing file: it's an UPDATE.
            actions.update(digest, file)
            update_manifest_action(digest, version, actions)
          end
        end
      end
    end

    # Records COPY, MOVE and DELETE actions for digests present in both versions,
    # decided by how the number of filepaths for the digest changed.
    def record_unchanged_digests(actions, unchanged_digests, previous_digests, missing_files, new_files)
      unchanged_digests.each do |digest, filepaths|
        previous_filepaths = previous_digests[digest]

        if filepaths.size > previous_filepaths.size
          # 3. COPY is unchanged digest, additional (new) filepath.
          (filepaths - previous_filepaths).each do |copy_file|
            actions.copy(digest, copy_file)
          end
        elsif filepaths.size == previous_filepaths.size
          # 4. MOVE is unchanged digest, 1 deleted filepath, 1 added filepath.
          # For it to be a move, this digest must be listed in missing_files AND new_files.
          next unless missing_files.value?(digest) && new_files.value?(digest)

          # Diff the two path lists rather than taking element 0 of each: when a digest
          # has several filepaths, element 0 may be a path that did not move at all.
          # NOTE(review): only the first moved pair is recorded, as before; several
          # simultaneous moves of one digest are not all reported.
          old_filename = (previous_filepaths - filepaths).first
          new_filename = (filepaths - previous_filepaths).first
          if old_filename && new_filename
            actions.move(digest, old_filename)
            actions.move(digest, new_filename)
          end
        else
          # 5. One possible DELETE is unchanged digest, fewer filepaths: every path the
          # digest had before but no longer has is recorded as deleted.
          (previous_filepaths - filepaths).each do |delete_me|
            actions.delete(digest, delete_me)
          end
        end
      end
    end

    # 6. DELETE of last filepath is where there's a missing digest && the filepath is gone too.
    def record_missing_digests(actions, missing_digests, current_files)
      missing_digests.each do |digest, filepaths|
        # Skip filepaths still referenced in the current version's files.
        filepaths.each do |filepath|
          actions.delete(digest, filepath) unless current_files.key?(filepath)
        end
      end
    end

    # Records an 'update_manifest' action for every manifest content path of digest that
    # lives under this version's content directory, i.e. bitstreams that were added to
    # this version directory. The "<version>/<contentDirectory>/" prefix is trimmed.
    # @param [String] digest to look up in the object's manifest.
    # @param [Integer] version whose directory the content path must start with.
    # @param [OcflTools::OcflActions] action collector to record into.
    def update_manifest_action(digest, version, action)
      version_string = @version_format % version.to_i
      prefix = "#{version_string}/#{@ocfl_object.contentDirectory}/"
      # Literal prefix comparison: nothing in the manifest is mutated, and no character
      # in the version or directory name is interpreted as a regular expression.
      @ocfl_object.manifest[digest].each do |content_path|
        next unless content_path.start_with?(prefix)

        action.update_manifest(digest, content_path[prefix.length..-1])
      end
    end

    # Computes, stores and returns the delta for version 1, where everything in the
    # version's state is an 'add'. Any previously stored delta for version 1 is replaced.
    # NOTE(review): the original carried a trailing remark that everything in fixity is
    # also an 'add', but no fixity entries are recorded here.
    # @return [Hash] of recorded actions for version 1.
    def get_first_version_delta
      version = 1
      actions = OcflTools::OcflActions.new

      @ocfl_object.get_state(version).each do |digest, filepaths|
        filepaths.each do |file|
          actions.add(digest, file)
          update_manifest_action(digest, version, actions)
        end
      end

      @delta[@version_format % version] = actions.all
    end
  end
end