moab-versioning 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/lib/moab.rb +59 -0
  3. data/lib/moab/bagger.rb +289 -0
  4. data/lib/moab/config.rb +21 -0
  5. data/lib/moab/exceptions.rb +18 -0
  6. data/lib/moab/file_group.rb +244 -0
  7. data/lib/moab/file_group_difference.rb +336 -0
  8. data/lib/moab/file_group_difference_subset.rb +45 -0
  9. data/lib/moab/file_instance.rb +82 -0
  10. data/lib/moab/file_instance_difference.rb +54 -0
  11. data/lib/moab/file_inventory.rb +279 -0
  12. data/lib/moab/file_inventory_difference.rb +132 -0
  13. data/lib/moab/file_manifestation.rb +85 -0
  14. data/lib/moab/file_signature.rb +200 -0
  15. data/lib/moab/signature_catalog.rb +195 -0
  16. data/lib/moab/signature_catalog_entry.rb +61 -0
  17. data/lib/moab/storage_object.rb +220 -0
  18. data/lib/moab/storage_object_version.rb +333 -0
  19. data/lib/moab/storage_repository.rb +57 -0
  20. data/lib/moab/storage_services.rb +104 -0
  21. data/lib/moab/verification_result.rb +83 -0
  22. data/lib/moab/version_metadata.rb +38 -0
  23. data/lib/moab/version_metadata_entry.rb +64 -0
  24. data/lib/moab/version_metadata_event.rb +47 -0
  25. data/lib/moab_stanford.rb +18 -0
  26. data/lib/monkey_patches.rb +65 -0
  27. data/lib/serializer.rb +36 -0
  28. data/lib/serializer/manifest.rb +76 -0
  29. data/lib/serializer/serializable.rb +178 -0
  30. data/lib/stanford/active_fedora_object.rb +34 -0
  31. data/lib/stanford/content_inventory.rb +236 -0
  32. data/lib/stanford/dor_metadata.rb +49 -0
  33. data/lib/stanford/storage_repository.rb +46 -0
  34. data/lib/stanford/storage_services.rb +66 -0
  35. data/lib/tasks/yard.rake +34 -0
  36. data/lib/tools/api_doc_generator.rb +396 -0
  37. data/lib/tools/spec_generator.rb +410 -0
  38. data/lib/tools/spec_generator_old.rb +49 -0
  39. metadata +252 -0
@@ -0,0 +1,54 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A container for recording difference information at the file level
6
+ # * If there was no change, the change type is set to <i>identical</i>
7
+ # * If the signature is unchanged, but the path has moved, the change type is set to <i>renamed</i>
8
+ # * If path is unchanged, but the signature has changed, the change type is set to <i>modified</i> and both signatures are reported
9
+ # * If the signature and path are only in the basis inventory, the change type is set to <i>deleted</i>
10
+ # * If the signature and path are only in the other inventory, the change type is set to <i>added</i>
11
+ # This is a child element of {FileGroupDifferenceSubset}, which is in turn a descendent of {FileInventoryDifference},
12
+ # the documentation of which contains a full example
13
+ #
14
+ # ====Data Model
15
+ # * {FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames
16
+ # * {FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects
17
+ # * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a given change type
18
+ # * <b>{FileInstanceDifference} [1..*] = contains difference information at the file level</b>
19
+ # * {FileSignature} [1..2] = contains the file signature(s) of two file instances being compared
20
+ #
21
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
22
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
23
class FileInstanceDifference < Serializable

  include HappyMapper

  # XML element name under which an instance of this class is serialized
  tag 'file'

  # Starts with an empty signature list, then hands the options to the superclass.
  # (see Serializable#initialize)
  def initialize(opts={})
    @signatures = []
    super(opts)
  end

  # @attribute
  # @return [String] The kind of change recorded for this file
  attribute :change, String

  # @attribute
  # @return [String] Path of the file within the basis inventory (usually the older version)
  attribute :basis_path, String, :tag => 'basisPath', :on_save => Proc.new { |path| path.to_s }

  # @attribute
  # @return [String] Path of the file within the other inventory (usually a newer version) that is compared against the basis
  attribute :other_path, String, :tag => 'otherPath', :on_save => Proc.new { |path| path.to_s }

  # @attribute
  # @return [Array<FileSignature>] Fixity data of the file manifestation(s); more than one when the content was modified
  has_many :signatures, FileSignature, :tag => 'fileSignature'

end
53
+
54
+ end
@@ -0,0 +1,279 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A structured container for recording information about a collection of related files.
6
+ #
7
+ # The <b>scope</b> of the file collection depends on inventory type:
8
+ # * <i>version</i> = full set of data files comprising a digital object's version
9
+ # * <i>additions</i> = subset of data files that were newly added in the specified version
10
+ # * <i>manifests</i> = the fixity data for manifest files in the version's root folder
11
+ # * <i>directory</i> = set of files that were harvested from a filesystem directory
12
+ #
13
+ # The inventory contains one or more {FileGroup} subsets, which are most commonly used
14
+ # to provide segregation of digital object version's <i>content</i> and <i>metadata</i> files.
15
+ # Each group contains one or more {FileManifestation} entities,
16
+ # each of which represents a point-in-time snapshot of a given file's filesystem characteristics.
17
+ # The fixity data for a file is stored in a {FileSignature} element,
18
+ # while the filename and modification data are stored in one or more {FileInstance} elements.
19
+ # (Copies of a given file may be present in multiple locations in a collection)
20
+ #
21
+ # ====Data Model
22
+ # * <b>{FileInventory} = container for recording information about a collection of related files</b>
23
+ # * {FileGroup} [1..*] = subsets allowing segregation of content and metadata files
24
+ # * {FileManifestation} [1..*] = snapshot of a file's filesystem characteristics
25
+ # * {FileSignature} [1] = file fixity information
26
+ # * {FileInstance} [1..*] = filepath and timestamp of any physical file having that signature
27
+ #
28
+ # @example {include:file:spec/fixtures/derivatives/manifests/v3/versionInventory.xml}
29
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
30
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
31
class FileInventory < Manifest

  include HappyMapper

  # The name of the XML element used to serialize this object's data
  tag 'fileInventory'

  # Starts with an empty group list and the current time as the inventory timestamp,
  # then hands the options to the superclass.
  # (see Serializable#initialize)
  def initialize(opts={})
    @groups = []
    @inventory_datetime = Time.now
    super(opts)
  end

  # @attribute
  # @return [String] The type of inventory (version|additions|manifests|directory)
  attribute :type, String

  # @attribute
  # @return [String] The digital object identifier (druid)
  attribute :digital_object_id, String, :tag => 'objectId'

  # @attribute
  # @return [Integer] The ordinal version number
  attribute :version_id, Integer, :tag => 'versionId', :key => true, :on_save => Proc.new {|n| n.to_s}

  # @return [String] The unique identifier concatenating digital object id with version id
  def composite_key
    @digital_object_id + '-' + StorageObject.version_dirname(@version_id)
  end

  # @attribute
  # @return [Time] The datetime at which the inventory was created
  attribute :inventory_datetime, Time, :tag => 'inventoryDatetime', :on_save => Proc.new {|t| t.to_s}

  # Overrides the generated writer so that the stored value is whatever Time.input returns.
  # @param datetime [Object] the value passed to Time.input (a helper defined outside this class)
  def inventory_datetime=(datetime)
    @inventory_datetime=Time.input(datetime)
  end

  # Overrides the generated reader so that the stored value is passed through Time.output.
  # @return [Object] the result of Time.output (a helper defined outside this class)
  def inventory_datetime
    Time.output(@inventory_datetime)
  end

  # @attribute
  # @return [Integer] The total number of data files in the inventory (dynamically calculated)
  attribute :file_count, Integer, :tag => 'fileCount', :on_save => Proc.new {|t| t.to_s}

  # Overrides the generated reader: the value is always the sum over all groups.
  def file_count
    groups.inject(0) { |sum, group| sum + group.file_count }
  end

  # @attribute
  # @return [Integer] The total size (in bytes) of all files in the inventory (dynamically calculated)
  attribute :byte_count, Integer, :tag => 'byteCount', :on_save => Proc.new {|t| t.to_s}

  # Overrides the generated reader: the value is always the sum over all groups.
  def byte_count
    groups.inject(0) { |sum, group| sum + group.byte_count }
  end

  # @attribute
  # @return [Integer] The total disk usage (in 1 kB blocks) of all data files (estimating du -k result) (dynamically calculated)
  attribute :block_count, Integer, :tag => 'blockCount', :on_save => Proc.new {|t| t.to_s}

  # Overrides the generated reader: the value is always the sum over all groups.
  def block_count
    groups.inject(0) { |sum, group| sum + group.block_count }
  end

  # @attribute
  # @return [Array<FileGroup>] The set of data groups comprising the version
  has_many :groups, FileGroup, :tag => 'fileGroup'

  # @return [Array<FileGroup>] The set of data groups that contain files
  def non_empty_groups
    @groups.reject { |group| group.files.empty? }
  end

  # @param non_empty [Boolean] if true, return group_id's only for groups having files
  # @return [Array<String>] group identifiers contained in this file inventory
  def group_ids(non_empty=nil)
    selected_groups = non_empty ? non_empty_groups : @groups
    selected_groups.map { |group| group.group_id }
  end

  # @param [String] group_id The identifier of the group to be selected
  # @return [FileGroup] The file group in this inventory for the specified group_id (nil if there is none)
  def group(group_id)
    @groups.find { |group| group.group_id == group_id }
  end

  # @param group_id [String] File group identifier (e.g. data, metadata, manifests)
  # @return [Boolean] true if the group is missing or empty
  def group_empty?(group_id)
    file_group = group(group_id)
    file_group.nil? || file_group.files.empty?
  end

  # @return [Array<String>] The data fields to include in summary reports
  def summary_fields
    %w{type digital_object_id version_id inventory_datetime file_count byte_count block_count groups}
  end

  # @param [String] group_id The identifier of the group to be selected
  # @param [String] file_id The group-relative path of the file (relative to the appropriate home directory)
  # @return [FileSignature] The signature of the specified file
  # @raise [FileNotFoundException] if the group is absent, or the group's path_hash has no entry for file_id
  def file_signature(group_id, file_id)
    file_group = group(group_id)
    raise FileNotFoundException, "group #{group_id} not found for #{@digital_object_id} - #{@version_id}" if file_group.nil?
    signature = file_group.path_hash[file_id]
    raise FileNotFoundException, "#{group_id} file #{file_id} not found for #{@digital_object_id} - #{@version_id}" if signature.nil?
    signature
  end

  # @api internal
  # @param other [FileInventory] another instance of this class from which to clone identity values
  # @return [void] Copy objectId, versionId and inventory datetime values from another class instance into this instance
  def copy_ids(other)
    @digital_object_id = other.digital_object_id
    @version_id = other.version_id
    @inventory_datetime = other.inventory_datetime
  end

  # @api internal
  # @return [String] Concatenation of the objectId and versionId values
  def package_id
    "#{@digital_object_id}-v#{@version_id}"
  end

  # @api internal
  # @return [String] Returns either the version ID (if inventory is a version manifest) or the name of the directory that was harvested to create the inventory
  def data_source
    # Each group reports its own source; they are joined with '|' into a single label
    group_sources = groups.map { |g| g.data_source.to_s }.join('|')
    if group_sources.start_with?('contentMetadata')
      version_id ? "v#{version_id}-#{group_sources}" : "new-#{group_sources}"
    else
      version_id ? "v#{version_id}" : group_sources
    end
  end

  # @api external
  # @param data_dir [Pathname,String] The location of files to be inventoried
  # @param group_id [String] if specified, is used to set the group ID of the FileGroup created from the directory
  #   if nil, then the directory is assumed to contain both content and metadata subdirectories
  # @return [FileInventory] Traverse a directory and return an inventory of the files it contains
  # @example {include:file:spec/features/inventory/harvest_inventory_spec.rb}
  def inventory_from_directory(data_dir,group_id=nil)
    if group_id
      @groups << FileGroup.new(:group_id=>group_id).group_from_directory(data_dir)
    else
      # The block variable gets its own name so it does not shadow the group_id argument
      ['content','metadata'].each do |default_group_id|
        @groups << FileGroup.new(:group_id=>default_group_id).group_from_directory(Pathname(data_dir).join(default_group_id))
      end
    end
    self
  end

  # @param bag_dir [Pathname,String] The location of the BagIt bag to be inventoried
  # @return [FileInventory] Traverse a BagIt bag's payload and return an inventory of the files it contains (using fixity from bag manifest files)
  def inventory_from_bagit_bag(bag_dir)
    bag_pathname = Pathname(bag_dir)
    signatures_from_bag = signatures_from_bagit_manifests(bag_pathname)
    # One FileGroup is created per child of the bag's 'data' directory, named after that child
    bag_pathname.join('data').children.each do |subdir|
      @groups << FileGroup.new(:group_id=>subdir.basename.to_s).group_from_bagit_subdir(subdir, signatures_from_bag)
    end
    self
  end

  # @param bag_pathname [Pathname] The location of the BagIt bag to be inventoried
  # @return [Hash<Pathname,FileSignature>] The fixity data present in the bag's manifest files
  def signatures_from_bagit_manifests(bag_pathname)
    # First access to a path creates its FileSignature, so checksums of every type land on one object
    signatures = OrderedHash.new { |hash,path| hash[path] = FileSignature.new }
    [:md5, :sha1, :sha256].each do |type|
      manifest_pathname = bag_pathname.join("manifest-#{type}.txt")
      next unless manifest_pathname.exist?
      manifest_pathname.each_line do |line|
        line.chomp!
        # Split into checksum and path at the first whitespace run, dropping any '*' that precedes the path
        checksum,data_path = line.split(/\s+\**/,2)
        next unless checksum && data_path
        signatures[bag_pathname.join(data_path)].set_checksum(type, checksum)
      end
    end
    # Sizes are not taken from the manifests; they are read from the files themselves
    signatures.each {|file_pathname,signature| signature.size = file_pathname.size}
    signatures
  end

  # @api internal
  # @return [String] The total size of the inventory expressed in B, KB, MB, GB or TB, depending on the magnitude of the value
  def human_size
    count = 0
    size = byte_count
    # Stop after four divisions so the unit never goes past TB
    while size >= 1024 && count < 4
      size /= 1024.0
      count += 1
    end
    if count == 0
      sprintf("%d B", size)
    else
      sprintf("%.2f %s", size, %w[B KB MB GB TB][count])
    end
  end

  # @api internal
  # @param type [String] Specifies the type of inventory, and thus the filename used for storage
  # @return [String] The standard name for the serialized inventory file of the given type
  # @raise [RuntimeError] if type is not one of version, additions, manifests or directory
  def self.xml_filename(type=nil)
    case type
    when "version"
      'versionInventory.xml'
    when "additions"
      'versionAdditions.xml'
    when "manifests"
      'manifestInventory.xml'
    when "directory"
      'directoryInventory.xml'
    else
      raise "unknown inventory type: #{type.to_s}"
    end
  end

  # @api external
  # @param parent_dir [Pathname,String] The parent directory in which the xml file is to be stored
  # @param type [String] The inventory type, which governs the filename used for serialization
  #   (this instance's own type is used when nil)
  # @return [void] write the {FileInventory} instance to a file
  # @example {include:file:spec/features/inventory/write_inventory_xml_spec.rb}
  def write_xml_file(parent_dir, type=nil)
    type = @type if type.nil?
    self.class.write_xml_file(self, parent_dir, type)
  end

end
278
+
279
+ end
@@ -0,0 +1,132 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # Compares two {FileInventory} instances based primarily on file signatures and secondarily on file pathnames.
6
+ # Although the usual use will be to compare the content of 2 different temporal versions of the same object,
7
+ # it can also be used to verify an inventory document against an inventory harvested from a directory.
8
+ # The report is subdivided into sections for each of the file groups that compose the inventories being compared.
9
+ #
10
+ # ====Data Model
11
+ # * <b>{FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames</b>
12
+ # * {FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects
13
+ # * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a given change type
14
+ # * {FileInstanceDifference} [1..*] = contains difference information at the file level
15
+ # * {FileSignature} [1..2] = contains the file signature(s) of two file instances being compared
16
+ #
17
+ # @example {include:file:spec/fixtures/derivatives/manifests/all/fileInventoryDifference.xml}
18
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
19
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
20
class FileInventoryDifference < Manifest

  include HappyMapper

  # The name of the XML element used to serialize this object's data
  tag 'fileInventoryDifference'

  # Starts with an empty list of group differences, then hands the options to the superclass.
  # (see Serializable#initialize)
  def initialize(opts={})
    @group_differences = []
    super(opts)
  end

  # @attribute
  # @return [String] The digital object ID (druid)
  attribute :digital_object_id, String, :tag => 'objectId'

  # @attribute
  # @return [Integer] the number of differences found between the two inventories that were compared (dynamically calculated)
  attribute :difference_count, Integer, :tag=> 'differenceCount',:on_save => Proc.new {|i| i.to_s}

  # Overrides the generated reader: the value is always the sum over all group differences.
  def difference_count
    @group_differences.inject(0) { |sum, group| sum + group.difference_count }
  end

  # @attribute
  # @return [String] Id information from the version inventory used as the basis for comparison
  attribute :basis, String

  # @attribute
  # @return [String] Id information about the version inventory compared to the basis
  attribute :other, String

  # @attribute
  # @return [Time] The datetime at which the report was run
  attribute :report_datetime, Time, :tag => 'reportDatetime', :on_save => Proc.new {|t| t.to_s}

  # Overrides the generated writer so that the stored value is whatever Time.input returns.
  # @param datetime [Object] the value passed to Time.input (a helper defined outside this class)
  def report_datetime=(datetime)
    @report_datetime=Time.input(datetime)
  end

  # Overrides the generated reader so that the stored value is passed through Time.output.
  # @return [Object] the result of Time.output (a helper defined outside this class)
  def report_datetime
    Time.output(@report_datetime)
  end

  # @attribute
  # @return [Array<FileGroupDifference>] The per-group difference reports, one for each group_id compared
  has_many :group_differences, FileGroupDifference, :tag => 'fileGroupDifference'

  # @return [Array<String>] The data fields to include in summary reports
  def summary_fields
    %w{digital_object_id difference_count basis other report_datetime group_differences}
  end

  # @param [String] group_id The identifier of the group to be selected
  # @return [FileGroupDifference] The subset of this report for the specified group_id (or nil if not found)
  def group_difference(group_id)
    @group_differences.find { |group_difference| group_difference.group_id == group_id }
  end

  # @api external
  # @param basis_inventory [FileInventory] The inventory that is the basis of the comparison
  # @param other_inventory [FileInventory] The inventory that is compared against the basis inventory
  # @return [FileInventoryDifference] Returns a report showing the differences, if any, between two inventories
  # @example {include:file:spec/features/differences/version_compare_spec.rb}
  def compare(basis_inventory, other_inventory)
    # Identification values that were already set (e.g. via the constructor options) are left untouched
    @digital_object_id ||= common_object_id(basis_inventory, other_inventory)
    @basis ||= basis_inventory.data_source
    @other ||= other_inventory.data_source
    @report_datetime = Time.now
    # get a union list of all group_ids present in either inventory
    all_group_ids = basis_inventory.group_ids | other_inventory.group_ids
    all_group_ids.each do |group_id|
      # get a pair of groups to compare, creating an empty group if not present in the inventory
      basis_group = basis_inventory.group(group_id) || FileGroup.new(:group_id => group_id)
      other_group = other_inventory.group(group_id) || FileGroup.new(:group_id => group_id)
      @group_differences << FileGroupDifference.new.compare_file_groups(basis_group, other_group)
    end
    self
  end

  # @api internal
  # @param (see #compare)
  # @return [String] Returns either the common digital object ID, or a concatenation of both inventories' IDs
  def common_object_id(basis_inventory, other_inventory)
    basis_id = basis_inventory.digital_object_id
    other_id = other_inventory.digital_object_id
    basis_id == other_id ? basis_id.to_s : "#{basis_id}|#{other_id}"
  end

  # @return [Hash] Serializes the data and then filters it to report only the changes
  def differences_detail
    # NOTE: an earlier short-circuit (`return self.summary if difference_count == 0`) is disabled,
    # so the filtered hash is produced even when no differences were found.
    inv_diff = to_hash
    inv_diff["group_differences"].each_value do |group_diff|
      # Collect the keys first so the subsets hash is not modified while it is being iterated
      removable_types = []
      group_diff["subsets"].each do |change_type,subset|
        removable_types << change_type if change_type == "identical" || subset["count"] == 0
      end
      removable_types.each do |change_type|
        group_diff["subsets"].delete(change_type)
        # The group-level entry named after the change type is dropped too, except for "identical"
        group_diff.delete(change_type) if change_type != "identical"
      end
      group_diff.delete("subsets") if group_diff["subsets"].empty?
    end
    inv_diff
  end

end
131
+
132
+ end