moab-versioning 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/lib/moab.rb +59 -0
  3. data/lib/moab/bagger.rb +289 -0
  4. data/lib/moab/config.rb +21 -0
  5. data/lib/moab/exceptions.rb +18 -0
  6. data/lib/moab/file_group.rb +244 -0
  7. data/lib/moab/file_group_difference.rb +336 -0
  8. data/lib/moab/file_group_difference_subset.rb +45 -0
  9. data/lib/moab/file_instance.rb +82 -0
  10. data/lib/moab/file_instance_difference.rb +54 -0
  11. data/lib/moab/file_inventory.rb +279 -0
  12. data/lib/moab/file_inventory_difference.rb +132 -0
  13. data/lib/moab/file_manifestation.rb +85 -0
  14. data/lib/moab/file_signature.rb +200 -0
  15. data/lib/moab/signature_catalog.rb +195 -0
  16. data/lib/moab/signature_catalog_entry.rb +61 -0
  17. data/lib/moab/storage_object.rb +220 -0
  18. data/lib/moab/storage_object_version.rb +333 -0
  19. data/lib/moab/storage_repository.rb +57 -0
  20. data/lib/moab/storage_services.rb +104 -0
  21. data/lib/moab/verification_result.rb +83 -0
  22. data/lib/moab/version_metadata.rb +38 -0
  23. data/lib/moab/version_metadata_entry.rb +64 -0
  24. data/lib/moab/version_metadata_event.rb +47 -0
  25. data/lib/moab_stanford.rb +18 -0
  26. data/lib/monkey_patches.rb +65 -0
  27. data/lib/serializer.rb +36 -0
  28. data/lib/serializer/manifest.rb +76 -0
  29. data/lib/serializer/serializable.rb +178 -0
  30. data/lib/stanford/active_fedora_object.rb +34 -0
  31. data/lib/stanford/content_inventory.rb +236 -0
  32. data/lib/stanford/dor_metadata.rb +49 -0
  33. data/lib/stanford/storage_repository.rb +46 -0
  34. data/lib/stanford/storage_services.rb +66 -0
  35. data/lib/tasks/yard.rake +34 -0
  36. data/lib/tools/api_doc_generator.rb +396 -0
  37. data/lib/tools/spec_generator.rb +410 -0
  38. data/lib/tools/spec_generator_old.rb +49 -0
  39. metadata +252 -0
@@ -0,0 +1,54 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A container for recording difference information at the file level
6
+ # * If there was no change, the change type is set to <i>identical</i>
7
+ # * If the signature is unchanged, but the path has moved, the change type is set to <i>renamed</i>
8
+ # * If path is unchanged, but the signature has changed, the change type is set to <i>modified</i> and both signatures are reported
9
+ # * If the signature and path are only in the basis inventory, the change type is set to <i>deleted</i>
10
+ # * If the signature and path are only in the other inventory, the change type is set to <i>added</i>
11
+ # This is a child element of {FileGroupDifferenceSubset}, which is in turn a descendant of {FileInventoryDifference},
12
+ # the documentation of which contains a full example
13
+ #
14
+ # ====Data Model
15
+ # * {FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames
16
+ # * {FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects
17
+ # * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a given change type
18
+ # * <b>{FileInstanceDifference} [1..*] = contains difference information at the file level</b>
19
+ # * {FileSignature} [1..2] = contains the file signature(s) of two file instances being compared
20
+ #
21
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
22
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
23
+ class FileInstanceDifference < Serializable
24
+
25
+ include HappyMapper
26
+
27
+ # The name of the XML element used to serialize this object's data
28
+ tag 'file'
29
+
30
+ # (see Serializable#initialize)
31
+ def initialize(opts={})
32
+ @signatures = Array.new
33
+ super(opts)
34
+ end
35
+
36
+ # @attribute
37
+ # @return [String] The type of file change
38
+ attribute :change, String
39
+
40
+ # @attribute
41
+ # @return [String] The file's path in the basis inventory (usually for an old version)
42
+ attribute :basis_path, String, :tag => 'basisPath', :on_save => Proc.new { |s| s.to_s }
43
+
44
+ # @attribute
45
+ # @return [String] The file's path in the other inventory (usually for a new version) compared against the basis
46
+ attribute :other_path, String, :tag => 'otherPath', :on_save => Proc.new { |s| s.to_s }
47
+
48
+ # @attribute
49
+ # @return [Array<FileSignature>] The fixity data of the file manifestation(s) (plural if change was a content modification)
50
+ has_many :signatures, FileSignature, :tag => 'fileSignature'
51
+
52
+ end
53
+
54
+ end
@@ -0,0 +1,279 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A structured container for recording information about a collection of related files.
6
+ #
7
+ # The <b>scope</b> of the file collection depends on inventory type:
8
+ # * <i>version</i> = full set of data files comprising a digital object's version
9
+ # * <i>additions</i> = subset of data files that were newly added in the specified version
10
+ # * <i>manifests</i> = the fixity data for manifest files in the version's root folder
11
+ # * <i>directory</i> = set of files that were harvested from a filesystem directory
12
+ #
13
+ # The inventory contains one or more {FileGroup} subsets, which are most commonly used
14
+ # to provide segregation of a digital object version's <i>content</i> and <i>metadata</i> files.
15
+ # Each group contains one or more {FileManifestation} entities,
16
+ # each of which represents a point-in-time snapshot of a given file's filesystem characteristics.
17
+ # The fixity data for a file is stored in a {FileSignature} element,
18
+ # while the filename and modification data are stored in one or more {FileInstance} elements.
19
+ # (Copies of a given file may be present in multiple locations in a collection)
20
+ #
21
+ # ====Data Model
22
+ # * <b>{FileInventory} = container for recording information about a collection of related files</b>
23
+ # * {FileGroup} [1..*] = subsets that allow segregation of content and metadata files
24
+ # * {FileManifestation} [1..*] = snapshot of a file's filesystem characteristics
25
+ # * {FileSignature} [1] = file fixity information
26
+ # * {FileInstance} [1..*] = filepath and timestamp of any physical file having that signature
27
+ #
28
+ # @example {include:file:spec/fixtures/derivatives/manifests/v3/versionInventory.xml}
29
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
30
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
31
+ class FileInventory < Manifest
32
+
33
+ include HappyMapper
34
+
35
+ # The name of the XML element used to serialize this object's data
36
+ tag 'fileInventory'
37
+
38
+ # (see Serializable#initialize)
39
+ def initialize(opts={})
40
+ @groups = Array.new
41
+ @inventory_datetime = Time.now
42
+ super(opts)
43
+ end
44
+
45
+ # @attribute
46
+ # @return [String] The type of inventory (version|additions|manifests|directory)
47
+ attribute :type, String
48
+
49
+ # @attribute
50
+ # @return [String] The digital object identifier (druid)
51
+ attribute :digital_object_id, String, :tag => 'objectId'
52
+
53
+ # @attribute
54
+ # @return [Integer] The ordinal version number
55
+ attribute :version_id, Integer, :tag => 'versionId', :key => true, :on_save => Proc.new {|n| n.to_s}
56
+
57
+ # @return [String] The unique identifier concatenating digital object id with version id
58
+ def composite_key
59
+ @digital_object_id + '-' + StorageObject.version_dirname(@version_id)
60
+ end
61
+
62
+ # @attribute
63
+ # @return [Time] The datetime at which the inventory was created
64
+ attribute :inventory_datetime, Time, :tag => 'inventoryDatetime', :on_save => Proc.new {|t| t.to_s}
65
+
66
+ def inventory_datetime=(datetime)
67
+ @inventory_datetime=Time.input(datetime)
68
+ end
69
+
70
+ def inventory_datetime
71
+ Time.output(@inventory_datetime)
72
+ end
73
+
74
+ # @attribute
75
+ # @return [Integer] The total number of data files in the inventory (dynamically calculated)
76
+ attribute :file_count, Integer, :tag => 'fileCount', :on_save => Proc.new {|t| t.to_s}
77
+
78
+ def file_count
79
+ groups.inject(0) { |sum, group| sum + group.file_count }
80
+ end
81
+
82
+ # @attribute
83
+ # @return [Integer] The total size (in bytes) of all files in the inventory (dynamically calculated)
84
+ attribute :byte_count, Integer, :tag => 'byteCount', :on_save => Proc.new {|t| t.to_s}
85
+
86
+ def byte_count
87
+ groups.inject(0) { |sum, group| sum + group.byte_count }
88
+ end
89
+
90
+ # @attribute
91
+ # @return [Integer] The total disk usage (in 1 kB blocks) of all data files (estimating du -k result) (dynamically calculated)
92
+ attribute :block_count, Integer, :tag => 'blockCount', :on_save => Proc.new {|t| t.to_s}
93
+
94
+ def block_count
95
+ groups.inject(0) { |sum, group| sum + group.block_count }
96
+ end
97
+
98
+ # @attribute
99
+ # @return [Array<FileGroup>] The set of data groups comprising the version
100
+ has_many :groups, FileGroup, :tag => 'fileGroup'
101
+
102
+ # @return [Array<FileGroup>] The set of data groups that contain files
103
+ def non_empty_groups
104
+ @groups.select{|group| !group.files.empty?}
105
+ end
106
+
107
+ # @param non_empty [Boolean] if true, return group_id's only for groups having files
108
+ # @return [Array<String>] group identifiers contained in this file inventory
109
+ def group_ids(non_empty=nil)
110
+ groups = non_empty ? self.non_empty_groups : @groups
111
+ groups.map{|group| group.group_id}
112
+ end
113
+
114
+ # @param [String] group_id The identifier of the group to be selected
115
+ # @return [FileGroup] The file group in this inventory for the specified group_id
116
+ def group(group_id)
117
+ @groups.find{ |group| group.group_id == group_id}
118
+ end
119
+
120
+ # @param group_id [String] File group identifier (e.g. data, metadata, manifests)
121
+ # @return [Boolean] true if the group is missing or empty
122
+ def group_empty?(group_id)
123
+ group = self.group(group_id)
124
+ group.nil? or group.files.empty?
125
+ end
126
+
127
+ # @return [Array<String>] The data fields to include in summary reports
128
+ def summary_fields
129
+ %w{type digital_object_id version_id inventory_datetime file_count byte_count block_count groups}
130
+ end
131
+
132
+ # @param [String] group_id The identifier of the group to be selected
133
+ # @param [String] file_id The group-relative path of the file (relative to the appropriate home directory)
134
+ # @return [FileSignature] The signature of the specified file
135
+ def file_signature(group_id, file_id)
136
+ file_group = group(group_id)
137
+ raise FileNotFoundException, "group #{group_id} not found for #{@digital_object_id} - #{@version_id}" if file_group.nil?
138
+ file_signature = file_group.path_hash[file_id]
139
+ raise FileNotFoundException, "#{group_id} file #{file_id} not found for #{@digital_object_id} - #{@version_id}" if file_signature.nil?
140
+ file_signature
141
+ end
142
+
143
+ # @api internal
144
+ # @param other [FileInventory] another instance of this class from which to clone identity values
145
+ # @return [void] Copy objectId, versionId and inventoryDatetime values from another class instance into this instance
146
+ def copy_ids(other)
147
+ @digital_object_id = other.digital_object_id
148
+ @version_id = other.version_id
149
+ @inventory_datetime = other.inventory_datetime
150
+ end
151
+
152
+ # @api internal
153
+ # @return [String] Concatenation of the objectId and versionId values
154
+ def package_id
155
+ "#{@digital_object_id}-v#{@version_id}"
156
+ end
157
+
158
+ # @api internal
159
+ # @return [String] Returns either the version ID (if inventory is a version manifest) or the name of the directory that was harvested to create the inventory
160
+ def data_source
161
+ data_source = (groups.collect { |g| g.data_source.to_s }).join('|')
162
+ if data_source.start_with?('contentMetadata')
163
+ if version_id
164
+ "v#{version_id.to_s}-#{data_source}"
165
+ else
166
+ "new-#{data_source}"
167
+ end
168
+ else
169
+ if version_id
170
+ "v#{version_id.to_s}"
171
+ else
172
+ data_source
173
+ end
174
+
175
+ end
176
+ end
177
+
178
+ # @api external
179
+ # @param data_dir [Pathname,String] The location of files to be inventoried
180
+ # @param group_id [String] if specified, is used to set the group ID of the FileGroup created from the directory
181
+ # if nil, then the directory is assumed to contain both content and metadata subdirectories
182
+ # @return [FileInventory] Traverse a directory and return an inventory of the files it contains
183
+ # @example {include:file:spec/features/inventory/harvest_inventory_spec.rb}
184
+ def inventory_from_directory(data_dir,group_id=nil)
185
+ if group_id
186
+ @groups << FileGroup.new(:group_id=>group_id).group_from_directory(data_dir)
187
+ else
188
+ ['content','metadata'].each do |group_id|
189
+ @groups << FileGroup.new(:group_id=>group_id).group_from_directory(Pathname(data_dir).join(group_id))
190
+ end
191
+ end
192
+ self
193
+ end
194
+
195
+ # @param bag_dir [Pathname,String] The location of the BagIt bag to be inventoried
196
+ # @return [FileInventory] Traverse a BagIt bag's payload and return an inventory of the files it contains (using fixity from bag manifest files)
197
+ def inventory_from_bagit_bag(bag_dir)
198
+ bag_pathname = Pathname(bag_dir)
199
+ signatures_from_bag = signatures_from_bagit_manifests(bag_pathname)
200
+ bag_data_subdirs = bag_pathname.join('data').children
201
+ bag_data_subdirs.each do |subdir|
202
+ @groups << FileGroup.new(:group_id=>subdir.basename.to_s).group_from_bagit_subdir(subdir, signatures_from_bag)
203
+ end
204
+ self
205
+ end
206
+
207
+ # @param bag_pathname [Pathname] The location of the BagIt bag to be inventoried
208
+ # @return [Hash<Pathname,FileSignature>] The fixity data present in the bag's manifest files
209
+ def signatures_from_bagit_manifests(bag_pathname)
210
+ manifest_pathname = Hash.new
211
+ checksum_types = [:md5, :sha1, :sha256]
212
+ checksum_types.each do |type|
213
+ manifest_pathname[type] = bag_pathname.join("manifest-#{type.to_s}.txt")
214
+ end
215
+ signatures = OrderedHash.new { |hash,path| hash[path] = FileSignature.new }
216
+ checksum_types.each do |type|
217
+ if manifest_pathname[type].exist?
218
+ manifest_pathname[type].each_line do |line|
219
+ line.chomp!
220
+ checksum,data_path = line.split(/\s+\**/,2)
221
+ if checksum && data_path
222
+ file_pathname = bag_pathname.join(data_path)
223
+ signature = signatures[file_pathname]
224
+ signature.set_checksum(type, checksum)
225
+ end
226
+ end
227
+ end
228
+ end
229
+ signatures.each {|file_pathname,signature| signature.size = file_pathname.size}
230
+ signatures
231
+ end
232
+
233
+ # @api internal
234
+ # @return [String] The total size of the inventory expressed in KB, MB, GB or TB, depending on the magnitude of the value
235
+ def human_size
236
+ count = 0
237
+ size = byte_count
238
+ while size >= 1024 and count < 4
239
+ size /= 1024.0
240
+ count += 1
241
+ end
242
+ if count == 0
243
+ sprintf("%d B", size)
244
+ else
245
+ sprintf("%.2f %s", size, %w[B KB MB GB TB][count])
246
+ end
247
+ end
248
+
249
+ # @api internal
250
+ # @param type [String] Specifies the type of inventory, and thus the filename used for storage
251
+ # @return [String] The standard name for the serialized inventory file of the given type
252
+ def self.xml_filename(type=nil)
253
+ case type
254
+ when "version"
255
+ 'versionInventory.xml'
256
+ when "additions"
257
+ 'versionAdditions.xml'
258
+ when "manifests"
259
+ 'manifestInventory.xml'
260
+ when "directory"
261
+ 'directoryInventory.xml'
262
+ else
263
+ raise "unknown inventory type: #{type.to_s}"
264
+ end
265
+ end
266
+
267
+ # @api external
268
+ # @param parent_dir [Pathname,String] The parent directory in which the xml file is to be stored
269
+ # @param type [String] The inventory type, which governs the filename used for serialization
270
+ # @return [void] write the {FileInventory} instance to a file
271
+ # @example {include:file:spec/features/inventory/write_inventory_xml_spec.rb}
272
+ def write_xml_file(parent_dir, type=nil)
273
+ type = @type if type.nil?
274
+ self.class.write_xml_file(self, parent_dir, type)
275
+ end
276
+
277
+ end
278
+
279
+ end
@@ -0,0 +1,132 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # Compares two {FileInventory} instances based primarily on file signatures and secondarily on file pathnames.
6
+ # Although the usual use will be to compare the content of 2 different temporal versions of the same object,
7
+ # it can also be used to verify an inventory document against an inventory harvested from a directory.
8
+ # The report is subdivided into sections for each of the file groups that compose the inventories being compared.
9
+ #
10
+ # ====Data Model
11
+ # * <b>{FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames</b>
12
+ # * {FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects
13
+ # * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a given change type
14
+ # * {FileInstanceDifference} [1..*] = contains difference information at the file level
15
+ # * {FileSignature} [1..2] = contains the file signature(s) of two file instances being compared
16
+ #
17
+ # @example {include:file:spec/fixtures/derivatives/manifests/all/fileInventoryDifference.xml}
18
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
19
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
20
+ class FileInventoryDifference < Manifest
21
+
22
+ include HappyMapper
23
+
24
+ # The name of the XML element used to serialize this object's data
25
+ tag 'fileInventoryDifference'
26
+
27
+ # (see Serializable#initialize)
28
+ def initialize(opts={})
29
+ @group_differences = Array.new
30
+ super(opts)
31
+ end
32
+
33
+ # @attribute
34
+ # @return [String] The digital object ID (druid)
35
+ attribute :digital_object_id, String, :tag => 'objectId'
36
+
37
+ # @attribute
38
+ # @return [Integer] the number of differences found between the two inventories that were compared (dynamically calculated)
39
+ attribute :difference_count, Integer, :tag=> 'differenceCount',:on_save => Proc.new {|i| i.to_s}
40
+
41
+ def difference_count
42
+ @group_differences.inject(0) { |sum, group| sum + group.difference_count }
43
+ end
44
+
45
+ # @attribute
46
+ # @return [String] Id information from the version inventory used as the basis for comparison
47
+ attribute :basis, String
48
+
49
+ # @attribute
50
+ # @return [String] Id information about the version inventory compared to the basis
51
+ attribute :other, String
52
+
53
+ # @attribute
54
+ # @return [Time] The datetime at which the report was run
55
+ attribute :report_datetime, Time, :tag => 'reportDatetime', :on_save => Proc.new {|t| t.to_s}
56
+
57
+ def report_datetime=(datetime)
58
+ @report_datetime=Time.input(datetime)
59
+ end
60
+
61
+ def report_datetime
62
+ Time.output(@report_datetime)
63
+ end
64
+
65
+ # @attribute
66
+ # @return [Array<FileGroupDifference>] The set of group-level difference reports, one for each file group compared
67
+ has_many :group_differences, FileGroupDifference, :tag => 'fileGroupDifference'
68
+
69
+ # @return [Array<String>] The data fields to include in summary reports
70
+ def summary_fields
71
+ %w{digital_object_id difference_count basis other report_datetime group_differences}
72
+ end
73
+
74
+ # @param [String] group_id The identifier of the group to be selected
75
+ # @return [FileGroupDifference] The subset of this report for the specified group_id (or nil if not found)
76
+ def group_difference(group_id)
77
+ @group_differences.find{ |group_difference| group_difference.group_id == group_id}
78
+ end
79
+
80
+ # @api external
81
+ # @param basis_inventory [FileInventory] The inventory that is the basis of the comparison
82
+ # @param other_inventory [FileInventory] The inventory that is compared against the basis inventory
83
+ # @return [FileInventoryDifference] Returns a report showing the differences, if any, between two inventories
84
+ # @example {include:file:spec/features/differences/version_compare_spec.rb}
85
+ def compare(basis_inventory, other_inventory)
86
+ @digital_object_id ||= common_object_id(basis_inventory, other_inventory)
87
+ @basis ||= basis_inventory.data_source
88
+ @other ||= other_inventory.data_source
89
+ @report_datetime = Time.now
90
+ # get a union list of all group_ids present in either inventory
91
+ group_ids = basis_inventory.group_ids | other_inventory.group_ids
92
+ group_ids.each do |group_id|
93
+ # get a pair of groups to compare, creating an empty group if not present in the inventory
94
+ basis_group = basis_inventory.group(group_id) || FileGroup.new(:group_id => group_id)
95
+ other_group = other_inventory.group(group_id) || FileGroup.new(:group_id => group_id)
96
+ @group_differences << FileGroupDifference.new.compare_file_groups(basis_group, other_group)
97
+ end
98
+ self
99
+ end
100
+
101
+ # @api internal
102
+ # @param (see #compare)
103
+ # @return [String] Returns either the common digital object ID, or a concatenation of both inventories' IDs
104
+ def common_object_id(basis_inventory, other_inventory)
105
+ if basis_inventory.digital_object_id != other_inventory.digital_object_id
106
+ "#{basis_inventory.digital_object_id.to_s}|#{other_inventory.digital_object_id.to_s}"
107
+ else
108
+ basis_inventory.digital_object_id.to_s
109
+ end
110
+ end
111
+
112
+ # @return [Hash] Serializes the data and then filters it to report only the changes
113
+ def differences_detail
114
+ #return self.summary if difference_count == 0
115
+ inv_diff = self.to_hash
116
+ inv_diff["group_differences"].each_value do |group_diff|
117
+ delete_subsets = []
118
+ group_diff["subsets"].each do |change_type,subset|
119
+ delete_subsets << change_type if change_type == "identical" or subset["count"] == 0
120
+ end
121
+ delete_subsets.each do |change_type|
122
+ group_diff["subsets"].delete(change_type)
123
+ group_diff.delete(change_type) if change_type != "identical"
124
+ end
125
+ group_diff.delete("subsets") if group_diff["subsets"].empty?
126
+ end
127
+ inv_diff
128
+ end
129
+
130
+ end
131
+
132
+ end