moab-versioning 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/lib/moab.rb +59 -0
  3. data/lib/moab/bagger.rb +289 -0
  4. data/lib/moab/config.rb +21 -0
  5. data/lib/moab/exceptions.rb +18 -0
  6. data/lib/moab/file_group.rb +244 -0
  7. data/lib/moab/file_group_difference.rb +336 -0
  8. data/lib/moab/file_group_difference_subset.rb +45 -0
  9. data/lib/moab/file_instance.rb +82 -0
  10. data/lib/moab/file_instance_difference.rb +54 -0
  11. data/lib/moab/file_inventory.rb +279 -0
  12. data/lib/moab/file_inventory_difference.rb +132 -0
  13. data/lib/moab/file_manifestation.rb +85 -0
  14. data/lib/moab/file_signature.rb +200 -0
  15. data/lib/moab/signature_catalog.rb +195 -0
  16. data/lib/moab/signature_catalog_entry.rb +61 -0
  17. data/lib/moab/storage_object.rb +220 -0
  18. data/lib/moab/storage_object_version.rb +333 -0
  19. data/lib/moab/storage_repository.rb +57 -0
  20. data/lib/moab/storage_services.rb +104 -0
  21. data/lib/moab/verification_result.rb +83 -0
  22. data/lib/moab/version_metadata.rb +38 -0
  23. data/lib/moab/version_metadata_entry.rb +64 -0
  24. data/lib/moab/version_metadata_event.rb +47 -0
  25. data/lib/moab_stanford.rb +18 -0
  26. data/lib/monkey_patches.rb +65 -0
  27. data/lib/serializer.rb +36 -0
  28. data/lib/serializer/manifest.rb +76 -0
  29. data/lib/serializer/serializable.rb +178 -0
  30. data/lib/stanford/active_fedora_object.rb +34 -0
  31. data/lib/stanford/content_inventory.rb +236 -0
  32. data/lib/stanford/dor_metadata.rb +49 -0
  33. data/lib/stanford/storage_repository.rb +46 -0
  34. data/lib/stanford/storage_services.rb +66 -0
  35. data/lib/tasks/yard.rake +34 -0
  36. data/lib/tools/api_doc_generator.rb +396 -0
  37. data/lib/tools/spec_generator.rb +410 -0
  38. data/lib/tools/spec_generator_old.rb +49 -0
  39. metadata +252 -0
@@ -0,0 +1,85 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A container for a file signature and all the physical file instances that have that signature
6
+ # This element has one child {FileSignature} element, and one or more {FileInstance} elements
7
+ # Regarding the class name, see
8
+ # * {http://en.wikipedia.org/wiki/Functional_Requirements_for_Bibliographic_Records}
9
+ # * {http://planets-project.eu/events/copenhagen-2009/pre-reading/docs/Modelling%20Organizational%20Preservation%20Goals_Angela%20Dappert.pdf}
10
+ #
11
+ # ====Data Model
12
+ # * {FileInventory} = container for recording information about a collection of related files
13
+ # * {FileGroup} [1..*] = subset allow segregation of content and metadata files.
14
+ # * <b>{FileManifestation} [1..*] = snapshot of a file's filesystem characteristics</b>
15
+ # * {FileSignature} [1] = file fixity information
16
+ # * {FileInstance} [1..*] = filepath and timestamp of any physical file having that signature
17
+ #
18
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
19
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
20
class FileManifestation < Serializable
  include HappyMapper

  # XML element name under which this object is serialized
  tag 'file'

  # (see Serializable#initialize)
  # The instance list is set to an empty array before the options are passed to the superclass.
  def initialize(opts={})
    @instances = []
    super(opts)
  end

  # @attribute
  # @return [FileSignature] The fixity data of the file instance
  element :signature, FileSignature, :tag => 'fileSignature'

  # @return [FileSignature] The stored signature. If the stored value is an Array,
  #   its first element is returned instead of the array itself.
  def signature
    return @signature[0] if @signature.is_a?(Array)
    @signature
  end

  # @param signature [FileSignature, Array<FileSignature>] The signature to store.
  #   When an Array is given, only its first element is kept.
  # @return [void]
  def signature=(signature)
    @signature = if signature.is_a?(Array)
                   signature[0]
                 else
                   signature
                 end
  end

  # @attribute
  # @return [Array<FileInstance>] The location(s) of the file manifestation's file instances
  has_many :instances, FileInstance, :tag => 'fileInstance'

  # @api internal
  # @return [Array<String>] The path of every child {FileInstance}, in instance order
  def paths
    instances.map { |instance| instance.path }
  end

  # @api internal
  # @return [Integer] The number of {FileInstance} objects in this manifestation
  #   (i.e. the number of files that share this manifestation's signature)
  def file_count
    instances.size
  end

  # @api internal
  # @return [Integer] The total size (in bytes) of all files that share this manifestation's signature,
  #   computed as instance count times the signature's size
  def byte_count
    copies = file_count.to_i
    copies * signature.size.to_i
  end

  # @api internal
  # @return [Integer] The total disk usage (in 1 kB blocks) of all files that share this manifestation's signature
  #   (estimating du -k result)
  def block_count
    block_size = 1024
    # Integer division after adding (block_size - 1) rounds a single file up to whole blocks
    blocks_per_instance = (signature.size.to_i + block_size - 1) / block_size
    file_count * blocks_per_instance
  end

  # @api internal
  # @param other [FileManifestation] The {FileManifestation} object to compare with self
  # @return [Boolean] True if both objects have equal signatures and equal instance lists
  def ==(other)
    (signature == other.signature) && (instances == other.instances)
  end

end
84
+
85
+ end
@@ -0,0 +1,200 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # The fixity properties of a file, used to determine file content equivalence regardless of filename.
6
+ # Placing this data in a class by itself facilitates using file size together with the MD5 and SHA1 checksums
7
+ # as a single key when doing comparisons against other file instances. The Moab design assumes that this file signature
8
+ # is sufficiently unique to act as a comparator for determining file equality and eliminating file redundancy.
9
+ #
10
+ # The use of signatures for a compare-by-hash mechanism introduces a minuscule (but non-zero) risk
11
+ # that two non-identical files will have the same checksum. While this risk is only about 1 in 10^48
12
+ # when using the SHA1 checksum alone, it can be reduced even further (to about 1 in 10^86)
13
+ # if we use the MD5 and SHA1 checksums together. And we gain a bit more comfort by including a comparison of file sizes.
14
+ #
15
+ # Finally, the "collision" risk is reduced by isolation of each digital object's file pool within an object folder,
16
+ # instead of in a common storage area shared by the whole repository.
17
+ #
18
+ # ====Data Model
19
+ # * {FileInventory} = container for recording information about a collection of related files
20
+ # * {FileGroup} [1..*] = subset allow segregation of content and metadata files
21
+ # * {FileManifestation} [1..*] = snapshot of a file's filesystem characteristics
22
+ # * <b>{FileSignature} [1] = file fixity information</b>
23
+ # * {FileInstance} [1..*] = filepath and timestamp of any physical file having that signature
24
+ #
25
+ # * {SignatureCatalog} = lookup table containing a cumulative collection of all files ever ingested
26
+ # * {SignatureCatalogEntry} [1..*] = a row in the lookup table containing storage information about a single file
27
+ # * <b>{FileSignature} [1] = file fixity information</b>
28
+ #
29
+ # * {FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames
30
+ # * {FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects
31
+ # * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a given change type
32
+ # * {FileInstanceDifference} [1..*] = contains difference information at the file level
33
+ # * <b>{FileSignature} [1..2] = contains the file signature(s) of two file instances being compared</b>
34
+ #
35
+ # @see http://searchstorage.techtarget.com/feature/The-skinny-on-data-deduplication
36
+ # @see http://www.ibm.com/developerworks/wikis/download/attachments/106987789/TSMDataDeduplication.pdf
37
+ # @see https://www.redlegg.com/pdf_file/3_1320410927_HowDataDedupeWorks_WP_100809.pdf
38
+ # @see http://www.library.yale.edu/iac/DPC/AN_DPC_FixityChecksFinal11.pdf
39
+ #
40
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
41
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
42
class FileSignature < Serializable

  include HappyMapper

  # The name of the XML element used to serialize this object's data
  tag 'fileSignature'

  # (see Serializable#initialize)
  def initialize(opts={})
    super(opts)
  end

  # @attribute
  # @return [Integer] The size of the file in bytes
  attribute :size, Integer, :on_save => Proc.new { |n| n.to_s }

  # @attribute
  # @return [String] The MD5 checksum value of the file
  attribute :md5, String, :on_save => Proc.new { |n| n.nil? ? "" : n.to_s }

  # @attribute
  # @return [String] The SHA1 checksum value of the file
  attribute :sha1, String, :on_save => Proc.new { |n| n.nil? ? "" : n.to_s }

  # @attribute
  # @return [String] The SHA256 checksum value of the file
  attribute :sha256, String, :on_save => Proc.new { |n| n.nil? ? "" : n.to_s }

  # @param type [Symbol,String] The type of checksum (md5, sha1 or sha256; matched case-insensitively)
  # @param value [String] The checksum value
  # @return [void] Set the value of the specified checksum type
  # @raise [RuntimeError] if the type is not one of the three supported checksum types
  def set_checksum(type,value)
    case type.to_s.downcase.to_sym
    when :md5
      @md5 = value
    when :sha1
      @sha1 = value
    when :sha256
      @sha256 = value
    else
      raise "Unknown checksum type '#{type.to_s}'"
    end
  end

  # @return [Hash<Symbol,String>] A hash of the checksum data, in md5/sha1/sha256 order,
  #   omitting any checksum that is nil or empty
  def checksums
    checksum_hash = OrderedHash.new
    checksum_hash[:md5] = @md5
    checksum_hash[:sha1] = @sha1
    checksum_hash[:sha256] = @sha256
    checksum_hash.delete_if { |_key, value| value.nil? || value.empty? }
    checksum_hash
  end

  # @return [Boolean] The signature contains all of the 3 desired checksums
  def complete?
    checksums.size == 3
  end

  # @api internal
  # @return [Hash<Symbol,String>] A hash of fixity data from this signature object:
  #   the size (as a string) followed by the non-empty checksums
  def fixity
    fixity_hash = OrderedHash.new
    fixity_hash[:size] = @size.to_s
    fixity_hash.merge!(checksums)
    fixity_hash
  end

  # @api internal
  # @param other [FileSignature] The other file signature being compared to this signature
  # @return [Boolean] Returns true if self and other have comparable fixity data:
  #   equal sizes, at least one checksum type in common, and equal values for every common type.
  #   Returns false (rather than raising) when other does not look like a signature, e.g. nil.
  def eql?(other)
    # Equality must never raise: objects lacking the signature interface are simply unequal
    return false unless other.respond_to?(:size) && other.respond_to?(:checksums)
    return false if self.size.to_i != other.size.to_i
    self_checksums = self.checksums
    other_checksums = other.checksums
    # Only checksum types present on both sides can be compared
    matching_keys = self_checksums.keys & other_checksums.keys
    return false if matching_keys.size == 0
    matching_keys.each do |key|
      return false if self_checksums[key] != other_checksums[key]
    end
    true
  end

  # @api internal
  # (see #eql?)
  def ==(other)
    eql?(other)
  end

  # @api internal
  # @return [Fixnum] Compute a hash-code from the file size.
  #   Two file instances with the same content will have the same hash code (and will compare using eql?).
  # @note The hash and eql? methods override the methods inherited from Object.
  #   These methods ensure that instances of this class can be used as Hash keys.  See
  #   * {http://www.paulbutcher.com/2007/10/navigating-the-equality-maze/}
  #   * {http://techbot.me/2011/05/ruby-basics-equality-operators-ruby/}
  #   Also overridden is {#==} so that equality tests in other contexts will also return the expected result.
  def hash
    # Only the size is used: eql? accepts signatures that carry different subsets of checksums,
    # so the size is the one field that objects considered equal are guaranteed to share.
    @size.to_i
  end

  # @api internal
  # @param pathname [Pathname] The location of the file to be digested
  # @return [FileSignature] This object, with its size and all three checksums set from the physical file
  def signature_from_file(pathname)
    @size = pathname.size
    md5_digest = Digest::MD5.new
    sha1_digest = Digest::SHA1.new
    sha256_digest = Digest::SHA2.new(256)
    # Binary mode: checksums must be computed over the raw bytes, with no
    # newline translation or text-mode end-of-file handling on any platform
    pathname.open("rb") do |stream|
      # Read in 8 kB chunks so large files are not loaded into memory at once
      while (buffer = stream.read(8192))
        md5_digest.update(buffer)
        sha1_digest.update(buffer)
        sha256_digest.update(buffer)
      end
    end
    @md5 = md5_digest.hexdigest
    @sha1 = sha1_digest.hexdigest
    @sha256 = sha256_digest.hexdigest
    self
  end

  # @api internal
  # @param pathname [Pathname] The location of the file whose full signature will be returned
  # @return [FileSignature] The full signature derived from the file, unless the fixity is inconsistent with current values
  # @raise [RuntimeError] if the signature computed from the file is not eql? to this signature
  def normalized_signature(pathname)
    sig_from_file = FileSignature.new.signature_from_file(pathname)
    if self.eql?(sig_from_file)
      # The full signature from file is consistent with current values
      return sig_from_file
    else
      # One or more of the fixity values is inconsistent, so raise an exception
      # NOTE(review): #diff is not defined in this class; presumably inherited from Serializable -- confirm
      raise "Signature inconsistent between inventory and file for #{pathname}: #{self.diff(sig_from_file).inspect}"
    end
  end

  # @return [Hash<Symbol,Array<String>>] Key is type (e.g. :sha1), value is checksum names (e.g. ['SHA-1', 'SHA1'])
  def self.checksum_names_for_type
    names_for_type = OrderedHash.new
    names_for_type[:md5] = ['MD5']
    names_for_type[:sha1] = ['SHA-1', 'SHA1']
    names_for_type[:sha256] = ['SHA-256', 'SHA256']
    names_for_type
  end

  # @return [Hash<String, Symbol>] Key is checksum name (e.g. MD5), value is checksum type (e.g. :md5)
  def self.checksum_type_for_name
    type_for_name = OrderedHash.new
    # Invert checksum_names_for_type: every alias name maps back to its type
    self.checksum_names_for_type.each do |type, names|
      names.each do |name|
        type_for_name[name] = type
      end
    end
    type_for_name
  end

end
199
+
200
+ end
@@ -0,0 +1,195 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A digital object's Signature Catalog is derived from a filtered aggregation of the file inventories
6
+ # of a digital object's set of versions. (see {#update})
7
+ # It has an entry for every file (identified by {FileSignature}) found in any of the versions,
8
+ # along with a record of the SDR storage location that was used to preserve a single file instance.
9
+ # Once this catalog has been populated, it has multiple uses:
10
+ # * The signature index is used to determine which files of a newly submitted object version
11
+ # are new additions and which are duplicates of files previously ingested. (See {#version_additions})
12
+ # (When a new version contains a mixture of added files and files carried over from the previous version
13
+ # we only need to store the files from the new version that have unique file signatures.)
14
+ # * Reconstruction of an object version (see {StorageObject#reconstruct_version}) requires a combination
15
+ # of a full version's {FileInventory} and the SignatureCatalog.
16
+ # * The catalog can also be used for performing consistency checks between manifest files and storage
17
+ #
18
+ # ====Data Model
19
+ # * <b>{SignatureCatalog} = lookup table containing a cumulative collection of all files ever ingested</b>
20
+ # * {SignatureCatalogEntry} [1..*] = a row in the lookup table containing storage information about a single file
21
+ # * {FileSignature} [1] = file fixity information
22
+ #
23
+ # @example {include:file:spec/fixtures/derivatives/manifests/v3/signatureCatalog.xml}
24
+ # @see StorageObject
25
+ # @see Bagger
26
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
27
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
28
class SignatureCatalog < Manifest
  include HappyMapper

  # XML element name under which this object is serialized
  tag 'signatureCatalog'

  # (see Serializable#initialize)
  # The entry list and the signature index are created empty before the options
  # are passed to the superclass.
  def initialize(opts={})
    @entries = []
    @signature_hash = OrderedHash.new
    super(opts)
  end

  # @attribute
  # @return [String] The object ID (druid)
  attribute :digital_object_id, String, :tag => 'objectId'

  # @attribute
  # @return [Integer] The ordinal version number
  attribute :version_id, Integer, :tag => 'versionId', :key => true, :on_save => Proc.new {|n| n.to_s}

  # @return [String] The unique identifier concatenating digital object id with version id:
  #   the object id, a hyphen, then StorageObject.version_dirname of the version id
  def composite_key
    prefix = @digital_object_id + '-'
    prefix + StorageObject.version_dirname(@version_id)
  end

  # @attribute
  # @return [Time] The datetime at which the catalog was updated
  attribute :catalog_datetime, Time, :tag => 'catalogDatetime', :on_save => Proc.new {|t| t.to_s}

  # Stores the given value after passing it through Time.input
  # NOTE(review): Time.input / Time.output are not core Ruby; presumably project extensions -- confirm
  def catalog_datetime=(datetime)
    @catalog_datetime = Time.input(datetime)
  end

  # Returns the stored datetime after passing it through Time.output
  def catalog_datetime
    Time.output(@catalog_datetime)
  end

  # @attribute
  # @return [Integer] The total number of data files (dynamically calculated)
  attribute :file_count, Integer, :tag => 'fileCount', :on_save => Proc.new {|t| t.to_s}

  # Computed from the current entries rather than read from a stored value
  def file_count
    entries.size
  end

  # @attribute
  # @return [Integer] The total size (in bytes) of all data files (dynamically calculated)
  attribute :byte_count, Integer, :tag => 'byteCount', :on_save => Proc.new {|t| t.to_s}

  # Sums the signature size of every entry
  def byte_count
    total = 0
    entries.each { |entry| total += entry.signature.size.to_i }
    total
  end

  # @attribute
  # @return [Integer] The total disk usage (in 1 kB blocks) of all data files (estimating du -k result) (dynamically calculated)
  attribute :block_count, Integer, :tag => 'blockCount', :on_save => Proc.new {|t| t.to_s}

  # Sums, over all entries, the signature size rounded up to whole 1024-byte blocks
  def block_count
    block_size = 1024
    total = 0
    entries.each do |entry|
      total += (entry.signature.size.to_i + block_size - 1) / block_size
    end
    total
  end

  # @return [Array<String>] The data fields to include in summary reports
  def summary_fields
    ['digital_object_id', 'version_id', 'catalog_datetime', 'file_count', 'byte_count', 'block_count']
  end

  # @attribute
  # @return [Array<SignatureCatalogEntry>] The catalog's entries
  has_many :entries, SignatureCatalogEntry, :tag => 'entry'

  # @param entry_array [Array<SignatureCatalogEntry>] Entries to add, one by one, via {#add_entry}
  #   (each is appended to the existing entries and indexed by signature)
  def entries=(entry_array)
    entry_array.each { |entry| add_entry(entry) }
  end

  # @return [OrderedHash] An index having {FileSignature} objects as keys and {SignatureCatalogEntry} objects as values
  attr_accessor :signature_hash

  # @api internal
  # @param entry [SignatureCatalogEntry] The new catalog entry
  # @return [void] Add a new entry to the catalog and to the {#signature_hash} index
  def add_entry(entry)
    @signature_hash[entry.signature] = entry
    entries << entry
  end

  # @param [FileSignature] file_signature The signature of the file whose path is sought
  # @return [String] The object-relative path of the file having the specified signature
  # @raise [FileNotFoundException] if the signature is not present in the {#signature_hash} index
  def catalog_filepath(file_signature)
    found = @signature_hash[file_signature]
    if found.nil?
      raise FileNotFoundException, "catalog entry not found for #{file_signature.fixity.inspect} in #{@digital_object_id} - #{@version_id}"
    end
    found.storage_path
  end

  # @param group [FileGroup] A group of the files from a file inventory
  # @param group_pathname [Pathname] The location of the directory containing the group's files
  # @return [void] Inspect and upgrade the group's signature data to include all desired checksums
  # @raise [RuntimeError] if a group_pathname is given but does not exist
  def normalize_group_signatures(group, group_pathname=nil)
    group_dir = group_pathname.nil? ? nil : Pathname(group_pathname)
    raise "Could not locate #{group_dir}" if group_dir && !group_dir.exist?
    group.files.each do |file|
      next if file.signature.complete?
      if @signature_hash.has_key?(file.signature)
        # Replace the partial signature with the matching key object held by the catalog index
        file.signature = @signature_hash.find { |key, _entry| key == file.signature }[0]
      elsif group_dir
        # Not in the catalog: derive the full signature from the file's first instance on disk
        file_pathname = group_dir.join(file.instances[0].path)
        file.signature = file.signature.normalized_signature(file_pathname)
      end
    end
  end

  # @api external
  # @param version_inventory [FileInventory] The complete inventory of the files comprising a digital object version
  # @param data_pathname [Pathname] The location of the object's data directory
  # @return [void] Compares the {FileSignature} entries in the new version's {FileInventory} against the signatures
  #   in this catalog and creates new {SignatureCatalogEntry} additions to the catalog.
  #   Afterwards the catalog's version id is set from the inventory and its datetime to the current time.
  # @example {include:file:spec/features/catalog/catalog_update_spec.rb}
  def update(version_inventory, data_pathname)
    version_inventory.groups.each do |group|
      group.files.each do |file|
        # Signatures already in the catalog need no new entry
        next if @signature_hash.has_key?(file.signature)
        entry = SignatureCatalogEntry.new
        entry.version_id = version_inventory.version_id
        entry.group_id = group.group_id
        entry.path = file.instances[0].path
        entry.signature =
          if file.signature.complete?
            file.signature
          else
            # Incomplete signature: compute the full one from the file under the group's data directory
            file.signature.normalized_signature(data_pathname.join(group.group_id, entry.path))
          end
        add_entry(entry)
      end
    end
    @version_id = version_inventory.version_id
    @catalog_datetime = Time.now
  end

  # @api external
  # @param version_inventory (see #update)
  # @return [FileInventory] Returns a filtered copy of the input inventory
  #   containing only those files that were added in this version
  #   (files whose signature is not yet in the catalog); groups with no such files are left out
  # @example {include:file:spec/features/catalog/version_additions_spec.rb}
  def version_additions(version_inventory)
    additions = FileInventory.new(:type=>'additions')
    additions.copy_ids(version_inventory)
    version_inventory.groups.each do |group|
      added_group = FileGroup.new(:group_id => group.group_id)
      group.files.each do |file|
        next if @signature_hash.has_key?(file.signature)
        added_group.add_file_instance(file.signature, file.instances[0])
      end
      additions.groups << added_group if added_group.files.size > 0
    end
    additions
  end

end
194
+
195
+ end