moab-versioning 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/lib/moab.rb +59 -0
  3. data/lib/moab/bagger.rb +289 -0
  4. data/lib/moab/config.rb +21 -0
  5. data/lib/moab/exceptions.rb +18 -0
  6. data/lib/moab/file_group.rb +244 -0
  7. data/lib/moab/file_group_difference.rb +336 -0
  8. data/lib/moab/file_group_difference_subset.rb +45 -0
  9. data/lib/moab/file_instance.rb +82 -0
  10. data/lib/moab/file_instance_difference.rb +54 -0
  11. data/lib/moab/file_inventory.rb +279 -0
  12. data/lib/moab/file_inventory_difference.rb +132 -0
  13. data/lib/moab/file_manifestation.rb +85 -0
  14. data/lib/moab/file_signature.rb +200 -0
  15. data/lib/moab/signature_catalog.rb +195 -0
  16. data/lib/moab/signature_catalog_entry.rb +61 -0
  17. data/lib/moab/storage_object.rb +220 -0
  18. data/lib/moab/storage_object_version.rb +333 -0
  19. data/lib/moab/storage_repository.rb +57 -0
  20. data/lib/moab/storage_services.rb +104 -0
  21. data/lib/moab/verification_result.rb +83 -0
  22. data/lib/moab/version_metadata.rb +38 -0
  23. data/lib/moab/version_metadata_entry.rb +64 -0
  24. data/lib/moab/version_metadata_event.rb +47 -0
  25. data/lib/moab_stanford.rb +18 -0
  26. data/lib/monkey_patches.rb +65 -0
  27. data/lib/serializer.rb +36 -0
  28. data/lib/serializer/manifest.rb +76 -0
  29. data/lib/serializer/serializable.rb +178 -0
  30. data/lib/stanford/active_fedora_object.rb +34 -0
  31. data/lib/stanford/content_inventory.rb +236 -0
  32. data/lib/stanford/dor_metadata.rb +49 -0
  33. data/lib/stanford/storage_repository.rb +46 -0
  34. data/lib/stanford/storage_services.rb +66 -0
  35. data/lib/tasks/yard.rake +34 -0
  36. data/lib/tools/api_doc_generator.rb +396 -0
  37. data/lib/tools/spec_generator.rb +410 -0
  38. data/lib/tools/spec_generator_old.rb +49 -0
  39. metadata +252 -0
@@ -0,0 +1,85 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A container for a file signature and all the physical file instances that have that signature
6
+ # This element has one child {FileSignature} element, and one or more {FileInstance} elements
7
+ # Regarding the class name, see
8
+ # * {http://en.wikipedia.org/wiki/Functional_Requirements_for_Bibliographic_Records}
9
+ # * {http://planets-project.eu/events/copenhagen-2009/pre-reading/docs/Modelling%20Organizational%20Preservation%20Goals_Angela%20Dappert.pdf}
10
+ #
11
+ # ====Data Model
12
+ # * {FileInventory} = container for recording information about a collection of related files
13
+ # * {FileGroup} [1..*] = subset allowing segregation of content and metadata files.
14
+ # * <b>{FileManifestation} [1..*] = snapshot of a file's filesystem characteristics</b>
15
+ # * {FileSignature} [1] = file fixity information
16
+ # * {FileInstance} [1..*] = filepath and timestamp of any physical file having that signature
17
+ #
18
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
19
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
20
class FileManifestation < Serializable
  include HappyMapper

  # The name of the XML element used to serialize this object's data
  tag 'file'

  # (see Serializable#initialize)
  def initialize(opts={})
    # Seed the instance list before delegating to the superclass initializer
    @instances = Array.new
    super(opts)
  end

  # @attribute
  # @return [FileSignature] The fixity data of the file instance
  element :signature, FileSignature, :tag => 'fileSignature'

  # @return [FileSignature] The stored signature; if the stored value is an Array, its first member
  def signature
    @signature.is_a?(Array) ? @signature[0] : @signature
  end

  # @param signature [FileSignature, Array<FileSignature>] The signature to store;
  #   when an Array is supplied only its first member is kept
  # @return [void]
  def signature=(signature)
    @signature = signature.is_a?(Array) ? signature[0] : signature
  end

  # @attribute
  # @return [Array<FileInstance>] The location(s) of the file manifestation's file instances
  has_many :instances, FileInstance, :tag => 'fileInstance'

  # @api internal
  # @return [Array<String>] Create an array from all the file paths of the child {FileInstance} objects
  def paths
    instances.collect { |instance| instance.path }
  end

  # @api internal
  # @return [Integer] The total number of {FileInstance} objects in this manifestation.
  #   (Number of files that share this manifestation's signature)
  def file_count
    instances.size
  end

  # @api internal
  # @return [Integer] The total size (in bytes) of all files that share this manifestation's signature
  def byte_count
    file_count.to_i * signature.size.to_i
  end

  # @api internal
  # @return [Integer] The total disk usage (in 1 kB blocks) of all files that share this manifestation's signature
  #   (estimating du -k result)
  def block_count
    block_size = 1024
    # Round each file's size up to a whole number of blocks before multiplying by the file count
    instance_blocks = (signature.size.to_i + block_size - 1) / block_size
    file_count * instance_blocks
  end

  # @api internal
  # @param other [FileManifestation] The {FileManifestation} object to compare with self
  # @return [Boolean] True if {FileManifestation} objects have same content;
  #   false when other is nil or otherwise cannot supply a signature and instances
  def ==(other)
    # An object of an incomparable type can never be equal; answer false instead of raising NoMethodError
    return false unless other.respond_to?(:signature) && other.respond_to?(:instances)
    (signature == other.signature) && (instances == other.instances)
  end

end
84
+
85
+ end
@@ -0,0 +1,200 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # The fixity properties of a file, used to determine file content equivalence regardless of filename.
6
+ # Placing this data in a class by itself facilitates using file size together with the MD5 and SHA1 checksums
7
+ # as a single key when doing comparisons against other file instances. The Moab design assumes that this file signature
8
+ # is sufficiently unique to act as a comparator for determining file equality and eliminating file redundancy.
9
+ #
10
+ # The use of signatures for a compare-by-hash mechanism introduces a minuscule (but non-zero) risk
11
+ # that two non-identical files will have the same checksum. While this risk is only about 1 in 10^48
12
+ # when using the SHA1 checksum alone, it can be reduced even further (to about 1 in 10^86)
13
+ # if we use the MD5 and SHA1 checksums together. And we gain a bit more comfort by including a comparison of file sizes.
14
+ #
15
+ # Finally, the "collision" risk is reduced by isolation of each digital object's file pool within an object folder,
16
+ # instead of in a common storage area shared by the whole repository.
17
+ #
18
+ # ====Data Model
19
+ # * {FileInventory} = container for recording information about a collection of related files
20
+ # * {FileGroup} [1..*] = subset allowing segregation of content and metadata files
21
+ # * {FileManifestation} [1..*] = snapshot of a file's filesystem characteristics
22
+ # * <b>{FileSignature} [1] = file fixity information</b>
23
+ # * {FileInstance} [1..*] = filepath and timestamp of any physical file having that signature
24
+ #
25
+ # * {SignatureCatalog} = lookup table containing a cumulative collection of all files ever ingested
26
+ # * {SignatureCatalogEntry} [1..*] = a row in the lookup table containing storage information about a single file
27
+ # * <b>{FileSignature} [1] = file fixity information</b>
28
+ #
29
+ # * {FileInventoryDifference} = compares two {FileInventory} instances based on file signatures and pathnames
30
+ # * {FileGroupDifference} [1..*] = performs analysis and reports differences between two matching {FileGroup} objects
31
+ # * {FileGroupDifferenceSubset} [1..5] = collects a set of file-level differences of a given change type
32
+ # * {FileInstanceDifference} [1..*] = contains difference information at the file level
33
+ # * <b>{FileSignature} [1..2] = contains the file signature(s) of two file instances being compared</b>
34
+ #
35
+ # @see http://searchstorage.techtarget.com/feature/The-skinny-on-data-deduplication
36
+ # @see http://www.ibm.com/developerworks/wikis/download/attachments/106987789/TSMDataDeduplication.pdf
37
+ # @see https://www.redlegg.com/pdf_file/3_1320410927_HowDataDedupeWorks_WP_100809.pdf
38
+ # @see http://www.library.yale.edu/iac/DPC/AN_DPC_FixityChecksFinal11.pdf
39
+ #
40
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
41
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
42
class FileSignature < Serializable

  include HappyMapper

  # The name of the XML element used to serialize this object's data
  tag 'fileSignature'

  # (see Serializable#initialize)
  def initialize(opts={})
    super(opts)
  end

  # @attribute
  # @return [Integer] The size of the file in bytes
  attribute :size, Integer, :on_save => Proc.new { |n| n.to_s }

  # @attribute
  # @return [String] The MD5 checksum value of the file
  attribute :md5, String, :on_save => Proc.new { |n| n.nil? ? "" : n.to_s }

  # @attribute
  # @return [String] The SHA1 checksum value of the file
  attribute :sha1, String, :on_save => Proc.new { |n| n.nil? ? "" : n.to_s }

  # @attribute
  # @return [String] The SHA256 checksum value of the file
  attribute :sha256, String, :on_save => Proc.new { |n| n.nil? ? "" : n.to_s }

  # @param type [Symbol,String] The type of checksum (matched case-insensitively)
  # @param value [String] The checksum value
  # @return [void] Set the value of the specified checksum type
  # @raise [RuntimeError] if type is not one of md5, sha1 or sha256
  def set_checksum(type,value)
    case type.to_s.downcase.to_sym
    when :md5
      @md5 = value
    when :sha1
      @sha1 = value
    when :sha256
      @sha256 = value
    else
      raise "Unknown checksum type '#{type.to_s}'"
    end
  end

  # @return [Hash<Symbol,String>] A hash of the checksum data, in md5, sha1, sha256 order,
  #   omitting any checksum that is nil or empty
  def checksums
    checksum_hash = OrderedHash.new
    checksum_hash[:md5] = @md5
    checksum_hash[:sha1] = @sha1
    checksum_hash[:sha256] = @sha256
    checksum_hash.delete_if { |_type, value| value.nil? || value.empty? }
    checksum_hash
  end

  # @return [Boolean] The signature contains all of the 3 desired checksums
  def complete?
    checksums.size == 3
  end

  # @api internal
  # @return [Hash<Symbol,String>] A hash of fixity data from this signature object
  #   (size as a string, followed by the available checksums)
  def fixity
    fixity_hash = OrderedHash.new
    fixity_hash[:size] = @size.to_s
    fixity_hash.merge!(checksums)
    fixity_hash
  end

  # @api internal
  # @param other [FileSignature] The other file signature being compared to this signature
  # @return [Boolean] Returns true if self and other have comparable fixity data:
  #   equal sizes, at least one checksum type in common, and equal values for every common type.
  #   Returns false when other is nil or cannot supply a size and checksums.
  def eql?(other)
    # An object of an incomparable type can never be equal; answer false instead of raising NoMethodError
    return false unless other.respond_to?(:size) && other.respond_to?(:checksums)
    return false if size.to_i != other.size.to_i
    self_checksums = checksums
    other_checksums = other.checksums
    # Only the checksum types present on both sides can be compared
    matching_keys = self_checksums.keys & other_checksums.keys
    return false if matching_keys.empty?
    matching_keys.all? { |key| self_checksums[key] == other_checksums[key] }
  end

  # @api internal
  # (see #eql?)
  def ==(other)
    eql?(other)
  end

  # @api internal
  # @return [Fixnum] Compute a hash-code for the fixity value array.
  #   Two file instances with the same content will have the same hash code (and will compare using eql?).
  # @note The hash and eql? methods override the methods inherited from Object.
  #   These methods ensure that instances of this class can be used as Hash keys. See
  #   * {http://www.paulbutcher.com/2007/10/navigating-the-equality-maze/}
  #   * {http://techbot.me/2011/05/ruby-basics-equality-operators-ruby/}
  #   Also overridden is {#==} so that equality tests in other contexts will also return the expected result.
  def hash
    # Only the size may contribute: eql? accepts signatures that share just a subset of
    # checksum types, so any checksum-based hash could differ between two eql? signatures.
    @size.to_i
  end

  # @api internal
  # @param pathname [Pathname] The location of the file to be digested
  # @return [FileSignature] Generate a FileSignature instance containing size and checksums for a physical file
  #   (this object is updated in place and returned)
  def signature_from_file(pathname)
    @size = pathname.size
    md5_digest = Digest::MD5.new
    sha1_digest = Digest::SHA1.new
    sha256_digest = Digest::SHA2.new(256)
    # Binary mode: checksums must be computed over the exact bytes on disk,
    # without any newline or encoding translation applied by text mode.
    pathname.open("rb") do |stream|
      # Feed the file to all three digests in one pass, 8 KiB at a time
      while (buffer = stream.read(8192))
        md5_digest.update(buffer)
        sha1_digest.update(buffer)
        sha256_digest.update(buffer)
      end
    end
    @md5 = md5_digest.hexdigest
    @sha1 = sha1_digest.hexdigest
    @sha256 = sha256_digest.hexdigest
    self
  end

  # @api internal
  # @param pathname [Pathname] The location of the file whose full signature will be returned
  # @return [FileSignature] The full signature derived from the file, unless the fixity is inconsistent with current values
  # @raise [RuntimeError] if the signature computed from the file is not eql? to this signature
  def normalized_signature(pathname)
    sig_from_file = FileSignature.new.signature_from_file(pathname)
    # The full signature from file is consistent with current values
    return sig_from_file if eql?(sig_from_file)
    # One or more of the fixity values is inconsistent, so raise an exception
    # NOTE(review): #diff is not defined in this class; presumably inherited from Serializable -- confirm
    raise "Signature inconsistent between inventory and file for #{pathname}: #{self.diff(sig_from_file).inspect}"
  end

  # @return [Hash<Symbol,Array<String>>] Key is type (e.g. :sha1), value is checksum names (e.g. ['SHA-1', 'SHA1'])
  def FileSignature.checksum_names_for_type
    names_for_type = OrderedHash.new
    names_for_type[:md5] = ['MD5']
    names_for_type[:sha1] = ['SHA-1', 'SHA1']
    names_for_type[:sha256] = ['SHA-256', 'SHA256']
    names_for_type
  end

  # @return [Hash<String, Symbol>] Key is checksum name (e.g. MD5), value is checksum type (e.g. :md5)
  def FileSignature.checksum_type_for_name
    type_for_name = OrderedHash.new
    # Invert checksum_names_for_type: every alias points back to its type
    checksum_names_for_type.each do |type, names|
      names.each do |name|
        type_for_name[name] = type
      end
    end
    type_for_name
  end

end
199
+
200
+ end
@@ -0,0 +1,195 @@
1
+ require 'moab'
2
+
3
+ module Moab
4
+
5
+ # A digital object's Signature Catalog is derived from a filtered aggregation of the file inventories
6
+ # of a digital object's set of versions. (see {#update})
7
+ # It has an entry for every file (identified by {FileSignature}) found in any of the versions,
8
+ # along with a record of the SDR storage location that was used to preserve a single file instance.
9
+ # Once this catalog has been populated, it has multiple uses:
10
+ # * The signature index is used to determine which files of a newly submitted object version
11
+ # are new additions and which are duplicates of files previously ingested. (See {#version_additions})
12
+ # (When a new version contains a mixture of added files and files carried over from the previous version
13
+ # we only need to store the files from the new version that have unique file signatures.)
14
+ # * Reconstruction of an object version (see {StorageObject#reconstruct_version}) requires a combination
15
+ # of a full version's {FileInventory} and the SignatureCatalog.
16
+ # * The catalog can also be used for performing consistency checks between manifest files and storage
17
+ #
18
+ # ====Data Model
19
+ # * <b>{SignatureCatalog} = lookup table containing a cumulative collection of all files ever ingested</b>
20
+ # * {SignatureCatalogEntry} [1..*] = a row in the lookup table containing storage information about a single file
21
+ # * {FileSignature} [1] = file fixity information
22
+ #
23
+ # @example {include:file:spec/fixtures/derivatives/manifests/v3/signatureCatalog.xml}
24
+ # @see StorageObject
25
+ # @see Bagger
26
+ # @note Copyright (c) 2012 by The Board of Trustees of the Leland Stanford Junior University.
27
+ # All rights reserved. See {file:LICENSE.rdoc} for details.
28
class SignatureCatalog < Manifest
  include HappyMapper

  # The name of the XML element used to serialize this object's data
  tag 'signatureCatalog'

  # (see Serializable#initialize)
  def initialize(opts={})
    # Both collections must exist before the superclass initializer runs,
    # because #add_entry writes to each of them
    @entries = Array.new
    @signature_hash = OrderedHash.new
    super(opts)
  end

  # @attribute
  # @return [String] The object ID (druid)
  attribute :digital_object_id, String, :tag => 'objectId'

  # @attribute
  # @return [Integer] The ordinal version number
  attribute :version_id, Integer, :tag => 'versionId', :key => true, :on_save => Proc.new {|n| n.to_s}

  # @return [String] The unique identifier concatenating digital object id with version id
  def composite_key
    @digital_object_id + '-' + StorageObject.version_dirname(@version_id)
  end

  # @attribute
  # @return [Time] The datetime at which the catalog was updated
  attribute :catalog_datetime, Time, :tag => 'catalogDatetime', :on_save => Proc.new {|t| t.to_s}

  # @param datetime [Object] The new catalog datetime; stored as whatever Time.input returns for it
  # @return [void]
  def catalog_datetime=(datetime)
    @catalog_datetime=Time.input(datetime)
  end

  # @return [Object] The stored catalog datetime as rendered by Time.output
  def catalog_datetime
    Time.output(@catalog_datetime)
  end

  # @attribute
  # @return [Integer] The total number of data files (dynamically calculated)
  attribute :file_count, Integer, :tag => 'fileCount', :on_save => Proc.new {|t| t.to_s}

  def file_count
    entries.size
  end

  # @attribute
  # @return [Integer] The total size (in bytes) of all data files (dynamically calculated)
  attribute :byte_count, Integer, :tag => 'byteCount', :on_save => Proc.new {|t| t.to_s}

  def byte_count
    entries.inject(0) { |sum, entry| sum + entry.signature.size.to_i }
  end

  # @attribute
  # @return [Integer] The total disk usage (in 1 kB blocks) of all data files (estimating du -k result) (dynamically calculated)
  attribute :block_count, Integer, :tag => 'blockCount', :on_save => Proc.new {|t| t.to_s}

  def block_count
    block_size = 1024
    # Each file's size is rounded up to a whole number of blocks before summing
    entries.inject(0) { |sum, entry| sum + (entry.signature.size.to_i + block_size - 1) / block_size }
  end

  # @return [Array<String>] The data fields to include in summary reports
  def summary_fields
    %w{digital_object_id version_id catalog_datetime file_count byte_count block_count}
  end

  # @attribute
  # @return [Array<SignatureCatalogEntry>] The set of data groups comprising the version
  has_many :entries, SignatureCatalogEntry, :tag => 'entry'

  # @param entry_array [Array<SignatureCatalogEntry>] The entries to add; each one goes through
  #   {#add_entry}, so the {#signature_hash} index is kept in step with the entry list
  # @return [void]
  def entries=(entry_array)
    entry_array.each do |entry|
      add_entry(entry)
    end
  end

  # @return [OrderedHash] An index having {FileSignature} objects as keys and {SignatureCatalogEntry} objects as values
  attr_accessor :signature_hash

  # @api internal
  # @param entry [SignatureCatalogEntry] The new catalog entry
  # @return [void] Add a new entry to the catalog and to the {#signature_hash} index
  def add_entry(entry)
    @signature_hash[entry.signature] = entry
    entries << entry
  end

  # @param [FileSignature] file_signature The signature of the file whose path is sought
  # @return [String] The object-relative path of the file having the specified signature
  # @raise [FileNotFoundException] if the catalog has no entry for the signature
  def catalog_filepath(file_signature)
    catalog_entry = @signature_hash[file_signature]
    raise FileNotFoundException, "catalog entry not found for #{file_signature.fixity.inspect} in #{@digital_object_id} - #{@version_id}" if catalog_entry.nil?
    catalog_entry.storage_path
  end

  # @param group [FileGroup] A group of the files from a file inventory
  # @param group_pathname [Pathname] The location of the directory containing the group's files
  # @return [void] Inspect and upgrade the group's signature data to include all desired checksums
  # @raise [RuntimeError] if group_pathname is given but does not exist
  def normalize_group_signatures(group, group_pathname=nil)
    unless group_pathname.nil?
      group_pathname = Pathname(group_pathname)
      raise "Could not locate #{group_pathname}" unless group_pathname.exist?
    end
    group.files.each do |file|
      next if file.signature.complete?
      if @signature_hash.has_key?(file.signature)
        # Swap in the signature object already held as a key of the catalog index
        file.signature = @signature_hash.keys.find { |known| known == file.signature }
      elsif group_pathname
        # Not yet cataloged: compute the full signature from the file on disk
        file_pathname = group_pathname.join(file.instances[0].path)
        file.signature = file.signature.normalized_signature(file_pathname)
      end
    end
  end

  # @api external
  # @param version_inventory [FileInventory] The complete inventory of the files comprising a digital object version
  # @param data_pathname [Pathname, String] The location of the object's data directory
  # @return [void] Compares the {FileSignature} entries in the new versions {FileInventory} against the signatures
  #   in this catalog and create new {SignatureCatalogEntry} additions to the catalog
  # @example {include:file:spec/features/catalog/catalog_update_spec.rb}
  def update(version_inventory, data_pathname)
    version_inventory.groups.each do |group|
      group.files.each do |file|
        next if @signature_hash.has_key?(file.signature)
        entry = SignatureCatalogEntry.new
        entry.version_id = version_inventory.version_id
        entry.group_id = group.group_id
        entry.path = file.instances[0].path
        if file.signature.complete?
          entry.signature = file.signature
        else
          # Coerce only when the path is actually needed, so a String works here
          # just as it does in #normalize_group_signatures
          file_pathname = Pathname(data_pathname).join(group.group_id, entry.path)
          entry.signature = file.signature.normalized_signature(file_pathname)
        end
        add_entry(entry)
      end
    end
    @version_id = version_inventory.version_id
    @catalog_datetime = Time.now
  end

  # @api external
  # @param version_inventory (see #update)
  # @return [FileInventory] Returns a filtered copy of the input inventory
  #   containing only those files that were added in this version
  # @example {include:file:spec/features/catalog/version_additions_spec.rb}
  def version_additions(version_inventory)
    additions = FileInventory.new(:type=>'additions')
    additions.copy_ids(version_inventory)
    version_inventory.groups.each do |group|
      group_additions = FileGroup.new(:group_id => group.group_id)
      group.files.each do |file|
        # Only files whose signature is not yet cataloged count as additions
        unless @signature_hash.has_key?(file.signature)
          group_additions.add_file_instance(file.signature,file.instances[0])
        end
      end
      # Groups that contribute no new files are left out of the result
      additions.groups << group_additions if group_additions.files.size > 0
    end
    additions
  end

end
194
+
195
+ end