longleaf 0.3.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +12 -2
  3. data/README.md +11 -1
  4. data/lib/longleaf/candidates/manifest_digest_provider.rb +17 -0
  5. data/lib/longleaf/candidates/single_digest_provider.rb +13 -0
  6. data/lib/longleaf/cli.rb +49 -36
  7. data/lib/longleaf/commands/register_command.rb +3 -3
  8. data/lib/longleaf/commands/validate_config_command.rb +1 -1
  9. data/lib/longleaf/events/register_event.rb +8 -4
  10. data/lib/longleaf/helpers/case_insensitive_hash.rb +38 -0
  11. data/lib/longleaf/helpers/digest_helper.rb +7 -1
  12. data/lib/longleaf/helpers/s3_uri_helper.rb +86 -0
  13. data/lib/longleaf/helpers/selection_options_parser.rb +189 -0
  14. data/lib/longleaf/helpers/service_date_helper.rb +29 -1
  15. data/lib/longleaf/indexing/sequel_index_driver.rb +2 -20
  16. data/lib/longleaf/models/app_fields.rb +4 -2
  17. data/lib/longleaf/models/filesystem_metadata_location.rb +56 -0
  18. data/lib/longleaf/models/filesystem_storage_location.rb +52 -0
  19. data/lib/longleaf/models/metadata_location.rb +47 -0
  20. data/lib/longleaf/models/metadata_record.rb +3 -1
  21. data/lib/longleaf/models/s3_storage_location.rb +133 -0
  22. data/lib/longleaf/models/service_fields.rb +4 -0
  23. data/lib/longleaf/models/storage_location.rb +17 -48
  24. data/lib/longleaf/models/storage_types.rb +9 -0
  25. data/lib/longleaf/preservation_services/rsync_replication_service.rb +9 -11
  26. data/lib/longleaf/preservation_services/s3_replication_service.rb +143 -0
  27. data/lib/longleaf/services/application_config_deserializer.rb +26 -4
  28. data/lib/longleaf/services/application_config_validator.rb +17 -6
  29. data/lib/longleaf/services/configuration_validator.rb +64 -4
  30. data/lib/longleaf/services/filesystem_location_validator.rb +16 -0
  31. data/lib/longleaf/services/metadata_deserializer.rb +41 -9
  32. data/lib/longleaf/services/metadata_persistence_manager.rb +3 -2
  33. data/lib/longleaf/services/metadata_serializer.rb +94 -13
  34. data/lib/longleaf/services/metadata_validator.rb +76 -0
  35. data/lib/longleaf/services/s3_location_validator.rb +19 -0
  36. data/lib/longleaf/services/service_definition_validator.rb +16 -8
  37. data/lib/longleaf/services/service_manager.rb +7 -15
  38. data/lib/longleaf/services/service_mapping_validator.rb +26 -15
  39. data/lib/longleaf/services/storage_location_manager.rb +38 -12
  40. data/lib/longleaf/services/storage_location_validator.rb +41 -30
  41. data/lib/longleaf/specs/config_builder.rb +10 -3
  42. data/lib/longleaf/specs/config_validator_helpers.rb +16 -0
  43. data/lib/longleaf/specs/metadata_builder.rb +1 -0
  44. data/lib/longleaf/version.rb +1 -1
  45. data/longleaf.gemspec +3 -1
  46. data/mkdocs.yml +2 -1
  47. metadata +48 -8
  48. data/.travis.yml +0 -4
  49. data/lib/longleaf/services/storage_path_validator.rb +0 -16
@@ -4,14 +4,25 @@ require_relative 'service_mapping_validator'
4
4
 
5
5
  module Longleaf
6
6
  # Validator for Longleaf application configuration
7
- class ApplicationConfigValidator
7
+ class ApplicationConfigValidator < ConfigurationValidator
8
+ # @param config [Hash] hash containing the application configuration
9
+ def initialize(config)
10
+ super(config)
11
+ end
12
+
13
+ protected
8
14
  # Validates the application configuration provided. Will raise ConfigurationError
9
15
  # if any portion of the configuration is not syntactically or semantically valid.
10
- # @param config [Hash] application configuration
11
- def self.validate(config)
12
- Longleaf::StorageLocationValidator::validate_config(config)
13
- Longleaf::ServiceDefinitionValidator::validate_config(config)
14
- Longleaf::ServiceMappingValidator::validate_config(config)
16
+ def validate
17
+ loc_result = StorageLocationValidator.new(@config).validate_config
18
+ defs_result = ServiceDefinitionValidator.new(@config).validate_config
19
+ mapping_result = ServiceMappingValidator.new(@config).validate_config
20
+
21
+ @result.errors.concat(loc_result.errors) unless loc_result.valid?
22
+ @result.errors.concat(defs_result.errors) unless defs_result.valid?
23
+ @result.errors.concat(mapping_result.errors) unless mapping_result.valid?
24
+
25
+ @result
15
26
  end
16
27
  end
17
28
  end
@@ -1,11 +1,71 @@
1
1
  module Longleaf
2
2
  # Abstract configuration validator class
3
3
  class ConfigurationValidator
4
- class << self
5
- protected
6
- def assert(fail_message, assertion_passed)
7
- raise ConfigurationError.new(fail_message) unless assertion_passed
4
+ attr_reader :result
5
+
6
+ def initialize(config)
7
+ @result = ConfigurationValidationResult.new
8
+ @config = config
9
+ end
10
+
11
+ # Verify that the provided configuration is valid
12
+ # @return [ConfigurationValidationResult] the result of the validation
13
+ def validate_config
14
+ register_on_failure { validate }
15
+
16
+ @result
17
+ end
18
+
19
+ # Asserts that the given condition is true, raising a ConfigurationError if it is not.
20
+ def assert(fail_message, assertion_passed)
21
+ fail(fail_message) unless assertion_passed
22
+ end
23
+
24
+ # Indicate that validation has failed, raising a ConfigurationError with the given message
25
+ def fail(fail_message)
26
+ raise ConfigurationError.new(fail_message)
27
+ end
28
+
29
+ # Registers an error to the result for this validator
30
+ def register_error(error)
31
+ if error.is_a?(StandardError)
32
+ @result.register_error(error.msg)
33
+ else
34
+ @result.register_error(error)
35
+ end
36
+ end
37
+
38
+ # Performs the provided block. If the block produces a ConfigurationError, the error
39
+ # is swallowed and registered to the result
40
+ def register_on_failure
41
+ begin
42
+ yield
43
+ rescue ConfigurationError => err
44
+ register_error(err.message)
8
45
  end
9
46
  end
10
47
  end
48
+
49
+ class ConfigurationValidationResult
50
+ attr_reader :errors
51
+
52
+ def initialize
53
+ @errors = Array.new
54
+ end
55
+
56
+ # Register an error with this validation result
57
+ def register_error(error_message)
58
+ @errors << error_message
59
+ end
60
+
61
+ # @return [boolean] true if validation produced no errors
62
+ def valid?
63
+ @errors.length == 0
64
+ end
65
+
66
+ # @raise [ConfigurationError] if the result is not valid, which lists all failures
67
+ def raise_if_invalid
68
+ raise ConfigurationError.new(@errors.join("\n")) unless valid?
69
+ end
70
+ end
11
71
  end
@@ -0,0 +1,16 @@
1
+ require 'pathname'
2
+ require 'longleaf/errors'
3
+
4
+ module Longleaf
5
+ # Validates the configuration of a filesystem based location
6
+ class FilesystemLocationValidator
7
+
8
+ def self.validate(p_validator, name, path_prop, section_name, path)
9
+ base_msg = "Storage location '#{name}' specifies invalid #{section_name} '#{path_prop}' property: "
10
+ p_validator.assert(base_msg + 'Path must not be empty', !path.nil? && !path.to_s.strip.empty?)
11
+ p_validator.assert(base_msg + 'Path must not contain any relative modifiers (/..)', !path.include?('/..'))
12
+ p_validator.assert(base_msg + 'Path must be absolute', Pathname.new(path).absolute?)
13
+ p_validator.assert(base_msg + 'Path does not exist', Dir.exist?(path))
14
+ end
15
+ end
16
+ end
@@ -1,6 +1,7 @@
1
1
  require 'yaml'
2
2
  require 'longleaf/models/metadata_record'
3
3
  require 'longleaf/models/md_fields'
4
+ require 'longleaf/services/metadata_validator'
4
5
  require 'longleaf/errors'
5
6
  require 'longleaf/logging'
6
7
 
@@ -15,6 +16,8 @@ module Longleaf
15
16
  # @param file_path [String] path of the file to read. Required.
16
17
  # @param format [String] format the file is stored in. Default is 'yaml'.
17
18
  def self.deserialize(file_path:, format: 'yaml', digest_algs: [])
19
+ file_path = file_path.path if file_path.is_a?(File)
20
+
18
21
  case format
19
22
  when 'yaml'
20
23
  md = from_yaml(file_path, digest_algs)
@@ -22,10 +25,6 @@ module Longleaf
22
25
  raise ArgumentError.new("Invalid deserialization format #{format} specified")
23
26
  end
24
27
 
25
- if !md || !md.is_a?(Hash) || !md.key?(MDF::DATA) || !md.key?(MDF::SERVICES)
26
- raise Longleaf::MetadataError.new("Invalid metadata file, did not contain data or services fields: #{file_path}")
27
- end
28
-
29
28
  data = Hash.new.merge(md[MDF::DATA])
30
29
  # Extract reserved properties for submission as separate parameters
31
30
  registered = data.delete(MDFields::REGISTERED_TIMESTAMP)
@@ -37,7 +36,7 @@ module Longleaf
37
36
  services = md[MDF::SERVICES]
38
37
  service_records = Hash.new
39
38
  services&.each do |name, props|
40
- raise Longleaf::MetadataError.new("Value of service #{name} must be a hash") unless props.class == Hash
39
+ raise MetadataError.new("Value of service #{name} must be a hash") unless props.class == Hash
41
40
 
42
41
  service_props = Hash.new.merge(props)
43
42
 
@@ -66,12 +65,45 @@ module Longleaf
66
65
  File.open(file_path, 'r:bom|utf-8') do |f|
67
66
  contents = f.read
68
67
 
69
- verify_digests(file_path, contents, digest_algs)
68
+ checksum_error = nil
69
+ begin
70
+ verify_digests(file_path, contents, digest_algs)
71
+ rescue ChecksumMismatchError => err
72
+ # Hold onto the checksum error, in case we can identify the underlying cause
73
+ checksum_error = err
74
+ end
70
75
 
71
76
  begin
72
- YAML.safe_load(contents, [], [], true)
73
- rescue => err
74
- raise Longleaf::MetadataError.new("Failed to parse metadata file #{file_path}: #{err.message}")
77
+ md = nil
78
+ begin
79
+ md = YAML.safe_load(contents, [], [], true)
80
+ rescue => err
81
+ raise MetadataError.new("Failed to parse metadata file #{file_path}: #{err.message}")
82
+ end
83
+
84
+ validation_result = MetadataValidator.new(md).validate_config
85
+ if !validation_result.valid?
86
+ if checksum_error.nil?
87
+ raise MetadataError.new("Invalid metadata file #{file_path.to_s}:\n#{validation_result.errors.join("\n")}")
88
+ else
89
+ raise MetadataError.new(validation_result.errors.join("\n"))
90
+ end
91
+ end
92
+
93
+ # Either return the valid metadata, or raise the checksum error as is
94
+ if checksum_error.nil?
95
+ md
96
+ else
97
+ raise checksum_error
98
+ end
99
+ rescue MetadataError => err
100
+ if checksum_error.nil?
101
+ raise err
102
+ else
103
+ # Add underlying cause from the metadata error to the checksum mismatch error
104
+ msg = checksum_error.message + "\nWith related issue(s):\n#{err.message}"
105
+ raise ChecksumMismatchError.new(msg)
106
+ end
75
107
  end
76
108
  end
77
109
  end
@@ -1,4 +1,5 @@
1
1
  require 'longleaf/services/metadata_serializer'
2
+ require 'longleaf/services/metadata_deserializer'
2
3
  require 'longleaf/errors'
3
4
 
4
5
  module Longleaf
@@ -20,7 +21,7 @@ module Longleaf
20
21
 
21
22
  MetadataSerializer::write(metadata: file_rec.metadata_record,
22
23
  file_path: file_rec.metadata_path,
23
- digest_algs: file_rec.storage_location.metadata_digests)
24
+ digest_algs: file_rec.storage_location.metadata_location.digests)
24
25
 
25
26
  index(file_rec)
26
27
  end
@@ -38,7 +39,7 @@ module Longleaf
38
39
  # @return [MetadataRecord] the metadata record for the file record
39
40
  def load(file_rec)
40
41
  md_rec = MetadataDeserializer.deserialize(file_path: file_rec.metadata_path,
41
- digest_algs: file_rec.storage_location.metadata_digests)
42
+ digest_algs: file_rec.storage_location.metadata_location.digests)
42
43
  file_rec.metadata_record = md_rec
43
44
  md_rec
44
45
  end
@@ -5,6 +5,7 @@ require 'longleaf/helpers/digest_helper'
5
5
  require 'longleaf/errors'
6
6
  require 'longleaf/logging'
7
7
  require 'pathname'
8
+ require "tempfile"
8
9
 
9
10
  module Longleaf
10
11
  # Service which serializes MetadataRecord objects
@@ -30,12 +31,7 @@ module Longleaf
30
31
  raise ArgumentError.new("Invalid serialization format #{format} specified")
31
32
  end
32
33
 
33
- # Fill in parent directories if they do not exist
34
- parent_dir = Pathname(file_path).parent
35
- parent_dir.mkpath unless parent_dir.exist?
36
-
37
- File.write(file_path, content)
38
- write_digests(file_path, content, digest_algs)
34
+ atomic_write(file_path, content, digest_algs)
39
35
  end
40
36
 
41
37
  # @param metadata [MetadataRecord] metadata record to transform
@@ -85,24 +81,109 @@ module Longleaf
85
81
  end
86
82
  end
87
83
 
84
+ # Safely writes the new metadata file and its digests.
85
+ # It does so by first writing the content and its digests to temp files,
86
+ # then making the temp files the current version of the file.
87
+ # Attempts to clean up new data in the case of failure.
88
+ def self.atomic_write(file_path, content, digest_algs)
89
+ # Fill in parent directories if they do not exist
90
+ parent_dir = Pathname(file_path).parent
91
+ parent_dir.mkpath unless parent_dir.exist?
92
+
93
+ file_path = file_path.path if file_path.respond_to?(:path)
94
+
95
+ # If file does not already exist, then simply write it
96
+ if !File.exist?(file_path)
97
+ File.write(file_path, content)
98
+ write_digests(file_path, content, digest_algs)
99
+ return
100
+ end
101
+
102
+ # Updating file, use safe atomic write
103
+ File.open(file_path) do |original_file|
104
+ original_file.flock(File::LOCK_EX)
105
+
106
+ base_name = File.basename(file_path)
107
+ Tempfile.open(base_name, parent_dir) do |temp_file|
108
+ begin
109
+ # Write content to temp file
110
+ temp_file.write(content)
111
+ temp_file.close
112
+
113
+ temp_path = temp_file.path
114
+
115
+ # Set permissions of new file to match old if it exists
116
+ old_stat = File.stat(file_path)
117
+ set_perms(temp_path, old_stat)
118
+
119
+ begin
120
+ digest_paths = write_digests(temp_path, content, digest_algs)
121
+
122
+ File.rename(temp_path, file_path)
123
+ rescue => e
124
+ cleanup_digests(temp_path)
125
+ raise e
126
+ end
127
+ rescue => e
128
+ temp_file.delete
129
+ raise e
130
+ end
131
+
132
+ # Cleanup all existing digest files, in case the set of algorithms has changed
133
+ cleanup_digests(file_path)
134
+ # Move new digests into place
135
+ digest_paths.each do |digest_path|
136
+ File.rename(digest_path, digest_path.sub(temp_path, file_path))
137
+ end
138
+ end
139
+ end
140
+ end
141
+
142
+ def self.set_perms(file_path, stat_info)
143
+ if stat_info
144
+ # Set correct permissions on new file
145
+ begin
146
+ File.chown(stat_info.uid, stat_info.gid, file_path)
147
+ # This operation will affect filesystem ACLs
148
+ File.chmod(stat_info.mode, file_path)
149
+ rescue Errno::EPERM, Errno::EACCES
150
+ # Changing file ownership failed, moving on.
151
+ return false
152
+ end
153
+ end
154
+ true
155
+ end
156
+
157
+ # Deletes all known digest files for the provided file path
158
+ def self.cleanup_digests(file_path)
159
+ DigestHelper::KNOWN_DIGESTS.each do |alg|
160
+ digest_path = "#{file_path}.#{alg}"
161
+ File.delete(digest_path) if File.exist?(digest_path)
162
+ end
163
+ end
164
+
88
165
  def self.write_digests(file_path, content, digests)
89
- return if digests.nil? || digests.empty?
166
+ return [] if digests.nil? || digests.empty?
167
+
168
+ digest_paths = Array.new
90
169
 
91
170
  digests.each do |alg|
92
171
  digest_class = DigestHelper::start_digest(alg)
93
172
  result = digest_class.hexdigest(content)
94
- if file_path.respond_to?(:path)
95
- digest_path = "#{file_path.path}.#{alg}"
96
- else
97
- digest_path = "#{file_path}.#{alg}"
98
- end
173
+ digest_path = "#{file_path}.#{alg}"
99
174
 
100
175
  File.write(digest_path, result)
101
176
 
102
- self.logger.debug("Generated #{alg} digest for metadata file #{file_path}: #{result}")
177
+ digest_paths.push(digest_path)
178
+
179
+ self.logger.debug("Generated #{alg} digest for metadata file #{file_path}: #{digest_path} #{result}")
103
180
  end
181
+
182
+ digest_paths
104
183
  end
105
184
 
185
+ private_class_method :cleanup_digests
106
186
  private_class_method :write_digests
187
+ private_class_method :atomic_write
107
188
  end
108
189
  end
@@ -0,0 +1,76 @@
1
+ require 'pathname'
2
+ require 'longleaf/models/md_fields'
3
+ require 'longleaf/errors'
4
+ require_relative 'configuration_validator'
5
+
6
+ module Longleaf
7
+ # Validator for file metadata
8
+ class MetadataValidator < ConfigurationValidator
9
+ MDF ||= MDFields
10
+
11
+ # @param config [Hash] hash containing the metadata for a file
12
+ def initialize(config)
13
+ super(config)
14
+ end
15
+
16
+ protected
17
+ # Validates the provided metadata for a file to ensure that it is syntactically correct and field types
18
+ # are valid.
19
+ def validate
20
+ assert("Metadata must be a hash, but a #{@config.class} was provided", @config.class == Hash)
21
+ assert("Metadata must contain a '#{MDF::DATA}' key", @config.key?(MDF::DATA))
22
+ assert("Metadata must contain a '#{MDF::SERVICES}' key", @config.key?(MDF::SERVICES))
23
+
24
+ data = @config[MDF::DATA]
25
+ register_on_failure { validate_date_field(data, MDF::REGISTERED_TIMESTAMP) }
26
+ register_on_failure { validate_date_field(data, MDF::DEREGISTERED_TIMESTAMP, required: false) }
27
+ register_on_failure { validate_date_field(data, MDF::LAST_MODIFIED) }
28
+
29
+ register_on_failure { validate_positive_integer(data, MDF::FILE_SIZE) }
30
+
31
+ checksums = data[MDF::CHECKSUMS]
32
+ register_on_failure do
33
+ if !checksums.nil? && !checksums.is_a?(Hash)
34
+ fail("Field '#{MDF::CHECKSUMS}' must be a map of algorithms to digests, but was a #{checksums.class}")
35
+ end
36
+ end
37
+
38
+ # Ensure that any service timestamps present are valid dates
39
+ services = @config[MDF::SERVICES]
40
+ services.each do |service_name, service_rec|
41
+ register_on_failure { validate_date_field(service_rec, MDF::SERVICE_TIMESTAMP, required: false) }
42
+ end
43
+ end
44
+
45
+ def validate_date_field(section, field_key, required: true)
46
+ field_val = section[field_key]
47
+
48
+ if field_val
49
+ begin
50
+ Time.iso8601(section[field_key])
51
+ rescue ArgumentError
52
+ fail("Field '#{field_key}' must be a valid ISO8601 timestamp, but contained value '#{section[field_key]}'")
53
+ end
54
+ elsif required
55
+ fail("Metadata must contain a '#{field_key}' field")
56
+ end
57
+ end
58
+
59
+ def validate_positive_integer(section, field_key, required: true)
60
+ field_val = section[field_key]
61
+
62
+ if field_val
63
+ begin
64
+ val = field_val.is_a?(Integer) ? field_val : Integer(field_val, 10)
65
+ if val < 0
66
+ fail("Field '#{field_key}' must be a positive integer")
67
+ end
68
+ rescue ArgumentError => err
69
+ fail("Field '#{field_key}' must be a positive integer")
70
+ end
71
+ elsif required
72
+ fail("Metadata must contain a '#{field_key}' field")
73
+ end
74
+ end
75
+ end
76
+ end