longleaf 0.3.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +12 -2
  3. data/README.md +11 -1
  4. data/lib/longleaf/candidates/manifest_digest_provider.rb +17 -0
  5. data/lib/longleaf/candidates/single_digest_provider.rb +13 -0
  6. data/lib/longleaf/cli.rb +49 -36
  7. data/lib/longleaf/commands/register_command.rb +3 -3
  8. data/lib/longleaf/commands/validate_config_command.rb +1 -1
  9. data/lib/longleaf/events/register_event.rb +8 -4
  10. data/lib/longleaf/helpers/case_insensitive_hash.rb +38 -0
  11. data/lib/longleaf/helpers/digest_helper.rb +7 -1
  12. data/lib/longleaf/helpers/s3_uri_helper.rb +86 -0
  13. data/lib/longleaf/helpers/selection_options_parser.rb +189 -0
  14. data/lib/longleaf/helpers/service_date_helper.rb +29 -1
  15. data/lib/longleaf/indexing/sequel_index_driver.rb +2 -20
  16. data/lib/longleaf/models/app_fields.rb +4 -2
  17. data/lib/longleaf/models/filesystem_metadata_location.rb +56 -0
  18. data/lib/longleaf/models/filesystem_storage_location.rb +52 -0
  19. data/lib/longleaf/models/metadata_location.rb +47 -0
  20. data/lib/longleaf/models/metadata_record.rb +3 -1
  21. data/lib/longleaf/models/s3_storage_location.rb +133 -0
  22. data/lib/longleaf/models/service_fields.rb +4 -0
  23. data/lib/longleaf/models/storage_location.rb +17 -48
  24. data/lib/longleaf/models/storage_types.rb +9 -0
  25. data/lib/longleaf/preservation_services/rsync_replication_service.rb +9 -11
  26. data/lib/longleaf/preservation_services/s3_replication_service.rb +143 -0
  27. data/lib/longleaf/services/application_config_deserializer.rb +26 -4
  28. data/lib/longleaf/services/application_config_validator.rb +17 -6
  29. data/lib/longleaf/services/configuration_validator.rb +64 -4
  30. data/lib/longleaf/services/filesystem_location_validator.rb +16 -0
  31. data/lib/longleaf/services/metadata_deserializer.rb +41 -9
  32. data/lib/longleaf/services/metadata_persistence_manager.rb +3 -2
  33. data/lib/longleaf/services/metadata_serializer.rb +94 -13
  34. data/lib/longleaf/services/metadata_validator.rb +76 -0
  35. data/lib/longleaf/services/s3_location_validator.rb +19 -0
  36. data/lib/longleaf/services/service_definition_validator.rb +16 -8
  37. data/lib/longleaf/services/service_manager.rb +7 -15
  38. data/lib/longleaf/services/service_mapping_validator.rb +26 -15
  39. data/lib/longleaf/services/storage_location_manager.rb +38 -12
  40. data/lib/longleaf/services/storage_location_validator.rb +41 -30
  41. data/lib/longleaf/specs/config_builder.rb +10 -3
  42. data/lib/longleaf/specs/config_validator_helpers.rb +16 -0
  43. data/lib/longleaf/specs/metadata_builder.rb +1 -0
  44. data/lib/longleaf/version.rb +1 -1
  45. data/longleaf.gemspec +3 -1
  46. data/mkdocs.yml +2 -1
  47. metadata +48 -8
  48. data/.travis.yml +0 -4
  49. data/lib/longleaf/services/storage_path_validator.rb +0 -16
@@ -4,14 +4,25 @@ require_relative 'service_mapping_validator'
4
4
 
5
5
  module Longleaf
6
6
  # Validator for Longleaf application configuration
7
- class ApplicationConfigValidator
7
+ class ApplicationConfigValidator < ConfigurationValidator
8
+ # @param config [Hash] hash containing the application configuration
9
+ def initialize(config)
10
+ super(config)
11
+ end
12
+
13
+ protected
8
14
  # Validates the application configuration provided. Will raise ConfigurationError
9
15
  # if any portion of the configuration is not syntactically or semantically valid.
10
- # @param config [Hash] application configuration
11
- def self.validate(config)
12
- Longleaf::StorageLocationValidator::validate_config(config)
13
- Longleaf::ServiceDefinitionValidator::validate_config(config)
14
- Longleaf::ServiceMappingValidator::validate_config(config)
16
+ def validate
17
+ loc_result = StorageLocationValidator.new(@config).validate_config
18
+ defs_result = ServiceDefinitionValidator.new(@config).validate_config
19
+ mapping_result = ServiceMappingValidator.new(@config).validate_config
20
+
21
+ @result.errors.concat(loc_result.errors) unless loc_result.valid?
22
+ @result.errors.concat(defs_result.errors) unless defs_result.valid?
23
+ @result.errors.concat(mapping_result.errors) unless mapping_result.valid?
24
+
25
+ @result
15
26
  end
16
27
  end
17
28
  end
@@ -1,11 +1,71 @@
1
1
  module Longleaf
2
2
  # Abstract configuration validator class
3
3
  class ConfigurationValidator
4
- class << self
5
- protected
6
- def assert(fail_message, assertion_passed)
7
- raise ConfigurationError.new(fail_message) unless assertion_passed
4
+ attr_reader :result
5
+
6
+ def initialize(config)
7
+ @result = ConfigurationValidationResult.new
8
+ @config = config
9
+ end
10
+
11
+ # Verify that the provided configuration is valid
12
+ # @return [ConfigurationValidationResult] the result of the validation
13
+ def validate_config
14
+ register_on_failure { validate }
15
+
16
+ @result
17
+ end
18
+
19
+ # Asserts that the given conditional is true, raising a ConfigurationError if it is not.
20
+ def assert(fail_message, assertion_passed)
21
+ fail(fail_message) unless assertion_passed
22
+ end
23
+
24
+ # Indicates that validation has failed, raising a ConfigurationError with the given message
25
+ def fail(fail_message)
26
+ raise ConfigurationError.new(fail_message)
27
+ end
28
+
29
+ # Registers an error to the result for this validator
30
+ def register_error(error)
31
+ if error.is_a?(StandardError)
32
+ @result.register_error(error.msg)
33
+ else
34
+ @result.register_error(error)
35
+ end
36
+ end
37
+
38
+ # Performs the provided block. If the block produces a ConfigurationError, the error
39
+ # is swallowed and registered to the result
40
+ def register_on_failure
41
+ begin
42
+ yield
43
+ rescue ConfigurationError => err
44
+ register_error(err.message)
8
45
  end
9
46
  end
10
47
  end
48
+
49
+ class ConfigurationValidationResult
50
+ attr_reader :errors
51
+
52
+ def initialize
53
+ @errors = Array.new
54
+ end
55
+
56
+ # Register an error with this validation result
57
+ def register_error(error_message)
58
+ @errors << error_message
59
+ end
60
+
61
+ # @return [boolean] true if validation produced no errors
62
+ def valid?
63
+ @errors.length == 0
64
+ end
65
+
66
+ # @raise [ConfigurationError] if the result is not valid, which lists all failures
67
+ def raise_if_invalid
68
+ raise ConfigurationError.new(@errors.join("\n")) unless valid?
69
+ end
70
+ end
11
71
  end
@@ -0,0 +1,16 @@
1
+ require 'pathname'
2
+ require 'longleaf/errors'
3
+
4
+ module Longleaf
5
+ # Validates the configuration of a filesystem based location
6
+ class FilesystemLocationValidator
7
+
8
+ def self.validate(p_validator, name, path_prop, section_name, path)
9
+ base_msg = "Storage location '#{name}' specifies invalid #{section_name} '#{path_prop}' property: "
10
+ p_validator.assert(base_msg + 'Path must not be empty', !path.nil? && !path.to_s.strip.empty?)
11
+ p_validator.assert(base_msg + 'Path must not contain any relative modifiers (/..)', !path.include?('/..'))
12
+ p_validator.assert(base_msg + 'Path must be absolute', Pathname.new(path).absolute?)
13
+ p_validator.assert(base_msg + 'Path does not exist', Dir.exist?(path))
14
+ end
15
+ end
16
+ end
@@ -1,6 +1,7 @@
1
1
  require 'yaml'
2
2
  require 'longleaf/models/metadata_record'
3
3
  require 'longleaf/models/md_fields'
4
+ require 'longleaf/services/metadata_validator'
4
5
  require 'longleaf/errors'
5
6
  require 'longleaf/logging'
6
7
 
@@ -15,6 +16,8 @@ module Longleaf
15
16
  # @param file_path [String] path of the file to read. Required.
16
17
  # @param format [String] format the file is stored in. Default is 'yaml'.
17
18
  def self.deserialize(file_path:, format: 'yaml', digest_algs: [])
19
+ file_path = file_path.path if file_path.is_a?(File)
20
+
18
21
  case format
19
22
  when 'yaml'
20
23
  md = from_yaml(file_path, digest_algs)
@@ -22,10 +25,6 @@ module Longleaf
22
25
  raise ArgumentError.new("Invalid deserialization format #{format} specified")
23
26
  end
24
27
 
25
- if !md || !md.is_a?(Hash) || !md.key?(MDF::DATA) || !md.key?(MDF::SERVICES)
26
- raise Longleaf::MetadataError.new("Invalid metadata file, did not contain data or services fields: #{file_path}")
27
- end
28
-
29
28
  data = Hash.new.merge(md[MDF::DATA])
30
29
  # Extract reserved properties for submission as separate parameters
31
30
  registered = data.delete(MDFields::REGISTERED_TIMESTAMP)
@@ -37,7 +36,7 @@ module Longleaf
37
36
  services = md[MDF::SERVICES]
38
37
  service_records = Hash.new
39
38
  services&.each do |name, props|
40
- raise Longleaf::MetadataError.new("Value of service #{name} must be a hash") unless props.class == Hash
39
+ raise MetadataError.new("Value of service #{name} must be a hash") unless props.class == Hash
41
40
 
42
41
  service_props = Hash.new.merge(props)
43
42
 
@@ -66,12 +65,45 @@ module Longleaf
66
65
  File.open(file_path, 'r:bom|utf-8') do |f|
67
66
  contents = f.read
68
67
 
69
- verify_digests(file_path, contents, digest_algs)
68
+ checksum_error = nil
69
+ begin
70
+ verify_digests(file_path, contents, digest_algs)
71
+ rescue ChecksumMismatchError => err
72
+ # Hold onto the checksum error, in case we can identify the underlying cause
73
+ checksum_error = err
74
+ end
70
75
 
71
76
  begin
72
- YAML.safe_load(contents, [], [], true)
73
- rescue => err
74
- raise Longleaf::MetadataError.new("Failed to parse metadata file #{file_path}: #{err.message}")
77
+ md = nil
78
+ begin
79
+ md = YAML.safe_load(contents, [], [], true)
80
+ rescue => err
81
+ raise MetadataError.new("Failed to parse metadata file #{file_path}: #{err.message}")
82
+ end
83
+
84
+ validation_result = MetadataValidator.new(md).validate_config
85
+ if !validation_result.valid?
86
+ if checksum_error.nil?
87
+ raise MetadataError.new("Invalid metadata file #{file_path.to_s}:\n#{validation_result.errors.join("\n")}")
88
+ else
89
+ raise MetadataError.new(validation_result.errors.join("\n"))
90
+ end
91
+ end
92
+
93
+ # Either return the valid metadata, or raise the checksum error as is
94
+ if checksum_error.nil?
95
+ md
96
+ else
97
+ raise checksum_error
98
+ end
99
+ rescue MetadataError => err
100
+ if checksum_error.nil?
101
+ raise err
102
+ else
103
+ # Add underlying cause from the metadata error to the checksum mismatch error
104
+ msg = checksum_error.message + "\nWith related issue(s):\n#{err.message}"
105
+ raise ChecksumMismatchError.new(msg)
106
+ end
75
107
  end
76
108
  end
77
109
  end
@@ -1,4 +1,5 @@
1
1
  require 'longleaf/services/metadata_serializer'
2
+ require 'longleaf/services/metadata_deserializer'
2
3
  require 'longleaf/errors'
3
4
 
4
5
  module Longleaf
@@ -20,7 +21,7 @@ module Longleaf
20
21
 
21
22
  MetadataSerializer::write(metadata: file_rec.metadata_record,
22
23
  file_path: file_rec.metadata_path,
23
- digest_algs: file_rec.storage_location.metadata_digests)
24
+ digest_algs: file_rec.storage_location.metadata_location.digests)
24
25
 
25
26
  index(file_rec)
26
27
  end
@@ -38,7 +39,7 @@ module Longleaf
38
39
  # @return [MetadataRecord] the metadata record for the file record
39
40
  def load(file_rec)
40
41
  md_rec = MetadataDeserializer.deserialize(file_path: file_rec.metadata_path,
41
- digest_algs: file_rec.storage_location.metadata_digests)
42
+ digest_algs: file_rec.storage_location.metadata_location.digests)
42
43
  file_rec.metadata_record = md_rec
43
44
  md_rec
44
45
  end
@@ -5,6 +5,7 @@ require 'longleaf/helpers/digest_helper'
5
5
  require 'longleaf/errors'
6
6
  require 'longleaf/logging'
7
7
  require 'pathname'
8
+ require "tempfile"
8
9
 
9
10
  module Longleaf
10
11
  # Service which serializes MetadataRecord objects
@@ -30,12 +31,7 @@ module Longleaf
30
31
  raise ArgumentError.new("Invalid serialization format #{format} specified")
31
32
  end
32
33
 
33
- # Fill in parent directories if they do not exist
34
- parent_dir = Pathname(file_path).parent
35
- parent_dir.mkpath unless parent_dir.exist?
36
-
37
- File.write(file_path, content)
38
- write_digests(file_path, content, digest_algs)
34
+ atomic_write(file_path, content, digest_algs)
39
35
  end
40
36
 
41
37
  # @param metadata [MetadataRecord] metadata record to transform
@@ -85,24 +81,109 @@ module Longleaf
85
81
  end
86
82
  end
87
83
 
84
+ # Safely writes the new metadata file and its digests.
85
+ # It does so by first writing the content and its digests to temp files,
86
+ # then making the temp files the current version of the file.
87
+ # Attempts to clean up new data in the case of failure.
88
+ def self.atomic_write(file_path, content, digest_algs)
89
+ # Fill in parent directories if they do not exist
90
+ parent_dir = Pathname(file_path).parent
91
+ parent_dir.mkpath unless parent_dir.exist?
92
+
93
+ file_path = file_path.path if file_path.respond_to?(:path)
94
+
95
+ # If file does not already exist, then simply write it
96
+ if !File.exist?(file_path)
97
+ File.write(file_path, content)
98
+ write_digests(file_path, content, digest_algs)
99
+ return
100
+ end
101
+
102
+ # Updating file, use safe atomic write
103
+ File.open(file_path) do |original_file|
104
+ original_file.flock(File::LOCK_EX)
105
+
106
+ base_name = File.basename(file_path)
107
+ Tempfile.open(base_name, parent_dir) do |temp_file|
108
+ begin
109
+ # Write content to temp file
110
+ temp_file.write(content)
111
+ temp_file.close
112
+
113
+ temp_path = temp_file.path
114
+
115
+ # Set permissions of new file to match old if it exists
116
+ old_stat = File.stat(file_path)
117
+ set_perms(temp_path, old_stat)
118
+
119
+ begin
120
+ digest_paths = write_digests(temp_path, content, digest_algs)
121
+
122
+ File.rename(temp_path, file_path)
123
+ rescue => e
124
+ cleanup_digests(temp_path)
125
+ raise e
126
+ end
127
+ rescue => e
128
+ temp_file.delete
129
+ raise e
130
+ end
131
+
132
+ # Cleanup all existing digest files, in case the set of algorithms has changed
133
+ cleanup_digests(file_path)
134
+ # Move new digests into place
135
+ digest_paths.each do |digest_path|
136
+ File.rename(digest_path, digest_path.sub(temp_path, file_path))
137
+ end
138
+ end
139
+ end
140
+ end
141
+
142
+ def self.set_perms(file_path, stat_info)
143
+ if stat_info
144
+ # Set correct permissions on new file
145
+ begin
146
+ File.chown(stat_info.uid, stat_info.gid, file_path)
147
+ # This operation will affect filesystem ACL's
148
+ File.chmod(stat_info.mode, file_path)
149
+ rescue Errno::EPERM, Errno::EACCES
150
+ # Changing file ownership failed, moving on.
151
+ return false
152
+ end
153
+ end
154
+ true
155
+ end
156
+
157
+ # Deletes all known digest files for the provided file path
158
+ def self.cleanup_digests(file_path)
159
+ DigestHelper::KNOWN_DIGESTS.each do |alg|
160
+ digest_path = "#{file_path}.#{alg}"
161
+ File.delete(digest_path) if File.exist?(digest_path)
162
+ end
163
+ end
164
+
88
165
  def self.write_digests(file_path, content, digests)
89
- return if digests.nil? || digests.empty?
166
+ return [] if digests.nil? || digests.empty?
167
+
168
+ digest_paths = Array.new
90
169
 
91
170
  digests.each do |alg|
92
171
  digest_class = DigestHelper::start_digest(alg)
93
172
  result = digest_class.hexdigest(content)
94
- if file_path.respond_to?(:path)
95
- digest_path = "#{file_path.path}.#{alg}"
96
- else
97
- digest_path = "#{file_path}.#{alg}"
98
- end
173
+ digest_path = "#{file_path}.#{alg}"
99
174
 
100
175
  File.write(digest_path, result)
101
176
 
102
- self.logger.debug("Generated #{alg} digest for metadata file #{file_path}: #{result}")
177
+ digest_paths.push(digest_path)
178
+
179
+ self.logger.debug("Generated #{alg} digest for metadata file #{file_path}: #{digest_path} #{result}")
103
180
  end
181
+
182
+ digest_paths
104
183
  end
105
184
 
185
+ private_class_method :cleanup_digests
106
186
  private_class_method :write_digests
187
+ private_class_method :atomic_write
107
188
  end
108
189
  end
@@ -0,0 +1,76 @@
1
+ require 'pathname'
2
+ require 'longleaf/models/md_fields'
3
+ require 'longleaf/errors'
4
+ require_relative 'configuration_validator'
5
+
6
+ module Longleaf
7
+ # Validator for file metadata
8
+ class MetadataValidator < ConfigurationValidator
9
+ MDF ||= MDFields
10
+
11
+ # @param config [Hash] hash containing the metadata of a file to validate
12
+ def initialize(config)
13
+ super(config)
14
+ end
15
+
16
+ protected
17
+ # Validates the provided metadata for a file to ensure that it is syntactically correct and field types
18
+ # are valid.
19
+ def validate
20
+ assert("Metadata must be a hash, but a #{@config.class} was provided", @config.class == Hash)
21
+ assert("Metadata must contain a '#{MDF::DATA}' key", @config.key?(MDF::DATA))
22
+ assert("Metadata must contain a '#{MDF::SERVICES}' key", @config.key?(MDF::SERVICES))
23
+
24
+ data = @config[MDF::DATA]
25
+ register_on_failure { validate_date_field(data, MDF::REGISTERED_TIMESTAMP) }
26
+ register_on_failure { validate_date_field(data, MDF::DEREGISTERED_TIMESTAMP, required: false) }
27
+ register_on_failure { validate_date_field(data, MDF::LAST_MODIFIED) }
28
+
29
+ register_on_failure { validate_positive_integer(data, MDF::FILE_SIZE) }
30
+
31
+ checksums = data[MDF::CHECKSUMS]
32
+ register_on_failure do
33
+ if !checksums.nil? && !checksums.is_a?(Hash)
34
+ fail("Field '#{MDF::CHECKSUMS}' must be a map of algorithms to digests, but was a #{checksums.class}")
35
+ end
36
+ end
37
+
38
+ # Ensure that any service timestamps present are valid dates
39
+ services = @config[MDF::SERVICES]
40
+ services.each do |service_name, service_rec|
41
+ register_on_failure { validate_date_field(service_rec, MDF::SERVICE_TIMESTAMP, required: false) }
42
+ end
43
+ end
44
+
45
+ def validate_date_field(section, field_key, required: true)
46
+ field_val = section[field_key]
47
+
48
+ if field_val
49
+ begin
50
+ Time.iso8601(section[field_key])
51
+ rescue ArgumentError
52
+ fail("Field '#{field_key}' must be a valid ISO8601 timestamp, but contained value '#{section[field_key]}'")
53
+ end
54
+ elsif required
55
+ fail("Metadata must contain a '#{field_key}' field")
56
+ end
57
+ end
58
+
59
+ def validate_positive_integer(section, field_key, required: true)
60
+ field_val = section[field_key]
61
+
62
+ if field_val
63
+ begin
64
+ val = field_val.is_a?(Integer) ? field_val : Integer(field_val, 10)
65
+ if val < 0
66
+ fail("Field '#{field_key}' must be a positive integer")
67
+ end
68
+ rescue ArgumentError => err
69
+ fail("Field '#{field_key}' must be a positive integer")
70
+ end
71
+ elsif required
72
+ fail("Metadata must contain a '#{field_key}' field")
73
+ end
74
+ end
75
+ end
76
+ end