longleaf 0.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +94 -0
  3. data/.editorconfig +13 -0
  4. data/.gitignore +4 -1
  5. data/.rubocop.yml +44 -0
  6. data/.rubocop_todo.yml +834 -0
  7. data/.yardopts +1 -0
  8. data/Gemfile +16 -1
  9. data/README.md +98 -12
  10. data/Rakefile +6 -0
  11. data/bin/setup +16 -1
  12. data/docs/aboutlongleaf.md +28 -0
  13. data/docs/extra.css +32 -0
  14. data/docs/img/change-file.png +0 -0
  15. data/docs/img/ll-example-preserved.png +0 -0
  16. data/docs/index.md +19 -0
  17. data/docs/install.md +66 -0
  18. data/docs/ll-example/config-example-relative.yml +33 -0
  19. data/docs/ll-example/files-dir/LLexample-PDF.pdf +0 -0
  20. data/docs/ll-example/files-dir/LLexample-TOCHANGE.txt +15 -0
  21. data/docs/ll-example/files-dir/LLexample-tokeep.txt +10 -0
  22. data/docs/ll-example/metadata-dir/.gitkeep +0 -0
  23. data/docs/ll-example/replica-files/.gitkeep +0 -0
  24. data/docs/ll-example/replica-metadata/.gitkeep +0 -0
  25. data/docs/quickstart.md +270 -0
  26. data/docs/rdocs/Longleaf.html +135 -0
  27. data/docs/rdocs/Longleaf/AppFields.html +178 -0
  28. data/docs/rdocs/Longleaf/ApplicationConfigDeserializer.html +631 -0
  29. data/docs/rdocs/Longleaf/ApplicationConfigManager.html +610 -0
  30. data/docs/rdocs/Longleaf/ApplicationConfigValidator.html +238 -0
  31. data/docs/rdocs/Longleaf/CLI.html +909 -0
  32. data/docs/rdocs/Longleaf/ChecksumMismatchError.html +151 -0
  33. data/docs/rdocs/Longleaf/ConfigBuilder.html +1339 -0
  34. data/docs/rdocs/Longleaf/ConfigurationError.html +143 -0
  35. data/docs/rdocs/Longleaf/ConfigurationValidator.html +227 -0
  36. data/docs/rdocs/Longleaf/DeregisterCommand.html +420 -0
  37. data/docs/rdocs/Longleaf/DeregisterEvent.html +453 -0
  38. data/docs/rdocs/Longleaf/DeregistrationError.html +151 -0
  39. data/docs/rdocs/Longleaf/DigestHelper.html +419 -0
  40. data/docs/rdocs/Longleaf/EventError.html +147 -0
  41. data/docs/rdocs/Longleaf/EventNames.html +163 -0
  42. data/docs/rdocs/Longleaf/EventStatusTracking.html +656 -0
  43. data/docs/rdocs/Longleaf/FileCheckService.html +540 -0
  44. data/docs/rdocs/Longleaf/FileHelpers.html +520 -0
  45. data/docs/rdocs/Longleaf/FileRecord.html +716 -0
  46. data/docs/rdocs/Longleaf/FileSelector.html +901 -0
  47. data/docs/rdocs/Longleaf/FixityCheckService.html +691 -0
  48. data/docs/rdocs/Longleaf/IndexManager.html +1155 -0
  49. data/docs/rdocs/Longleaf/InvalidDigestAlgorithmError.html +143 -0
  50. data/docs/rdocs/Longleaf/InvalidStoragePathError.html +143 -0
  51. data/docs/rdocs/Longleaf/Logging.html +405 -0
  52. data/docs/rdocs/Longleaf/Logging/RedirectingLogger.html +1213 -0
  53. data/docs/rdocs/Longleaf/LongleafError.html +139 -0
  54. data/docs/rdocs/Longleaf/MDFields.html +193 -0
  55. data/docs/rdocs/Longleaf/MetadataBuilder.html +787 -0
  56. data/docs/rdocs/Longleaf/MetadataDeserializer.html +537 -0
  57. data/docs/rdocs/Longleaf/MetadataError.html +143 -0
  58. data/docs/rdocs/Longleaf/MetadataPersistenceManager.html +539 -0
  59. data/docs/rdocs/Longleaf/MetadataRecord.html +1411 -0
  60. data/docs/rdocs/Longleaf/MetadataSerializer.html +786 -0
  61. data/docs/rdocs/Longleaf/PreservationServiceError.html +147 -0
  62. data/docs/rdocs/Longleaf/PreserveCommand.html +410 -0
  63. data/docs/rdocs/Longleaf/PreserveEvent.html +491 -0
  64. data/docs/rdocs/Longleaf/RegisterCommand.html +428 -0
  65. data/docs/rdocs/Longleaf/RegisterEvent.html +628 -0
  66. data/docs/rdocs/Longleaf/RegisteredFileSelector.html +446 -0
  67. data/docs/rdocs/Longleaf/RegistrationError.html +151 -0
  68. data/docs/rdocs/Longleaf/ReindexCommand.html +576 -0
  69. data/docs/rdocs/Longleaf/RsyncReplicationService.html +1180 -0
  70. data/docs/rdocs/Longleaf/SequelIndexDriver.html +1978 -0
  71. data/docs/rdocs/Longleaf/ServiceCandidateFilesystemIterator.html +572 -0
  72. data/docs/rdocs/Longleaf/ServiceCandidateIndexIterator.html +532 -0
  73. data/docs/rdocs/Longleaf/ServiceCandidateLocator.html +333 -0
  74. data/docs/rdocs/Longleaf/ServiceClassCache.html +725 -0
  75. data/docs/rdocs/Longleaf/ServiceDateHelper.html +425 -0
  76. data/docs/rdocs/Longleaf/ServiceDefinition.html +683 -0
  77. data/docs/rdocs/Longleaf/ServiceDefinitionManager.html +371 -0
  78. data/docs/rdocs/Longleaf/ServiceDefinitionValidator.html +269 -0
  79. data/docs/rdocs/Longleaf/ServiceFields.html +173 -0
  80. data/docs/rdocs/Longleaf/ServiceManager.html +1229 -0
  81. data/docs/rdocs/Longleaf/ServiceMappingManager.html +410 -0
  82. data/docs/rdocs/Longleaf/ServiceMappingValidator.html +347 -0
  83. data/docs/rdocs/Longleaf/ServiceRecord.html +821 -0
  84. data/docs/rdocs/Longleaf/StorageLocation.html +985 -0
  85. data/docs/rdocs/Longleaf/StorageLocationManager.html +729 -0
  86. data/docs/rdocs/Longleaf/StorageLocationUnavailableError.html +143 -0
  87. data/docs/rdocs/Longleaf/StorageLocationValidator.html +373 -0
  88. data/docs/rdocs/Longleaf/StoragePathValidator.html +253 -0
  89. data/docs/rdocs/Longleaf/SystemConfigBuilder.html +441 -0
  90. data/docs/rdocs/Longleaf/SystemConfigFields.html +163 -0
  91. data/docs/rdocs/Longleaf/ValidateConfigCommand.html +451 -0
  92. data/docs/rdocs/Longleaf/ValidateMetadataCommand.html +408 -0
  93. data/docs/rdocs/_index.html +660 -0
  94. data/docs/rdocs/class_list.html +51 -0
  95. data/docs/rdocs/css/common.css +1 -0
  96. data/docs/rdocs/css/full_list.css +58 -0
  97. data/docs/rdocs/css/style.css +496 -0
  98. data/docs/rdocs/file.README.html +165 -0
  99. data/docs/rdocs/file_list.html +56 -0
  100. data/docs/rdocs/frames.html +17 -0
  101. data/docs/rdocs/index.html +165 -0
  102. data/docs/rdocs/js/app.js +303 -0
  103. data/docs/rdocs/js/full_list.js +216 -0
  104. data/docs/rdocs/js/jquery.js +4 -0
  105. data/docs/rdocs/method_list.html +2051 -0
  106. data/docs/rdocs/top-level-namespace.html +110 -0
  107. data/lib/longleaf/candidates/file_selector.rb +150 -0
  108. data/lib/longleaf/candidates/manifest_digest_provider.rb +17 -0
  109. data/lib/longleaf/candidates/physical_path_provider.rb +17 -0
  110. data/lib/longleaf/candidates/registered_file_selector.rb +67 -0
  111. data/lib/longleaf/candidates/service_candidate_filesystem_iterator.rb +93 -0
  112. data/lib/longleaf/candidates/service_candidate_index_iterator.rb +84 -0
  113. data/lib/longleaf/candidates/service_candidate_locator.rb +23 -0
  114. data/lib/longleaf/candidates/single_digest_provider.rb +13 -0
  115. data/lib/longleaf/cli.rb +252 -46
  116. data/lib/longleaf/commands/deregister_command.rb +51 -0
  117. data/lib/longleaf/commands/preserve_command.rb +50 -0
  118. data/lib/longleaf/commands/register_command.rb +34 -43
  119. data/lib/longleaf/commands/reindex_command.rb +92 -0
  120. data/lib/longleaf/commands/validate_config_command.rb +33 -8
  121. data/lib/longleaf/commands/validate_metadata_command.rb +51 -0
  122. data/lib/longleaf/errors.rb +26 -7
  123. data/lib/longleaf/events/deregister_event.rb +53 -0
  124. data/lib/longleaf/events/event_names.rb +9 -0
  125. data/lib/longleaf/events/event_status_tracking.rb +59 -0
  126. data/lib/longleaf/events/preserve_event.rb +82 -0
  127. data/lib/longleaf/events/register_event.rb +59 -51
  128. data/lib/longleaf/helpers/case_insensitive_hash.rb +38 -0
  129. data/lib/longleaf/helpers/digest_helper.rb +56 -0
  130. data/lib/longleaf/helpers/s3_uri_helper.rb +86 -0
  131. data/lib/longleaf/helpers/selection_options_parser.rb +215 -0
  132. data/lib/longleaf/helpers/service_date_helper.rb +78 -0
  133. data/lib/longleaf/indexing/index_manager.rb +101 -0
  134. data/lib/longleaf/indexing/sequel_index_driver.rb +306 -0
  135. data/lib/longleaf/logging.rb +5 -4
  136. data/lib/longleaf/logging/redirecting_logger.rb +30 -25
  137. data/lib/longleaf/models/app_fields.rb +7 -2
  138. data/lib/longleaf/models/file_record.rb +31 -8
  139. data/lib/longleaf/models/filesystem_metadata_location.rb +56 -0
  140. data/lib/longleaf/models/filesystem_storage_location.rb +52 -0
  141. data/lib/longleaf/models/md_fields.rb +3 -1
  142. data/lib/longleaf/models/metadata_location.rb +47 -0
  143. data/lib/longleaf/models/metadata_record.rb +43 -16
  144. data/lib/longleaf/models/s3_storage_location.rb +138 -0
  145. data/lib/longleaf/models/service_definition.rb +7 -6
  146. data/lib/longleaf/models/service_fields.rb +7 -1
  147. data/lib/longleaf/models/service_record.rb +10 -6
  148. data/lib/longleaf/models/storage_location.rb +24 -19
  149. data/lib/longleaf/models/storage_types.rb +9 -0
  150. data/lib/longleaf/models/system_config_fields.rb +9 -0
  151. data/lib/longleaf/preservation_services/file_check_service.rb +59 -0
  152. data/lib/longleaf/preservation_services/fixity_check_service.rb +124 -0
  153. data/lib/longleaf/preservation_services/rsync_replication_service.rb +198 -0
  154. data/lib/longleaf/preservation_services/s3_replication_service.rb +131 -0
  155. data/lib/longleaf/services/application_config_deserializer.rb +81 -24
  156. data/lib/longleaf/services/application_config_manager.rb +20 -6
  157. data/lib/longleaf/services/application_config_validator.rb +19 -9
  158. data/lib/longleaf/services/configuration_validator.rb +67 -4
  159. data/lib/longleaf/services/filesystem_location_validator.rb +16 -0
  160. data/lib/longleaf/services/metadata_deserializer.rb +115 -42
  161. data/lib/longleaf/services/metadata_persistence_manager.rb +47 -0
  162. data/lib/longleaf/services/metadata_serializer.rb +156 -23
  163. data/lib/longleaf/services/metadata_validator.rb +76 -0
  164. data/lib/longleaf/services/s3_location_validator.rb +19 -0
  165. data/lib/longleaf/services/service_class_cache.rb +112 -0
  166. data/lib/longleaf/services/service_definition_manager.rb +10 -7
  167. data/lib/longleaf/services/service_definition_validator.rb +25 -18
  168. data/lib/longleaf/services/service_manager.rb +86 -11
  169. data/lib/longleaf/services/service_mapping_manager.rb +13 -12
  170. data/lib/longleaf/services/service_mapping_validator.rb +36 -26
  171. data/lib/longleaf/services/storage_location_manager.rb +76 -15
  172. data/lib/longleaf/services/storage_location_validator.rb +49 -35
  173. data/lib/longleaf/specs/config_builder.rb +47 -23
  174. data/lib/longleaf/specs/config_validator_helpers.rb +16 -0
  175. data/lib/longleaf/specs/custom_matchers.rb +9 -0
  176. data/lib/longleaf/specs/file_helpers.rb +61 -0
  177. data/lib/longleaf/specs/metadata_builder.rb +98 -0
  178. data/lib/longleaf/specs/system_config_builder.rb +27 -0
  179. data/lib/longleaf/version.rb +1 -1
  180. data/longleaf.gemspec +20 -7
  181. data/mkdocs.yml +21 -0
  182. metadata +308 -24
  183. data/.travis.yml +0 -4
  184. data/lib/longleaf/commands/abstract_command.rb +0 -37
  185. data/lib/longleaf/services/storage_path_validator.rb +0 -16
@@ -1,21 +1,22 @@
1
1
  require_relative 'service_fields'
2
2
 
3
- # Definition of a preservation service
4
3
  module Longleaf
4
+ # Definition of a configured preservation service
5
5
  class ServiceDefinition
6
6
  attr_reader :name
7
- attr_reader :work_script
7
+ attr_reader :work_script, :work_class
8
8
  attr_reader :frequency, :delay
9
9
  attr_reader :properties
10
-
11
- def initialize(name:, work_script:, frequency: nil, delay: nil, properties: Hash.new)
10
+
11
+ def initialize(name:, work_script:, work_class: nil, frequency: nil, delay: nil, properties: Hash.new)
12
12
  raise ArgumentError.new("Parameters name and work_script are required") unless name && work_script
13
-
13
+
14
14
  @properties = properties
15
15
  @name = name
16
16
  @work_script = work_script
17
+ @work_class = work_class
17
18
  @frequency = frequency
18
19
  @delay = delay
19
20
  end
20
21
  end
21
- end
22
+ end
@@ -1,10 +1,16 @@
1
1
  module Longleaf
2
+ # Constants for common configuration fields for preservation service definitions
2
3
  class ServiceFields
3
4
  WORK_SCRIPT = 'work_script'
5
+ WORK_CLASS = 'work_class'
4
6
  FREQUENCY = 'frequency'
5
7
  DELAY = 'delay'
6
-
8
+
7
9
  REPLICATE_TO = 'to'
8
10
  DIGEST_ALGORITHMS = 'algorithms'
11
+
12
+ COLLISION_PROPERTY = "replica_collision_policy"
13
+ DEFAULT_COLLISION_POLICY = "replace"
14
+ VALID_COLLISION_POLICIES = ["replace"]
9
15
  end
10
16
  end
@@ -1,27 +1,31 @@
1
- # Record for an individual service in a file's metadata record.
2
1
  module Longleaf
2
+ # Record for an individual service in a file's metadata record.
3
3
  class ServiceRecord
4
4
  attr_reader :properties
5
5
  attr_accessor :stale_replicas, :timestamp, :run_needed
6
-
6
+ attr_accessor :failure_timestamp
7
+
7
8
  # @param properties [Hash] initial properties for this service record
9
+ # @param stale_replicas [Boolean] whether there are any stale replicas from this service
10
+ # @param timestamp [String] timestamp when this service last ran or was initialized
11
+ # @param run_needed [Boolean] flag indicating that this service should be run at the next available opportunity
8
12
  def initialize(properties: Hash.new, stale_replicas: false, timestamp: nil, run_needed: false)
9
13
  raise ArgumentError.new("Service properties must be a hash") if properties.class != Hash
10
-
14
+
11
15
  @properties = properties
12
16
  @timestamp = timestamp
13
17
  @stale_replicas = stale_replicas
14
18
  @run_needed = run_needed
15
19
  end
16
-
20
+
17
21
  # @return the value of a service property identified by key
18
22
  def [](key)
19
23
  @properties[key]
20
24
  end
21
-
25
+
22
26
  # set the value of a service property identified by key
23
27
  def []=(key, value)
24
28
  @properties[key] = value
25
29
  end
26
30
  end
27
- end
31
+ end
@@ -1,19 +1,25 @@
1
- require 'longleaf/services/metadata_serializer'
1
+ require 'longleaf/models/app_fields'
2
2
 
3
3
  module Longleaf
4
+ # Representation of a configured storage location
4
5
  class StorageLocation
6
+ AF ||= Longleaf::AppFields
7
+
5
8
  attr_reader :name
6
9
  attr_reader :path
7
- attr_reader :metadata_path
8
-
9
- def initialize(name:, path:, metadata_path:)
10
- raise ArgumentError.new("Parameters name, path and metadata_path are required") unless name && path && metadata_path
11
-
12
- @path = path
10
+ attr_reader :metadata_location
11
+
12
+ # @param name [String] the name of this storage location
13
+ # @param config [Hash] hash containing the configuration options for this location
14
+ # @param md_loc [MetadataLocation] metadata location associated with this storage location
15
+ def initialize(name, config, md_loc)
16
+ raise ArgumentError.new("Config parameter is required") unless config
17
+ @path = config[AF::LOCATION_PATH]
13
18
  @name = name
14
- @metadata_path = metadata_path
19
+ raise ArgumentError.new("Parameters name, path and metadata location are required") unless @name && @path && md_loc
20
+ @metadata_location = md_loc
15
21
  end
16
-
22
+
17
23
  # Get the path for the metadata file for the given file path located in this storage location.
18
24
  # @param file_path [String] path of the file
19
25
  # @raise [ArgumentError] if the file_path is not provided or is not in this storage location.
@@ -22,16 +28,15 @@ module Longleaf
22
28
  raise ArgumentError.new("Provided file path is not contained by storage location #{@name}: #{file_path}") \
23
29
  unless file_path.start_with?(@path)
24
30
 
25
- file_path.sub(/^#{@path}/, metadata_path) + MetadataSerializer::metadata_suffix
31
+ rel_file_path = relativize(file_path)
32
+
33
+ @metadata_location.metadata_path_for(rel_file_path)
26
34
  end
27
-
28
- # Checks that the path and metadata path defined in this location are available
29
- # @raise [StorageLocationUnavailableError] if the storage location is not available
30
- def available?
31
- raise StorageLocationUnavailableError.new("Path does not exist or is not a directory: #{@path}")\
32
- unless Dir.exist?(@path)
33
- raise StorageLocationUnavailableError.new("Metadata path does not exist or is not a directory: #{@metadata_path}")\
34
- unless Dir.exist?(@metadata_path)
35
+
36
+ # @param file_path [String] path to check
37
+ # @return true if the file path is contained by the path for this location
38
+ def contains?(file_path)
39
+ file_path.start_with?(@path)
35
40
  end
36
41
  end
37
- end
42
+ end
@@ -0,0 +1,9 @@
1
+ module Longleaf
2
+ # Storage type constants
3
+ class StorageTypes
4
+ FILESYSTEM_STORAGE_TYPE = 'filesystem'
5
+ S3_STORAGE_TYPE = 's3'
6
+
7
+ DEFAULT_STORAGE_TYPE = FILESYSTEM_STORAGE_TYPE
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Longleaf
2
+ # System configuration field names
3
+ class SystemConfigFields
4
+ MD_INDEX = 'index'
5
+ MD_INDEX_ADAPTER = 'adapter'
6
+ MD_INDEX_CONNECTION = 'connection'
7
+ MD_INDEX_PAGE_SIZE = 'page_size'
8
+ end
9
+ end
@@ -0,0 +1,59 @@
1
+ require 'longleaf/events/event_names'
2
+ require 'longleaf/logging'
3
+
4
+ module Longleaf
5
+ # Preservation service which validates a file using current filesystem information compared against the
6
+ # last registered details for that file. Checks using file name, size and last modified timestamp.
7
+ class FileCheckService
8
+ include Longleaf::Logging
9
+
10
+ # Initialize a FileCheckService from the given service definition
11
+ #
12
+ # @param service_def [ServiceDefinition] the configuration for this service
13
+ # @param app_manager [ApplicationConfigManager] manager for configured storage locations
14
+ def initialize(service_def, app_manager)
15
+ @service_def = service_def
16
+ @app_manager = app_manager
17
+ end
18
+
19
+ # Perform file information check.
20
+ #
21
+ # @param file_rec [FileRecord] record representing the file to perform the service on.
22
+ # @param event [String] name of the event this service is being invoked by.
23
+ # @raise [PreservationServiceError] if the file system information does not match the stored details
24
+ def perform(file_rec, event)
25
+ file_path = file_rec.path
26
+ phys_path = file_rec.physical_path
27
+ md_rec = file_rec.metadata_record
28
+
29
+ logger.debug("Performing file information check of #{file_path}")
30
+
31
+ if !File.exist?(phys_path)
32
+ raise PreservationServiceError.new("File does not exist: #{phys_path}")
33
+ end
34
+
35
+ file_size = File.size(phys_path)
36
+ if file_size != md_rec.file_size
37
+ raise PreservationServiceError.new("File size for #{phys_path} does not match the expected value: registered = #{md_rec.file_size} bytes, actual = #{file_size} bytes")
38
+ end
39
+
40
+ last_modified = File.mtime(phys_path).utc.iso8601(3)
41
+ if last_modified != md_rec.last_modified
42
+ raise PreservationServiceError.new("Last modified timestamp for #{phys_path} does not match the expected value: registered = #{md_rec.last_modified}, actual = #{last_modified}")
43
+ end
44
+ end
45
+
46
+ # Determine if this service is applicable for the provided event, given the configured service definition
47
+ #
48
+ # @param event [String] name of the event
49
+ # @return [Boolean] returns true if this service is applicable for the provided event
50
+ def is_applicable?(event)
51
+ case event
52
+ when EventNames::PRESERVE
53
+ true
54
+ else
55
+ false
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,124 @@
1
+ require 'longleaf/events/event_names'
2
+ require 'longleaf/models/service_fields'
3
+ require 'longleaf/logging'
4
+ require 'longleaf/helpers/digest_helper'
5
+ require 'set'
6
+
7
+ module Longleaf
8
+ # Preservation service which performs one or more fixity checks on a file based on the configured list
9
+ # of digest algorithms. It currently supports 'md5', 'sha1', 'sha2', 'sha256', 'sha384', 'sha512' and 'rmd160'.
10
+ #
11
+ # If the service encounters a file which is missing any of the digest algorithms the service is configured
12
+ # to check, the outcome may be controlled with the 'absent_digest' property via the following values:
13
+ # * 'fail' - the service will raise a ChecksumMismatchError for the missing algorithm. This is the default.
14
+ # * 'ignore' - the service will skip calculating any algorithms not already present for the file.
15
+ # * 'generate' - the service will generate and store any missing digests from the set of configured algorithms.
16
+ class FixityCheckService
17
+ include Longleaf::Logging
18
+
19
+ SUPPORTED_ALGORITHMS = ['md5', 'sha1', 'sha2', 'sha256', 'sha384', 'sha512', 'rmd160']
20
+
21
+ # service configuration property indicating how to handle situations where a file does not
22
+ # have a digest for one of the expected algorithms on record.
23
+ ABSENT_DIGEST_PROPERTY = 'absent_digest'
24
+ FAIL_IF_ABSENT = 'fail'
25
+ GENERATE_IF_ABSENT = 'generate'
26
+ IGNORE_IF_ABSENT = 'ignore'
27
+ ABSENT_DIGEST_OPTIONS = [FAIL_IF_ABSENT, GENERATE_IF_ABSENT, IGNORE_IF_ABSENT]
28
+
29
+ # Initialize a FixityCheckService from the given service definition
30
+ #
31
+ # @param service_def [ServiceDefinition] the configuration for this service
32
+ # @param app_manager [ApplicationConfigManager] manager for configured storage locations
33
+ def initialize(service_def, app_manager)
34
+ @service_def = service_def
35
+ @absent_digest_behavior = @service_def.properties[ABSENT_DIGEST_PROPERTY] || FAIL_IF_ABSENT
36
+ unless ABSENT_DIGEST_OPTIONS.include?(@absent_digest_behavior)
37
+ raise ArgumentError.new("Invalid option '#{@absent_digest_behavior}' for property #{ABSENT_DIGEST_PROPERTY} in service #{service_def.name}")
38
+ end
39
+
40
+ service_algs = service_def.properties[ServiceFields::DIGEST_ALGORITHMS]
41
+ if service_algs.nil? || service_algs.empty?
42
+ raise ArgumentError.new("FixityCheckService from definition #{service_def.name} requires a list of one or more digest algorithms")
43
+ end
44
+
45
+ service_algs = [service_algs] if service_algs.is_a?(String)
46
+
47
+ # Store the list of digest algorithms to verify, using normalized algorithm names.
48
+ @digest_algs = Set.new
49
+ service_algs.each do |alg|
50
+ normalized_alg = alg.downcase.delete('-')
51
+ if SUPPORTED_ALGORITHMS.include?(normalized_alg)
52
+ @digest_algs << normalized_alg
53
+ else
54
+ raise ArgumentError.new("Unsupported checksum algorithm '#{alg}' in definition #{service_def.name}. Supported algorithms are: #{SUPPORTED_ALGORITHMS}")
55
+ end
56
+ end
57
+ end
58
+
59
+ # Perform all configured fixity checks on the provided file
60
+ #
61
+ # @param file_rec [FileRecord] record representing the file to perform the service on.
62
+ # @param event [String] name of the event this service is being invoked by.
63
+ # @raise [ChecksumMismatchError] if the checksum on record does not match the generated checksum
64
+ def perform(file_rec, event)
65
+ path = file_rec.path
66
+ phys_path = file_rec.physical_path
67
+ md_rec = file_rec.metadata_record
68
+
69
+ # Get the list of existing checksums for the file and normalize algorithm names
70
+ file_digests = Hash.new
71
+ md_rec.checksums&.each do |alg, digest|
72
+ normalized_alg = alg.downcase.delete('-')
73
+ if @digest_algs.include?(normalized_alg)
74
+ file_digests[normalized_alg] = digest
75
+ else
76
+ logger.debug("Metadata for file #{path} contains unexpected '#{alg}' digest, it will be ignored.")
77
+ end
78
+ end
79
+
80
+ @digest_algs.each do |alg|
81
+ existing_digest = file_digests[alg]
82
+
83
+ if existing_digest.nil?
84
+ if @absent_digest_behavior == FAIL_IF_ABSENT
85
+ raise ChecksumMismatchError.new("Fixity check using algorithm '#{alg}' failed for file #{path}: no existing digest of type '#{alg}' on record.")
86
+ elsif @absent_digest_behavior == IGNORE_IF_ABSENT
87
+ logger.debug("Skipping check of algorithm '#{alg}' for file #{path}: no digest on record.")
88
+ next
89
+ end
90
+ end
91
+
92
+ digest = DigestHelper::start_digest(alg)
93
+ digest.file(phys_path)
94
+ generated_digest = digest.hexdigest
95
+
96
+ # Store the missing checksum if using the 'generate' behavior
97
+ if existing_digest.nil? && @absent_digest_behavior == GENERATE_IF_ABSENT
98
+ md_rec.checksums[alg] = generated_digest
99
+ logger.info("Generated and stored digest using algorithm '#{alg}' for file #{phys_path}")
100
+ else
101
+ # Compare the new digest to the one on record
102
+ if existing_digest == generated_digest
103
+ logger.info("Fixity check using algorithm '#{alg}' succeeded for file #{phys_path}")
104
+ else
105
+ raise ChecksumMismatchError.new("Fixity check using algorithm '#{alg}' failed for file #{phys_path}: expected '#{existing_digest}', calculated '#{generated_digest}.'")
106
+ end
107
+ end
108
+ end
109
+ end
110
+
111
+ # Determine if this service is applicable for the provided event, given the configured service definition
112
+ #
113
+ # @param event [String] name of the event
114
+ # @return [Boolean] returns true if this service is applicable for the provided event
115
+ def is_applicable?(event)
116
+ case event
117
+ when EventNames::PRESERVE
118
+ true
119
+ else
120
+ false
121
+ end
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,198 @@
1
+ require 'longleaf/events/event_names'
2
+ require 'longleaf/logging'
3
+ require 'longleaf/errors'
4
+ require 'longleaf/models/file_record'
5
+ require 'longleaf/models/service_fields'
6
+ require 'longleaf/events/register_event'
7
+ require 'longleaf/candidates/single_digest_provider'
8
+ require 'open3'
9
+
10
+ module Longleaf
11
+ # Preservation service which performs replication of a file to one or more destinations using rsync.
12
+ #
13
+ # The service definition must contain one or more destinations, specified with the "to" property.
14
+ # These destinations must be either a known storage location name, a remote path, or absolute path.
15
+ #
16
+ # Optional service configuration properties:
17
+ # * replica_collision_policy = specifies the desired outcome if the service attempts to replicate
18
+ # a file which already exists at a destination. Default: "replace".
19
+ # * rsync_command = the command to invoke in order to execute rsync. Default: "rsync"
20
+ # * rsync_options = additional parameters that will be passed along to rsync. Cannot include options
21
+ # which change the target of the command or prevent its execution, such as "files-from", "dry-run",
22
+ # "help", etc. Command will always include "-R". Default "-a".
23
+ class RsyncReplicationService
24
+ include Longleaf::Logging
25
+ SF ||= Longleaf::ServiceFields
26
+
27
+ RSYNC_COMMAND_PROPERTY = "rsync_command"
28
+ DEFAULT_COMMAND = "rsync"
29
+
30
+ RSYNC_OPTIONS_PROPERTY = "rsync_options"
31
+ DEFAULT_OPTIONS = "-a"
32
+ DISALLOWED_OPTIONS = ["files-from", "n", "dry-run", "exclude", "exclude-from", "cvs-exclude",
33
+ "h", "help", "f", "F", "filter"]
34
+
35
+ attr_reader :command, :options, :collision_policy
36
+
37
+ # Initialize a RsyncReplicationService from the given service definition
38
+ #
39
+ # @param service_def [ServiceDefinition] the configuration for this service
40
+ # @param app_manager [ApplicationConfigManager] the application configuration
41
+ def initialize(service_def, app_manager)
42
+ @service_def = service_def
43
+ @app_manager = app_manager
44
+
45
+ @command = @service_def.properties[RSYNC_COMMAND_PROPERTY] || DEFAULT_COMMAND
46
+
47
+ # Validate rsync parameters
48
+ @options = @service_def.properties[RSYNC_OPTIONS_PROPERTY] || DEFAULT_OPTIONS
49
+ if contains_disallowed_option?(@options)
50
+ raise ArgumentError.new("Service #{service_def.name} specifies a disallowed rsync paramter," \
51
+ + " rsync_options may not include the following: #{DISALLOWED_OPTIONS.join(' ')}")
52
+ end
53
+
54
+ # Set and validate the replica collision policy
55
+ @collision_policy = @service_def.properties[SF::COLLISION_PROPERTY] || SF::DEFAULT_COLLISION_POLICY
56
+ if !SF::VALID_COLLISION_POLICIES.include?(@collision_policy)
57
+ raise ArgumentError.new("Service #{service_def.name} received invalid #{SF::COLLISION_PROPERTY}" \
58
+ + " value #{@collision_policy}")
59
+ end
60
+
61
+ # Store and validate destinations
62
+ replicate_to = @service_def.properties[SF::REPLICATE_TO]
63
+ if replicate_to.nil? || replicate_to.empty?
64
+ raise ArgumentError.new("Service #{service_def.name} must provide one or more replication destinations.")
65
+ end
66
+ replicate_to = [replicate_to] if replicate_to.is_a?(String)
67
+
68
+ loc_manager = app_manager.location_manager
69
+ # Build list of destinations, translating to storage locations when relevant
70
+ @destinations = Array.new
71
+ replicate_to.each do |dest|
72
+ # Assume that if destination contains a : or / it is a path rather than storage location
73
+ if dest =~ /[:\/]/
74
+ @destinations << dest
75
+ else
76
+ if loc_manager.locations.key?(dest)
77
+ @destinations << loc_manager.locations[dest]
78
+ else
79
+ raise ArgumentError.new("Service #{service_def.name} specifies unknown storage location '#{dest}'" \
80
+ + " as a replication destination")
81
+ end
82
+ end
83
+ end
84
+ end
85
+
86
+ # During a replication event, perform replication of the specified file to all configured destinations
87
+ # as necessary.
88
+ #
89
+ # @param file_rec [FileRecord] record representing the file to perform the service on.
90
+ # @param event [String] name of the event this service is being invoked by.
91
+ # @raise [PreservationServiceError] if the rsync replication fails
92
+ def perform(file_rec, event)
93
+ @destinations.each do |destination|
94
+ dest_is_storage_loc = destination.is_a?(Longleaf::StorageLocation)
95
+
96
+ if dest_is_storage_loc
97
+ dest_path = destination.path
98
+ else
99
+ dest_path = destination
100
+ end
101
+
102
+ logical_physical_same = file_rec.path == file_rec.physical_path
103
+ # Determine the path to the file being replicated relative to its storage location
104
+ rel_path = file_rec.storage_location.relativize(file_rec.path)
105
+
106
+ options = @options
107
+ if logical_physical_same
108
+ options = options + " -R"
109
+ # Insert "./" into the source path so that rsync (run with -R) only creates destination directories starting from that point
110
+ source_path = File.join(file_rec.storage_location.path, "./#{rel_path}")
111
+ else
112
+ options = options + " --no-relative"
113
+ source_path = file_rec.physical_path
114
+ dest_path = File.join(dest_path, rel_path)
115
+ if (dest_is_storage_loc && destination.is_a?(Longleaf::FilesystemStorageLocation)) || !dest_is_storage_loc
116
+ # Fill in missing parent directories, as rsync cannot do so when specifying a different source and dest filename
117
+ dirname = File.dirname(dest_path)
118
+ logger.debug("Creating parent dirs #{dirname} for #{file_rec.path}")
119
+ FileUtils.mkdir_p(dirname)
120
+ else
121
+ raise PreservationServiceError.new(
122
+ "Destination #{destination.name} does not currently support separate physical and logical paths")
123
+ end
124
+ end
125
+
126
+ # Check that the destination is available before attempting to write
127
+ verify_destination_available(destination, file_rec)
128
+
129
+ logger.debug("Invoking rsync with command: #{@command} \"#{source_path}\" \"#{dest_path}\" #{options}")
130
+ stdout, stderr, status = Open3.capture3("#{@command} \"#{source_path}\" \"#{dest_path}\" #{options}")
131
+ raise PreservationServiceError.new("Failed to replicate #{file_rec.path} to #{dest_path}: #{stderr}") \
132
+ unless status.success?
133
+
134
+ logger.info("Replicated #{file_rec.path} to destination #{dest_path}")
135
+
136
+ # For destinations which are storage locations, register the replica with longleaf
137
+ if dest_is_storage_loc
138
+ register_replica(destination, rel_path, file_rec)
139
+ end
140
+ end
141
+ end
142
+
143
+ # Determine if this service is applicable for the provided event, given the configured service definition
144
+ #
145
+ # @param event [String] name of the event
146
+ # @return [Boolean] returns true if this service is applicable for the provided event
147
+ def is_applicable?(event)
148
+ case event
149
+ when EventNames::PRESERVE
150
+ true
151
+ else
152
+ false
153
+ end
154
+ end
155
+
156
+ private
157
+ def contains_disallowed_option?(options)
158
+ DISALLOWED_OPTIONS.each do |disallowed|
159
+ if disallowed.length == 1
160
+ if options =~ /(\A| )-[a-zA-Z0-9]*#{disallowed}[a-zA-Z0-9]*( |=|\z)/
161
+ return true
162
+ end
163
+ else
164
+ if options =~ /(\A| )--#{disallowed}( |=|\z)/
165
+ return true
166
+ end
167
+ end
168
+ end
169
+
170
+ false
171
+ end
172
+
173
+ def verify_destination_available(destination, file_rec)
174
+ if destination.is_a?(Longleaf::StorageLocation)
175
+ begin
176
+ destination.available?
177
+ rescue StorageLocationUnavailableError => e
178
+ raise StorageLocationUnavailableError.new("Cannot replicate #{file_rec.path} to destination #{destination.name}: " \
179
+ + e.message)
180
+ end
181
+ elsif destination.start_with?("/")
182
+ raise StorageLocationUnavailableError.new("Cannot replicate #{file_rec.path} to destination" \
183
+ + " #{destination}, path does not exist.") unless Dir.exist?(destination)
184
+ end
185
+ end
186
+
187
+ def register_replica(destination, rel_path, file_rec)
188
+ dest_file_path = File.join(destination.path, rel_path)
189
+ dest_file_rec = FileRecord.new(dest_file_path, destination)
190
+
191
+ register_event = RegisterEvent.new(file_rec: dest_file_rec,
192
+ app_manager: @app_manager,
193
+ force: true,
194
+ digest_provider: SingleDigestProvider.new(file_rec.metadata_record.checksums))
195
+ register_event.perform
196
+ end
197
+ end
198
+ end