longleaf 0.1.0.pre.3 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +94 -0
  3. data/.editorconfig +13 -0
  4. data/.gitignore +4 -1
  5. data/.rubocop.yml +44 -0
  6. data/.rubocop_todo.yml +834 -0
  7. data/.yardopts +1 -0
  8. data/Gemfile +16 -1
  9. data/README.md +98 -12
  10. data/Rakefile +6 -0
  11. data/bin/setup +16 -1
  12. data/docs/aboutlongleaf.md +28 -0
  13. data/docs/extra.css +32 -0
  14. data/docs/img/change-file.png +0 -0
  15. data/docs/img/ll-example-preserved.png +0 -0
  16. data/docs/index.md +19 -0
  17. data/docs/install.md +66 -0
  18. data/docs/ll-example/config-example-relative.yml +33 -0
  19. data/docs/ll-example/files-dir/LLexample-PDF.pdf +0 -0
  20. data/docs/ll-example/files-dir/LLexample-TOCHANGE.txt +15 -0
  21. data/docs/ll-example/files-dir/LLexample-tokeep.txt +10 -0
  22. data/docs/ll-example/metadata-dir/.gitkeep +0 -0
  23. data/docs/ll-example/replica-files/.gitkeep +0 -0
  24. data/docs/ll-example/replica-metadata/.gitkeep +0 -0
  25. data/docs/quickstart.md +270 -0
  26. data/docs/rdocs/Longleaf.html +135 -0
  27. data/docs/rdocs/Longleaf/AppFields.html +178 -0
  28. data/docs/rdocs/Longleaf/ApplicationConfigDeserializer.html +631 -0
  29. data/docs/rdocs/Longleaf/ApplicationConfigManager.html +610 -0
  30. data/docs/rdocs/Longleaf/ApplicationConfigValidator.html +238 -0
  31. data/docs/rdocs/Longleaf/CLI.html +909 -0
  32. data/docs/rdocs/Longleaf/ChecksumMismatchError.html +151 -0
  33. data/docs/rdocs/Longleaf/ConfigBuilder.html +1339 -0
  34. data/docs/rdocs/Longleaf/ConfigurationError.html +143 -0
  35. data/docs/rdocs/Longleaf/ConfigurationValidator.html +227 -0
  36. data/docs/rdocs/Longleaf/DeregisterCommand.html +420 -0
  37. data/docs/rdocs/Longleaf/DeregisterEvent.html +453 -0
  38. data/docs/rdocs/Longleaf/DeregistrationError.html +151 -0
  39. data/docs/rdocs/Longleaf/DigestHelper.html +419 -0
  40. data/docs/rdocs/Longleaf/EventError.html +147 -0
  41. data/docs/rdocs/Longleaf/EventNames.html +163 -0
  42. data/docs/rdocs/Longleaf/EventStatusTracking.html +656 -0
  43. data/docs/rdocs/Longleaf/FileCheckService.html +540 -0
  44. data/docs/rdocs/Longleaf/FileHelpers.html +520 -0
  45. data/docs/rdocs/Longleaf/FileRecord.html +716 -0
  46. data/docs/rdocs/Longleaf/FileSelector.html +901 -0
  47. data/docs/rdocs/Longleaf/FixityCheckService.html +691 -0
  48. data/docs/rdocs/Longleaf/IndexManager.html +1155 -0
  49. data/docs/rdocs/Longleaf/InvalidDigestAlgorithmError.html +143 -0
  50. data/docs/rdocs/Longleaf/InvalidStoragePathError.html +143 -0
  51. data/docs/rdocs/Longleaf/Logging.html +405 -0
  52. data/docs/rdocs/Longleaf/Logging/RedirectingLogger.html +1213 -0
  53. data/docs/rdocs/Longleaf/LongleafError.html +139 -0
  54. data/docs/rdocs/Longleaf/MDFields.html +193 -0
  55. data/docs/rdocs/Longleaf/MetadataBuilder.html +787 -0
  56. data/docs/rdocs/Longleaf/MetadataDeserializer.html +537 -0
  57. data/docs/rdocs/Longleaf/MetadataError.html +143 -0
  58. data/docs/rdocs/Longleaf/MetadataPersistenceManager.html +539 -0
  59. data/docs/rdocs/Longleaf/MetadataRecord.html +1411 -0
  60. data/docs/rdocs/Longleaf/MetadataSerializer.html +786 -0
  61. data/docs/rdocs/Longleaf/PreservationServiceError.html +147 -0
  62. data/docs/rdocs/Longleaf/PreserveCommand.html +410 -0
  63. data/docs/rdocs/Longleaf/PreserveEvent.html +491 -0
  64. data/docs/rdocs/Longleaf/RegisterCommand.html +428 -0
  65. data/docs/rdocs/Longleaf/RegisterEvent.html +628 -0
  66. data/docs/rdocs/Longleaf/RegisteredFileSelector.html +446 -0
  67. data/docs/rdocs/Longleaf/RegistrationError.html +151 -0
  68. data/docs/rdocs/Longleaf/ReindexCommand.html +576 -0
  69. data/docs/rdocs/Longleaf/RsyncReplicationService.html +1180 -0
  70. data/docs/rdocs/Longleaf/SequelIndexDriver.html +1978 -0
  71. data/docs/rdocs/Longleaf/ServiceCandidateFilesystemIterator.html +572 -0
  72. data/docs/rdocs/Longleaf/ServiceCandidateIndexIterator.html +532 -0
  73. data/docs/rdocs/Longleaf/ServiceCandidateLocator.html +333 -0
  74. data/docs/rdocs/Longleaf/ServiceClassCache.html +725 -0
  75. data/docs/rdocs/Longleaf/ServiceDateHelper.html +425 -0
  76. data/docs/rdocs/Longleaf/ServiceDefinition.html +683 -0
  77. data/docs/rdocs/Longleaf/ServiceDefinitionManager.html +371 -0
  78. data/docs/rdocs/Longleaf/ServiceDefinitionValidator.html +269 -0
  79. data/docs/rdocs/Longleaf/ServiceFields.html +173 -0
  80. data/docs/rdocs/Longleaf/ServiceManager.html +1229 -0
  81. data/docs/rdocs/Longleaf/ServiceMappingManager.html +410 -0
  82. data/docs/rdocs/Longleaf/ServiceMappingValidator.html +347 -0
  83. data/docs/rdocs/Longleaf/ServiceRecord.html +821 -0
  84. data/docs/rdocs/Longleaf/StorageLocation.html +985 -0
  85. data/docs/rdocs/Longleaf/StorageLocationManager.html +729 -0
  86. data/docs/rdocs/Longleaf/StorageLocationUnavailableError.html +143 -0
  87. data/docs/rdocs/Longleaf/StorageLocationValidator.html +373 -0
  88. data/docs/rdocs/Longleaf/StoragePathValidator.html +253 -0
  89. data/docs/rdocs/Longleaf/SystemConfigBuilder.html +441 -0
  90. data/docs/rdocs/Longleaf/SystemConfigFields.html +163 -0
  91. data/docs/rdocs/Longleaf/ValidateConfigCommand.html +451 -0
  92. data/docs/rdocs/Longleaf/ValidateMetadataCommand.html +408 -0
  93. data/docs/rdocs/_index.html +660 -0
  94. data/docs/rdocs/class_list.html +51 -0
  95. data/docs/rdocs/css/common.css +1 -0
  96. data/docs/rdocs/css/full_list.css +58 -0
  97. data/docs/rdocs/css/style.css +496 -0
  98. data/docs/rdocs/file.README.html +165 -0
  99. data/docs/rdocs/file_list.html +56 -0
  100. data/docs/rdocs/frames.html +17 -0
  101. data/docs/rdocs/index.html +165 -0
  102. data/docs/rdocs/js/app.js +303 -0
  103. data/docs/rdocs/js/full_list.js +216 -0
  104. data/docs/rdocs/js/jquery.js +4 -0
  105. data/docs/rdocs/method_list.html +2051 -0
  106. data/docs/rdocs/top-level-namespace.html +110 -0
  107. data/lib/longleaf/candidates/file_selector.rb +150 -0
  108. data/lib/longleaf/candidates/manifest_digest_provider.rb +17 -0
  109. data/lib/longleaf/candidates/physical_path_provider.rb +17 -0
  110. data/lib/longleaf/candidates/registered_file_selector.rb +67 -0
  111. data/lib/longleaf/candidates/service_candidate_filesystem_iterator.rb +93 -0
  112. data/lib/longleaf/candidates/service_candidate_index_iterator.rb +84 -0
  113. data/lib/longleaf/candidates/service_candidate_locator.rb +23 -0
  114. data/lib/longleaf/candidates/single_digest_provider.rb +13 -0
  115. data/lib/longleaf/cli.rb +249 -44
  116. data/lib/longleaf/commands/deregister_command.rb +51 -0
  117. data/lib/longleaf/commands/preserve_command.rb +50 -0
  118. data/lib/longleaf/commands/register_command.rb +34 -43
  119. data/lib/longleaf/commands/reindex_command.rb +92 -0
  120. data/lib/longleaf/commands/validate_config_command.rb +33 -8
  121. data/lib/longleaf/commands/validate_metadata_command.rb +51 -0
  122. data/lib/longleaf/errors.rb +26 -7
  123. data/lib/longleaf/events/deregister_event.rb +53 -0
  124. data/lib/longleaf/events/event_names.rb +9 -0
  125. data/lib/longleaf/events/event_status_tracking.rb +59 -0
  126. data/lib/longleaf/events/preserve_event.rb +82 -0
  127. data/lib/longleaf/events/register_event.rb +59 -51
  128. data/lib/longleaf/helpers/case_insensitive_hash.rb +38 -0
  129. data/lib/longleaf/helpers/digest_helper.rb +56 -0
  130. data/lib/longleaf/helpers/s3_uri_helper.rb +86 -0
  131. data/lib/longleaf/helpers/selection_options_parser.rb +215 -0
  132. data/lib/longleaf/helpers/service_date_helper.rb +78 -0
  133. data/lib/longleaf/indexing/index_manager.rb +101 -0
  134. data/lib/longleaf/indexing/sequel_index_driver.rb +306 -0
  135. data/lib/longleaf/logging.rb +5 -4
  136. data/lib/longleaf/logging/redirecting_logger.rb +30 -25
  137. data/lib/longleaf/models/app_fields.rb +7 -2
  138. data/lib/longleaf/models/file_record.rb +31 -8
  139. data/lib/longleaf/models/filesystem_metadata_location.rb +56 -0
  140. data/lib/longleaf/models/filesystem_storage_location.rb +52 -0
  141. data/lib/longleaf/models/md_fields.rb +3 -1
  142. data/lib/longleaf/models/metadata_location.rb +47 -0
  143. data/lib/longleaf/models/metadata_record.rb +43 -16
  144. data/lib/longleaf/models/s3_storage_location.rb +138 -0
  145. data/lib/longleaf/models/service_definition.rb +7 -6
  146. data/lib/longleaf/models/service_fields.rb +7 -1
  147. data/lib/longleaf/models/service_record.rb +10 -6
  148. data/lib/longleaf/models/storage_location.rb +24 -21
  149. data/lib/longleaf/models/storage_types.rb +9 -0
  150. data/lib/longleaf/models/system_config_fields.rb +9 -0
  151. data/lib/longleaf/preservation_services/file_check_service.rb +59 -0
  152. data/lib/longleaf/preservation_services/fixity_check_service.rb +124 -0
  153. data/lib/longleaf/preservation_services/rsync_replication_service.rb +198 -0
  154. data/lib/longleaf/preservation_services/s3_replication_service.rb +131 -0
  155. data/lib/longleaf/services/application_config_deserializer.rb +80 -21
  156. data/lib/longleaf/services/application_config_manager.rb +20 -6
  157. data/lib/longleaf/services/application_config_validator.rb +19 -9
  158. data/lib/longleaf/services/configuration_validator.rb +67 -4
  159. data/lib/longleaf/services/filesystem_location_validator.rb +16 -0
  160. data/lib/longleaf/services/metadata_deserializer.rb +115 -42
  161. data/lib/longleaf/services/metadata_persistence_manager.rb +47 -0
  162. data/lib/longleaf/services/metadata_serializer.rb +139 -25
  163. data/lib/longleaf/services/metadata_validator.rb +76 -0
  164. data/lib/longleaf/services/s3_location_validator.rb +19 -0
  165. data/lib/longleaf/services/service_class_cache.rb +112 -0
  166. data/lib/longleaf/services/service_definition_manager.rb +10 -7
  167. data/lib/longleaf/services/service_definition_validator.rb +25 -18
  168. data/lib/longleaf/services/service_manager.rb +86 -11
  169. data/lib/longleaf/services/service_mapping_manager.rb +13 -12
  170. data/lib/longleaf/services/service_mapping_validator.rb +36 -26
  171. data/lib/longleaf/services/storage_location_manager.rb +76 -15
  172. data/lib/longleaf/services/storage_location_validator.rb +49 -35
  173. data/lib/longleaf/specs/config_builder.rb +47 -23
  174. data/lib/longleaf/specs/config_validator_helpers.rb +16 -0
  175. data/lib/longleaf/specs/custom_matchers.rb +9 -0
  176. data/lib/longleaf/specs/file_helpers.rb +61 -0
  177. data/lib/longleaf/specs/metadata_builder.rb +98 -0
  178. data/lib/longleaf/specs/system_config_builder.rb +27 -0
  179. data/lib/longleaf/version.rb +1 -1
  180. data/longleaf.gemspec +20 -7
  181. data/mkdocs.yml +21 -0
  182. metadata +310 -26
  183. data/.travis.yml +0 -4
  184. data/lib/longleaf/commands/abstract_command.rb +0 -37
  185. data/lib/longleaf/services/storage_path_validator.rb +0 -16
@@ -1,21 +1,22 @@
1
1
  require_relative 'service_fields'
2
2
 
3
- # Definition of a preservation service
4
3
  module Longleaf
4
+ # Definition of a configured preservation service
5
5
  class ServiceDefinition
6
6
  attr_reader :name
7
- attr_reader :work_script
7
+ attr_reader :work_script, :work_class
8
8
  attr_reader :frequency, :delay
9
9
  attr_reader :properties
10
-
11
- def initialize(name:, work_script:, frequency: nil, delay: nil, properties: Hash.new)
10
+
11
+ def initialize(name:, work_script:, work_class: nil, frequency: nil, delay: nil, properties: Hash.new)
12
12
  raise ArgumentError.new("Parameters name and work_script are required") unless name && work_script
13
-
13
+
14
14
  @properties = properties
15
15
  @name = name
16
16
  @work_script = work_script
17
+ @work_class = work_class
17
18
  @frequency = frequency
18
19
  @delay = delay
19
20
  end
20
21
  end
21
- end
22
+ end
@@ -1,10 +1,16 @@
1
1
  module Longleaf
2
+ # Constants for common configuration fields for preservation service definitions
2
3
  class ServiceFields
3
4
  WORK_SCRIPT = 'work_script'
5
+ WORK_CLASS = 'work_class'
4
6
  FREQUENCY = 'frequency'
5
7
  DELAY = 'delay'
6
-
8
+
7
9
  REPLICATE_TO = 'to'
8
10
  DIGEST_ALGORITHMS = 'algorithms'
11
+
12
+ COLLISION_PROPERTY = "replica_collision_policy"
13
+ DEFAULT_COLLISION_POLICY = "replace"
14
+ VALID_COLLISION_POLICIES = ["replace"]
9
15
  end
10
16
  end
@@ -1,27 +1,31 @@
1
- # Record for an individual service in a file's metadata record.
2
1
  module Longleaf
2
+ # Record for an individual service in a file's metadata record.
3
3
  class ServiceRecord
4
4
  attr_reader :properties
5
5
  attr_accessor :stale_replicas, :timestamp, :run_needed
6
-
6
+ attr_accessor :failure_timestamp
7
+
7
8
  # @param properties [Hash] initial properties for this service record
9
+ # @param stale_replicas [Boolean] whether there are any stale replicas from this service
10
+ # @param timestamp [String] timestamp when this service last ran or was initialized
11
+ # @param run_needed [Boolean] flag indicating that this service should be run at the next available opportunity
8
12
  def initialize(properties: Hash.new, stale_replicas: false, timestamp: nil, run_needed: false)
9
13
  raise ArgumentError.new("Service properties must be a hash") if properties.class != Hash
10
-
14
+
11
15
  @properties = properties
12
16
  @timestamp = timestamp
13
17
  @stale_replicas = stale_replicas
14
18
  @run_needed = run_needed
15
19
  end
16
-
20
+
17
21
  # @return the value of a service property identified by key
18
22
  def [](key)
19
23
  @properties[key]
20
24
  end
21
-
25
+
22
26
  # set the value of a service property identified by key
23
27
  def []=(key, value)
24
28
  @properties[key] = value
25
29
  end
26
30
  end
27
- end
31
+ end
@@ -1,21 +1,25 @@
1
- require 'longleaf/services/metadata_serializer'
1
+ require 'longleaf/models/app_fields'
2
2
 
3
3
  module Longleaf
4
+ # Representation of a configured storage location
4
5
  class StorageLocation
6
+ AF ||= Longleaf::AppFields
7
+
5
8
  attr_reader :name
6
9
  attr_reader :path
7
- attr_reader :metadata_path
8
-
9
- def initialize(name:, path:, metadata_path:)
10
- raise ArgumentError.new("Parameters name, path and metadata_path are required") unless name && path && metadata_path
11
-
12
- @path = path
13
- @path += '/' unless @path.end_with?('/')
10
+ attr_reader :metadata_location
11
+
12
+ # @param name [String] the name of this storage location
13
+ # @param config [Hash] hash containing the configuration options for this location
14
+ # @param md_loc [MetadataLocation] metadata location associated with this storage location
15
+ def initialize(name, config, md_loc)
16
+ raise ArgumentError.new("Config parameter is required") unless config
17
+ @path = config[AF::LOCATION_PATH]
14
18
  @name = name
15
- @metadata_path = metadata_path
16
- @metadata_path += '/' unless @metadata_path.end_with?('/')
19
+ raise ArgumentError.new("Parameters name, path and metadata location are required") unless @name && @path && md_loc
20
+ @metadata_location = md_loc
17
21
  end
18
-
22
+
19
23
  # Get the path for the metadata file for the given file path located in this storage location.
20
24
  # @param file_path [String] path of the file
21
25
  # @raise [ArgumentError] if the file_path is not provided or is not in this storage location.
@@ -24,16 +28,15 @@ module Longleaf
24
28
  raise ArgumentError.new("Provided file path is not contained by storage location #{@name}: #{file_path}") \
25
29
  unless file_path.start_with?(@path)
26
30
 
27
- file_path.sub(/^#{@path}/, metadata_path) + MetadataSerializer::metadata_suffix
31
+ rel_file_path = relativize(file_path)
32
+
33
+ @metadata_location.metadata_path_for(rel_file_path)
28
34
  end
29
-
30
- # Checks that the path and metadata path defined in this location are available
31
- # @raise [StorageLocationUnavailableError] if the storage location is not available
32
- def available?
33
- raise StorageLocationUnavailableError.new("Path does not exist or is not a directory: #{@path}")\
34
- unless Dir.exist?(@path)
35
- raise StorageLocationUnavailableError.new("Metadata path does not exist or is not a directory: #{@metadata_path}")\
36
- unless Dir.exist?(@metadata_path)
35
+
36
+ # @param file_path [String] path to check
37
+ # @return true if the file path is contained by the path for this location
38
+ def contains?(file_path)
39
+ file_path.start_with?(@path)
37
40
  end
38
41
  end
39
- end
42
+ end
@@ -0,0 +1,9 @@
1
+ module Longleaf
2
+ # Storage type constants
3
+ class StorageTypes
4
+ FILESYSTEM_STORAGE_TYPE = 'filesystem'
5
+ S3_STORAGE_TYPE = 's3'
6
+
7
+ DEFAULT_STORAGE_TYPE = FILESYSTEM_STORAGE_TYPE
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Longleaf
2
+ # System configuration field names
3
+ class SystemConfigFields
4
+ MD_INDEX = 'index'
5
+ MD_INDEX_ADAPTER = 'adapter'
6
+ MD_INDEX_CONNECTION = 'connection'
7
+ MD_INDEX_PAGE_SIZE = 'page_size'
8
+ end
9
+ end
@@ -0,0 +1,59 @@
1
+ require 'longleaf/events/event_names'
2
+ require 'longleaf/logging'
3
+
4
+ module Longleaf
5
+ # Preservation service which validates a file using current filesystem information compared against the
6
+ # last registered details for that file. Checks using file name, size and last modified timestamp.
7
+ class FileCheckService
8
+ include Longleaf::Logging
9
+
10
+ # Initialize a FileCheckService from the given service definition
11
+ #
12
+ # @param service_def [ServiceDefinition] the configuration for this service
13
+ # @param app_manager [ApplicationConfigManager] manager for configured storage locations
14
+ def initialize(service_def, app_manager)
15
+ @service_def = service_def
16
+ @app_manager = app_manager
17
+ end
18
+
19
+ # Perform file information check.
20
+ #
21
+ # @param file_rec [FileRecord] record representing the file to perform the service on.
22
+ # @param event [String] name of the event this service is being invoked by.
23
+ # @raise [PreservationServiceError] if the file system information does not match the stored details
24
+ def perform(file_rec, event)
25
+ file_path = file_rec.path
26
+ phys_path = file_rec.physical_path
27
+ md_rec = file_rec.metadata_record
28
+
29
+ logger.debug("Performing file information check of #{file_path}")
30
+
31
+ if !File.exist?(phys_path)
32
+ raise PreservationServiceError.new("File does not exist: #{phys_path}")
33
+ end
34
+
35
+ file_size = File.size(phys_path)
36
+ if file_size != md_rec.file_size
37
+ raise PreservationServiceError.new("File size for #{phys_path} does not match the expected value: registered = #{md_rec.file_size} bytes, actual = #{file_size} bytes")
38
+ end
39
+
40
+ last_modified = File.mtime(phys_path).utc.iso8601(3)
41
+ if last_modified != md_rec.last_modified
42
+ raise PreservationServiceError.new("Last modified timestamp for #{phys_path} does not match the expected value: registered = #{md_rec.last_modified}, actual = #{last_modified}")
43
+ end
44
+ end
45
+
46
+ # Determine if this service is applicable for the provided event, given the configured service definition
47
+ #
48
+ # @param event [String] name of the event
49
+ # @return [Boolean] returns true if this service is applicable for the provided event
50
+ def is_applicable?(event)
51
+ case event
52
+ when EventNames::PRESERVE
53
+ true
54
+ else
55
+ false
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,124 @@
1
+ require 'longleaf/events/event_names'
2
+ require 'longleaf/models/service_fields'
3
+ require 'longleaf/logging'
4
+ require 'longleaf/helpers/digest_helper'
5
+ require 'set'
6
+
7
+ module Longleaf
8
+ # Preservation service which performs one or more fixity checks on a file based on the configured list
9
+ # of digest algorithms. It currently supports 'md5', 'sha1', 'sha2', 'sha256', 'sha384', 'sha512' and 'rmd160'.
10
+ #
11
+ # If the service encounters a file which is missing any of the digest algorithms the service is configured
12
+ # to check, the outcome may be controlled with the 'absent_digest' property via the following values:
13
+ # * 'fail' - the service will raise a ChecksumMismatchError for the missing algorithm. This is the default.
14
+ # * 'ignore' - the service will skip calculating any algorithms not already present for the file.
15
+ # * 'generate' - the service will generate and store any missing digests from the set of configured algorithms.
16
+ class FixityCheckService
17
+ include Longleaf::Logging
18
+
19
+ SUPPORTED_ALGORITHMS = ['md5', 'sha1', 'sha2', 'sha256', 'sha384', 'sha512', 'rmd160']
20
+
21
+ # service configuration property indicating how to handle situations where a file does not
22
+ # have a digest for one of the expected algorithms on record.
23
+ ABSENT_DIGEST_PROPERTY = 'absent_digest'
24
+ FAIL_IF_ABSENT = 'fail'
25
+ GENERATE_IF_ABSENT = 'generate'
26
+ IGNORE_IF_ABSENT = 'ignore'
27
+ ABSENT_DIGEST_OPTIONS = [FAIL_IF_ABSENT, GENERATE_IF_ABSENT, IGNORE_IF_ABSENT]
28
+
29
+ # Initialize a FixityCheckService from the given service definition
30
+ #
31
+ # @param service_def [ServiceDefinition] the configuration for this service
32
+ # @param app_manager [ApplicationConfigManager] manager for configured storage locations
33
+ def initialize(service_def, app_manager)
34
+ @service_def = service_def
35
+ @absent_digest_behavior = @service_def.properties[ABSENT_DIGEST_PROPERTY] || FAIL_IF_ABSENT
36
+ unless ABSENT_DIGEST_OPTIONS.include?(@absent_digest_behavior)
37
+ raise ArgumentError.new("Invalid option '#{@absent_digest_behavior}' for property #{ABSENT_DIGEST_PROPERTY} in service #{service_def.name}")
38
+ end
39
+
40
+ service_algs = service_def.properties[ServiceFields::DIGEST_ALGORITHMS]
41
+ if service_algs.nil? || service_algs.empty?
42
+ raise ArgumentError.new("FixityCheckService from definition #{service_def.name} requires a list of one or more digest algorithms")
43
+ end
44
+
45
+ service_algs = [service_algs] if service_algs.is_a?(String)
46
+
47
+ # Store the list of digest algorithms to verify, using normalized algorithm names.
48
+ @digest_algs = Set.new
49
+ service_algs.each do |alg|
50
+ normalized_alg = alg.downcase.delete('-')
51
+ if SUPPORTED_ALGORITHMS.include?(normalized_alg)
52
+ @digest_algs << normalized_alg
53
+ else
54
+ raise ArgumentError.new("Unsupported checksum algorithm '#{alg}' in definition #{service_def.name}. Supported algorithms are: #{SUPPORTED_ALGORITHMS}")
55
+ end
56
+ end
57
+ end
58
+
59
+ # Perform all configured fixity checks on the provided file
60
+ #
61
+ # @param file_rec [FileRecord] record representing the file to perform the service on.
62
+ # @param event [String] name of the event this service is being invoked by.
63
+ # @raise [ChecksumMismatchError] if the checksum on record does not match the generated checksum
64
+ def perform(file_rec, event)
65
+ path = file_rec.path
66
+ phys_path = file_rec.physical_path
67
+ md_rec = file_rec.metadata_record
68
+
69
+ # Get the list of existing checksums for the file and normalize algorithm names
70
+ file_digests = Hash.new
71
+ md_rec.checksums&.each do |alg, digest|
72
+ normalized_alg = alg.downcase.delete('-')
73
+ if @digest_algs.include?(normalized_alg)
74
+ file_digests[normalized_alg] = digest
75
+ else
76
+ logger.debug("Metadata for file #{path} contains unexpected '#{alg}' digest, it will be ignored.")
77
+ end
78
+ end
79
+
80
+ @digest_algs.each do |alg|
81
+ existing_digest = file_digests[alg]
82
+
83
+ if existing_digest.nil?
84
+ if @absent_digest_behavior == FAIL_IF_ABSENT
85
+ raise ChecksumMismatchError.new("Fixity check using algorithm '#{alg}' failed for file #{path}: no existing digest of type '#{alg}' on record.")
86
+ elsif @absent_digest_behavior == IGNORE_IF_ABSENT
87
+ logger.debug("Skipping check of algorithm '#{alg}' for file #{path}: no digest on record.")
88
+ next
89
+ end
90
+ end
91
+
92
+ digest = DigestHelper::start_digest(alg)
93
+ digest.file(phys_path)
94
+ generated_digest = digest.hexdigest
95
+
96
+ # Store the missing checksum if using the 'generate' behavior
97
+ if existing_digest.nil? && @absent_digest_behavior == GENERATE_IF_ABSENT
98
+ md_rec.checksums[alg] = generated_digest
99
+ logger.info("Generated and stored digest using algorithm '#{alg}' for file #{phys_path}")
100
+ else
101
+ # Compare the new digest to the one on record
102
+ if existing_digest == generated_digest
103
+ logger.info("Fixity check using algorithm '#{alg}' succeeded for file #{phys_path}")
104
+ else
105
+ raise ChecksumMismatchError.new("Fixity check using algorithm '#{alg}' failed for file #{phys_path}: expected '#{existing_digest}', calculated '#{generated_digest}.'")
106
+ end
107
+ end
108
+ end
109
+ end
110
+
111
+ # Determine if this service is applicable for the provided event, given the configured service definition
112
+ #
113
+ # @param event [String] name of the event
114
+ # @return [Boolean] returns true if this service is applicable for the provided event
115
+ def is_applicable?(event)
116
+ case event
117
+ when EventNames::PRESERVE
118
+ true
119
+ else
120
+ false
121
+ end
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,198 @@
1
+ require 'longleaf/events/event_names'
2
+ require 'longleaf/logging'
3
+ require 'longleaf/errors'
4
+ require 'longleaf/models/file_record'
5
+ require 'longleaf/models/service_fields'
6
+ require 'longleaf/events/register_event'
7
+ require 'longleaf/candidates/single_digest_provider'
8
+ require 'open3'
9
+
10
+ module Longleaf
11
+ # Preservation service which performs replication of a file to one or more destinations using rsync.
12
+ #
13
+ # The service definition must contain one or more destinations, specified with the "to" property.
14
+ # These destinations must be either a known storage location name, a remote path, or absolute path.
15
+ #
16
+ # Optional service configuration properties:
17
+ # * replica_collision_policy = specifies the desired outcome if the service attempts to replicate
18
+ # a file which already exists at a destination. Default: "replace".
19
+ # * rsync_command = the command to invoke in order to execute rsync. Default: "rsync"
20
+ # * rsync_options = additional parameters that will be passed along to rsync. Cannot include options
21
+ # which change the target of the command or prevent its execution, such as "files-from", "dry-run",
22
+ # "help", etc. Command includes "-R" unless the logical and physical paths differ, in which case "--no-relative" is used. Default: "-a".
23
+ class RsyncReplicationService
24
+ include Longleaf::Logging
25
+ SF ||= Longleaf::ServiceFields
26
+
27
+ RSYNC_COMMAND_PROPERTY = "rsync_command"
28
+ DEFAULT_COMMAND = "rsync"
29
+
30
+ RSYNC_OPTIONS_PROPERTY = "rsync_options"
31
+ DEFAULT_OPTIONS = "-a"
32
+ DISALLOWED_OPTIONS = ["files-from", "n", "dry-run", "exclude", "exclude-from", "cvs-exclude",
33
+ "h", "help", "f", "F", "filter"]
34
+
35
+ attr_reader :command, :options, :collision_policy
36
+
37
+ # Initialize a RsyncReplicationService from the given service definition
38
+ #
39
+ # @param service_def [ServiceDefinition] the configuration for this service
40
+ # @param app_manager [ApplicationConfigManager] the application configuration
41
+ def initialize(service_def, app_manager)
42
+ @service_def = service_def
43
+ @app_manager = app_manager
44
+
45
+ @command = @service_def.properties[RSYNC_COMMAND_PROPERTY] || DEFAULT_COMMAND
46
+
47
+ # Validate rsync parameters
48
+ @options = @service_def.properties[RSYNC_OPTIONS_PROPERTY] || DEFAULT_OPTIONS
49
+ if contains_disallowed_option?(@options)
50
+ raise ArgumentError.new("Service #{service_def.name} specifies a disallowed rsync paramter," \
51
+ + " rsync_options may not include the following: #{DISALLOWED_OPTIONS.join(' ')}")
52
+ end
53
+
54
+ # Set and validate the replica collision policy
55
+ @collision_policy = @service_def.properties[SF::COLLISION_PROPERTY] || SF::DEFAULT_COLLISION_POLICY
56
+ if !SF::VALID_COLLISION_POLICIES.include?(@collision_policy)
57
+ raise ArgumentError.new("Service #{service_def.name} received invalid #{SF::COLLISION_PROPERTY}" \
58
+ + " value #{@collision_policy}")
59
+ end
60
+
61
+ # Store and validate destinations
62
+ replicate_to = @service_def.properties[SF::REPLICATE_TO]
63
+ if replicate_to.nil? || replicate_to.empty?
64
+ raise ArgumentError.new("Service #{service_def.name} must provide one or more replication destinations.")
65
+ end
66
+ replicate_to = [replicate_to] if replicate_to.is_a?(String)
67
+
68
+ loc_manager = app_manager.location_manager
69
+ # Build list of destinations, translating to storage locations when relevant
70
+ @destinations = Array.new
71
+ replicate_to.each do |dest|
72
+ # Assume that if destination contains a : or / it is a path rather than storage location
73
+ if dest =~ /[:\/]/
74
+ @destinations << dest
75
+ else
76
+ if loc_manager.locations.key?(dest)
77
+ @destinations << loc_manager.locations[dest]
78
+ else
79
+ raise ArgumentError.new("Service #{service_def.name} specifies unknown storage location '#{dest}'" \
80
+ + " as a replication destination")
81
+ end
82
+ end
83
+ end
84
+ end
85
+
86
+ # During a replication event, perform replication of the specified file to all configured destinations
87
+ # as necessary.
88
+ #
89
+ # @param file_rec [FileRecord] record representing the file to perform the service on.
90
+ # @param event [String] name of the event this service is being invoked by.
91
+ # @raise [PreservationServiceError] if the rsync replication fails
92
+ def perform(file_rec, event)
93
+ @destinations.each do |destination|
94
+ dest_is_storage_loc = destination.is_a?(Longleaf::StorageLocation)
95
+
96
+ if dest_is_storage_loc
97
+ dest_path = destination.path
98
+ else
99
+ dest_path = destination
100
+ end
101
+
102
+ logical_physical_same = file_rec.path == file_rec.physical_path
103
+ # Determine the path to the file being replicated relative to its storage location
104
+ rel_path = file_rec.storage_location.relativize(file_rec.path)
105
+
106
+ options = @options
107
+ if logical_physical_same
108
+ options = options + " -R"
109
+ # source path with . so that rsync will only create destination directories starting from that point
110
+ source_path = File.join(file_rec.storage_location.path, "./#{rel_path}")
111
+ else
112
+ options = options + " --no-relative"
113
+ source_path = file_rec.physical_path
114
+ dest_path = File.join(dest_path, rel_path)
115
+ if (dest_is_storage_loc && destination.is_a?(Longleaf::FilesystemStorageLocation)) || !dest_is_storage_loc
116
+ # Fill in missing parent directories, as rsync cannot do so when specifying a different source and dest filename
117
+ dirname = File.dirname(dest_path)
118
+ logger.debug("Creating parent dirs #{dirname} for #{file_rec.path}")
119
+ FileUtils.mkdir_p(dirname)
120
+ else
121
+ raise PreservationServiceError.new(
122
+ "Destination #{destination.name} does not currently support separate physical and logical paths")
123
+ end
124
+ end
125
+
126
+ # Check that the destination is available before attempting to write
127
+ verify_destination_available(destination, file_rec)
128
+
129
+ logger.debug("Invoking rsync with command: #{@command} \"#{source_path}\" \"#{dest_path}\" #{options}")
130
+ stdout, stderr, status = Open3.capture3("#{@command} \"#{source_path}\" \"#{dest_path}\" #{options}")
131
+ raise PreservationServiceError.new("Failed to replicate #{file_rec.path} to #{dest_path}: #{stderr}") \
132
+ unless status.success?
133
+
134
+ logger.info("Replicated #{file_rec.path} to destination #{dest_path}")
135
+
136
+ # For destinations which are storage locations, register the replica with longleaf
137
+ if dest_is_storage_loc
138
+ register_replica(destination, rel_path, file_rec)
139
+ end
140
+ end
141
+ end
142
+
143
+ # Determine if this service is applicable for the provided event, given the configured service definition
144
+ #
145
+ # @param event [String] name of the event
146
+ # @return [Boolean] returns true if this service is applicable for the provided event
147
+ def is_applicable?(event)
148
+ case event
149
+ when EventNames::PRESERVE
150
+ true
151
+ else
152
+ false
153
+ end
154
+ end
155
+
156
+ private
157
+ def contains_disallowed_option?(options)
158
+ DISALLOWED_OPTIONS.each do |disallowed|
159
+ if disallowed.length == 1
160
+ if options =~ /(\A| )-[a-zA-Z0-9]*#{disallowed}[a-zA-Z0-9]*( |=|\z)/
161
+ return true
162
+ end
163
+ else
164
+ if options =~ /(\A| )--#{disallowed}( |=|\z)/
165
+ return true
166
+ end
167
+ end
168
+ end
169
+
170
+ false
171
+ end
172
+
173
+ def verify_destination_available(destination, file_rec)
174
+ if destination.is_a?(Longleaf::StorageLocation)
175
+ begin
176
+ destination.available?
177
+ rescue StorageLocationUnavailableError => e
178
+ raise StorageLocationUnavailableError.new("Cannot replicate #{file_rec.path} to destination #{destination.name}: " \
179
+ + e.message)
180
+ end
181
+ elsif destination.start_with?("/")
182
+ raise StorageLocationUnavailableError.new("Cannot replicate #{file_rec.path} to destination" \
183
+ + " #{destination}, path does not exist.") unless Dir.exist?(destination)
184
+ end
185
+ end
186
+
187
+ def register_replica(destination, rel_path, file_rec)
188
+ dest_file_path = File.join(destination.path, rel_path)
189
+ dest_file_rec = FileRecord.new(dest_file_path, destination)
190
+
191
+ register_event = RegisterEvent.new(file_rec: dest_file_rec,
192
+ app_manager: @app_manager,
193
+ force: true,
194
+ digest_provider: SingleDigestProvider.new(file_rec.metadata_record.checksums))
195
+ register_event.perform
196
+ end
197
+ end
198
+ end