longleaf 0.2.0.pre.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +84 -0
  3. data/.gitignore +4 -2
  4. data/.rubocop.yml +42 -2
  5. data/.rubocop_todo.yml +390 -311
  6. data/.yardopts +1 -0
  7. data/Gemfile +16 -1
  8. data/README.md +67 -13
  9. data/Rakefile +6 -0
  10. data/bin/setup +16 -1
  11. data/docs/aboutlongleaf.md +28 -0
  12. data/docs/extra.css +32 -0
  13. data/docs/img/change-file.png +0 -0
  14. data/docs/img/ll-example-preserved.png +0 -0
  15. data/docs/index.md +19 -0
  16. data/docs/install.md +66 -0
  17. data/docs/ll-example/config-example-relative.yml +33 -0
  18. data/docs/ll-example/files-dir/LLexample-PDF.pdf +0 -0
  19. data/docs/ll-example/files-dir/LLexample-TOCHANGE.txt +15 -0
  20. data/docs/ll-example/files-dir/LLexample-tokeep.txt +10 -0
  21. data/docs/ll-example/metadata-dir/.gitkeep +0 -0
  22. data/docs/ll-example/replica-files/.gitkeep +0 -0
  23. data/docs/ll-example/replica-metadata/.gitkeep +0 -0
  24. data/docs/quickstart.md +270 -0
  25. data/docs/rdocs/Longleaf.html +135 -0
  26. data/docs/rdocs/Longleaf/AppFields.html +178 -0
  27. data/docs/rdocs/Longleaf/ApplicationConfigDeserializer.html +631 -0
  28. data/docs/rdocs/Longleaf/ApplicationConfigManager.html +610 -0
  29. data/docs/rdocs/Longleaf/ApplicationConfigValidator.html +238 -0
  30. data/docs/rdocs/Longleaf/CLI.html +909 -0
  31. data/docs/rdocs/Longleaf/ChecksumMismatchError.html +151 -0
  32. data/docs/rdocs/Longleaf/ConfigBuilder.html +1339 -0
  33. data/docs/rdocs/Longleaf/ConfigurationError.html +143 -0
  34. data/docs/rdocs/Longleaf/ConfigurationValidator.html +227 -0
  35. data/docs/rdocs/Longleaf/DeregisterCommand.html +420 -0
  36. data/docs/rdocs/Longleaf/DeregisterEvent.html +453 -0
  37. data/docs/rdocs/Longleaf/DeregistrationError.html +151 -0
  38. data/docs/rdocs/Longleaf/DigestHelper.html +419 -0
  39. data/docs/rdocs/Longleaf/EventError.html +147 -0
  40. data/docs/rdocs/Longleaf/EventNames.html +163 -0
  41. data/docs/rdocs/Longleaf/EventStatusTracking.html +656 -0
  42. data/docs/rdocs/Longleaf/FileCheckService.html +540 -0
  43. data/docs/rdocs/Longleaf/FileHelpers.html +520 -0
  44. data/docs/rdocs/Longleaf/FileRecord.html +716 -0
  45. data/docs/rdocs/Longleaf/FileSelector.html +901 -0
  46. data/docs/rdocs/Longleaf/FixityCheckService.html +691 -0
  47. data/docs/rdocs/Longleaf/IndexManager.html +1155 -0
  48. data/docs/rdocs/Longleaf/InvalidDigestAlgorithmError.html +143 -0
  49. data/docs/rdocs/Longleaf/InvalidStoragePathError.html +143 -0
  50. data/docs/rdocs/Longleaf/Logging.html +405 -0
  51. data/docs/rdocs/Longleaf/Logging/RedirectingLogger.html +1213 -0
  52. data/docs/rdocs/Longleaf/LongleafError.html +139 -0
  53. data/docs/rdocs/Longleaf/MDFields.html +193 -0
  54. data/docs/rdocs/Longleaf/MetadataBuilder.html +787 -0
  55. data/docs/rdocs/Longleaf/MetadataDeserializer.html +537 -0
  56. data/docs/rdocs/Longleaf/MetadataError.html +143 -0
  57. data/docs/rdocs/Longleaf/MetadataPersistenceManager.html +539 -0
  58. data/docs/rdocs/Longleaf/MetadataRecord.html +1411 -0
  59. data/docs/rdocs/Longleaf/MetadataSerializer.html +786 -0
  60. data/docs/rdocs/Longleaf/PreservationServiceError.html +147 -0
  61. data/docs/rdocs/Longleaf/PreserveCommand.html +410 -0
  62. data/docs/rdocs/Longleaf/PreserveEvent.html +491 -0
  63. data/docs/rdocs/Longleaf/RegisterCommand.html +428 -0
  64. data/docs/rdocs/Longleaf/RegisterEvent.html +628 -0
  65. data/docs/rdocs/Longleaf/RegisteredFileSelector.html +446 -0
  66. data/docs/rdocs/Longleaf/RegistrationError.html +151 -0
  67. data/docs/rdocs/Longleaf/ReindexCommand.html +576 -0
  68. data/docs/rdocs/Longleaf/RsyncReplicationService.html +1180 -0
  69. data/docs/rdocs/Longleaf/SequelIndexDriver.html +1978 -0
  70. data/docs/rdocs/Longleaf/ServiceCandidateFilesystemIterator.html +572 -0
  71. data/docs/rdocs/Longleaf/ServiceCandidateIndexIterator.html +532 -0
  72. data/docs/rdocs/Longleaf/ServiceCandidateLocator.html +333 -0
  73. data/docs/rdocs/Longleaf/ServiceClassCache.html +725 -0
  74. data/docs/rdocs/Longleaf/ServiceDateHelper.html +425 -0
  75. data/docs/rdocs/Longleaf/ServiceDefinition.html +683 -0
  76. data/docs/rdocs/Longleaf/ServiceDefinitionManager.html +371 -0
  77. data/docs/rdocs/Longleaf/ServiceDefinitionValidator.html +269 -0
  78. data/docs/rdocs/Longleaf/ServiceFields.html +173 -0
  79. data/docs/rdocs/Longleaf/ServiceManager.html +1229 -0
  80. data/docs/rdocs/Longleaf/ServiceMappingManager.html +410 -0
  81. data/docs/rdocs/Longleaf/ServiceMappingValidator.html +347 -0
  82. data/docs/rdocs/Longleaf/ServiceRecord.html +821 -0
  83. data/docs/rdocs/Longleaf/StorageLocation.html +985 -0
  84. data/docs/rdocs/Longleaf/StorageLocationManager.html +729 -0
  85. data/docs/rdocs/Longleaf/StorageLocationUnavailableError.html +143 -0
  86. data/docs/rdocs/Longleaf/StorageLocationValidator.html +373 -0
  87. data/docs/rdocs/Longleaf/StoragePathValidator.html +253 -0
  88. data/docs/rdocs/Longleaf/SystemConfigBuilder.html +441 -0
  89. data/docs/rdocs/Longleaf/SystemConfigFields.html +163 -0
  90. data/docs/rdocs/Longleaf/ValidateConfigCommand.html +451 -0
  91. data/docs/rdocs/Longleaf/ValidateMetadataCommand.html +408 -0
  92. data/docs/rdocs/_index.html +660 -0
  93. data/docs/rdocs/class_list.html +51 -0
  94. data/docs/rdocs/css/common.css +1 -0
  95. data/docs/rdocs/css/full_list.css +58 -0
  96. data/docs/rdocs/css/style.css +496 -0
  97. data/docs/rdocs/file.README.html +165 -0
  98. data/docs/rdocs/file_list.html +56 -0
  99. data/docs/rdocs/frames.html +17 -0
  100. data/docs/rdocs/index.html +165 -0
  101. data/docs/rdocs/js/app.js +303 -0
  102. data/docs/rdocs/js/full_list.js +216 -0
  103. data/docs/rdocs/js/jquery.js +4 -0
  104. data/docs/rdocs/method_list.html +2051 -0
  105. data/docs/rdocs/top-level-namespace.html +110 -0
  106. data/lib/longleaf/candidates/file_selector.rb +47 -15
  107. data/lib/longleaf/candidates/registered_file_selector.rb +67 -0
  108. data/lib/longleaf/candidates/service_candidate_filesystem_iterator.rb +29 -35
  109. data/lib/longleaf/candidates/service_candidate_index_iterator.rb +84 -0
  110. data/lib/longleaf/candidates/service_candidate_locator.rb +9 -4
  111. data/lib/longleaf/cli.rb +162 -80
  112. data/lib/longleaf/commands/deregister_command.rb +12 -11
  113. data/lib/longleaf/commands/preserve_command.rb +13 -8
  114. data/lib/longleaf/commands/register_command.rb +9 -6
  115. data/lib/longleaf/commands/reindex_command.rb +92 -0
  116. data/lib/longleaf/commands/validate_config_command.rb +27 -6
  117. data/lib/longleaf/commands/validate_metadata_command.rb +11 -9
  118. data/lib/longleaf/errors.rb +12 -12
  119. data/lib/longleaf/events/deregister_event.rb +13 -15
  120. data/lib/longleaf/events/event_status_tracking.rb +7 -7
  121. data/lib/longleaf/events/preserve_event.rb +24 -14
  122. data/lib/longleaf/events/register_event.rb +21 -35
  123. data/lib/longleaf/helpers/digest_helper.rb +4 -4
  124. data/lib/longleaf/helpers/service_date_helper.rb +5 -6
  125. data/lib/longleaf/indexing/index_manager.rb +101 -0
  126. data/lib/longleaf/indexing/sequel_index_driver.rb +324 -0
  127. data/lib/longleaf/logging.rb +4 -4
  128. data/lib/longleaf/logging/redirecting_logger.rb +20 -20
  129. data/lib/longleaf/models/app_fields.rb +2 -1
  130. data/lib/longleaf/models/file_record.rb +10 -6
  131. data/lib/longleaf/models/md_fields.rb +1 -1
  132. data/lib/longleaf/models/metadata_record.rb +22 -12
  133. data/lib/longleaf/models/service_definition.rb +3 -3
  134. data/lib/longleaf/models/service_fields.rb +1 -1
  135. data/lib/longleaf/models/service_record.rb +6 -5
  136. data/lib/longleaf/models/storage_location.rb +26 -7
  137. data/lib/longleaf/models/system_config_fields.rb +9 -0
  138. data/lib/longleaf/preservation_services/file_check_service.rb +58 -0
  139. data/lib/longleaf/preservation_services/fixity_check_service.rb +16 -14
  140. data/lib/longleaf/preservation_services/rsync_replication_service.rb +32 -31
  141. data/lib/longleaf/services/application_config_deserializer.rb +55 -18
  142. data/lib/longleaf/services/application_config_manager.rb +16 -4
  143. data/lib/longleaf/services/application_config_validator.rb +1 -2
  144. data/lib/longleaf/services/configuration_validator.rb +6 -4
  145. data/lib/longleaf/services/metadata_deserializer.rb +40 -38
  146. data/lib/longleaf/services/metadata_persistence_manager.rb +46 -0
  147. data/lib/longleaf/services/metadata_serializer.rb +23 -22
  148. data/lib/longleaf/services/service_class_cache.rb +15 -15
  149. data/lib/longleaf/services/service_definition_manager.rb +5 -6
  150. data/lib/longleaf/services/service_definition_validator.rb +5 -6
  151. data/lib/longleaf/services/service_manager.rb +37 -17
  152. data/lib/longleaf/services/service_mapping_manager.rb +9 -9
  153. data/lib/longleaf/services/service_mapping_validator.rb +9 -10
  154. data/lib/longleaf/services/storage_location_manager.rb +22 -8
  155. data/lib/longleaf/services/storage_location_validator.rb +11 -8
  156. data/lib/longleaf/services/storage_path_validator.rb +1 -1
  157. data/lib/longleaf/specs/config_builder.rb +30 -17
  158. data/lib/longleaf/specs/custom_matchers.rb +1 -1
  159. data/lib/longleaf/specs/file_helpers.rb +15 -14
  160. data/lib/longleaf/specs/metadata_builder.rb +91 -0
  161. data/lib/longleaf/specs/system_config_builder.rb +27 -0
  162. data/lib/longleaf/version.rb +1 -1
  163. data/longleaf.gemspec +17 -7
  164. data/mkdocs.yml +20 -0
  165. metadata +233 -22
@@ -10,7 +10,7 @@ module Longleaf
10
10
  # Event to register a file with longleaf
11
11
  class RegisterEvent
12
12
  include Longleaf::EventStatusTracking
13
-
13
+
14
14
  # @param file_rec [FileRecord] file record
15
15
  # @param app_manager [ApplicationConfigManager] the application configuration
16
16
  # @param force [boolean] if true, then already registered files will be re-registered
@@ -21,75 +21,61 @@ module Longleaf
21
21
  raise ArgumentError.new('Must provide an ApplicationConfigManager') if app_manager.nil?
22
22
  raise ArgumentError.new('Parameter app_manager must be an ApplicationConfigManager') \
23
23
  unless app_manager.is_a?(ApplicationConfigManager)
24
-
24
+
25
25
  @app_manager = app_manager
26
26
  @file_rec = file_rec
27
27
  @force = force
28
28
  @checksums = checksums
29
29
  end
30
-
30
+
31
31
  # Perform a registration event on the given file
32
- # @raise RegistrationError if a file cannot be registered
32
+ # @raise RegistrationError if a file cannot be registered
33
33
  def perform
34
34
  begin
35
35
  # Only need to re-register file if the force flag is provided
36
36
  if @file_rec.metadata_present? && !@force
37
37
  raise RegistrationError.new("Unable to register '#{@file_rec.path}', it is already registered.")
38
38
  end
39
-
39
+
40
40
  # create metadata record
41
- md_rec = MetadataRecord.new(registered: Time.now.utc.iso8601)
41
+ md_rec = MetadataRecord.new(registered: Time.now.utc.iso8601(3))
42
42
  @file_rec.metadata_record = md_rec
43
-
43
+
44
44
  # retain significant details from former record
45
45
  if @file_rec.metadata_present?
46
46
  retain_existing_properties
47
47
  end
48
-
48
+
49
49
  populate_file_properties
50
-
50
+
51
51
  md_rec.checksums.merge!(@checksums) unless @checksums.nil?
52
-
53
- populate_services
54
-
55
- # persist the metadata out to file
56
- MetadataSerializer::write(metadata: md_rec,
57
- file_path: @file_rec.metadata_path,
58
- digest_algs: @file_rec.storage_location.metadata_digests)
59
-
52
+
53
+ # persist the metadata
54
+ @app_manager.md_manager.persist(@file_rec)
55
+
60
56
  record_success(EventNames::REGISTER, @file_rec.path)
61
57
  rescue RegistrationError => err
62
58
  record_failure(EventNames::REGISTER, @file_rec.path, err.message)
63
59
  rescue InvalidStoragePathError => err
64
60
  record_failure(EventNames::REGISTER, @file_rec.path, err.message)
65
61
  end
66
-
62
+
67
63
  return_status
68
64
  end
69
-
65
+
70
66
  private
71
67
  def populate_file_properties
72
68
  md_rec = @file_rec.metadata_record
73
-
69
+
74
70
  # Set file properties
75
- md_rec.last_modified = File.mtime(@file_rec.path).utc.iso8601
71
+ md_rec.last_modified = File.mtime(@file_rec.path).utc.iso8601(3)
76
72
  md_rec.file_size = File.size(@file_rec.path)
77
73
  end
78
-
79
- def populate_services
80
- md_rec = @file_rec.metadata_record
81
-
82
- service_manager = @app_manager.service_manager
83
- service_names = service_manager.list_services(location: @file_rec.storage_location.name)
84
-
85
- # Add service section
86
- service_names.each { |serv_name| md_rec.add_service(serv_name) }
87
- end
88
-
74
+
89
75
  # Copy a subset of properties from an existing metadata record to the new record
90
76
  def retain_existing_properties
91
77
  md_rec = @file_rec.metadata_record
92
-
78
+
93
79
  old_md = MetadataDeserializer.deserialize(file_path: @file_rec.metadata_path,
94
80
  digest_algs: @file_rec.storage_location.metadata_digests)
95
81
  # Copy custom properties
@@ -97,7 +83,7 @@ module Longleaf
97
83
  # Copy stale-replicas flag per service
98
84
  old_md.list_services.each do |serv_name|
99
85
  serv_rec = old_md.service(serv_name)
100
-
86
+
101
87
  stale_replicas = serv_rec.stale_replicas
102
88
  if stale_replicas
103
89
  new_service = md_rec.service(serv_name)
@@ -106,4 +92,4 @@ module Longleaf
106
92
  end
107
93
  end
108
94
  end
109
- end
95
+ end
@@ -5,7 +5,7 @@ module Longleaf
5
5
  # Helper methods for generating digests
6
6
  class DigestHelper
7
7
  KNOWN_DIGESTS ||= ['md5', 'sha1', 'sha2', 'sha256', 'sha384', 'sha512', 'rmd160']
8
-
8
+
9
9
  # @param algs Either a string containing one or an array containing zero or more digest
10
10
  # algorithm names.
11
11
  # @raise [InvalidDigestAlgorithmError] thrown if any of the digest algorithms listed are not
@@ -19,11 +19,11 @@ module Longleaf
19
19
  else
20
20
  unknown = algs.select { |alg| !KNOWN_DIGESTS.include?(alg) }
21
21
  unless unknown.empty?
22
- raise InvalidDigestAlgorithmError.new("Unknown digest algorithm(s): #{unknown.to_s}")
22
+ raise InvalidDigestAlgorithmError.new("Unknown digest algorithm(s): #{unknown}")
23
23
  end
24
24
  end
25
25
  end
26
-
26
+
27
27
  # Get a Digest class for the specified algorithm
28
28
  # @param alg [String] name of the digest algorithm
29
29
  # @return [Digest] A digest class for the requested algorithm
@@ -47,4 +47,4 @@ module Longleaf
47
47
  end
48
48
  end
49
49
  end
50
- end
50
+ end
@@ -3,7 +3,6 @@ require 'time'
3
3
  module Longleaf
4
4
  # Helper methods for interacting with dates/timestamps on services
5
5
  class ServiceDateHelper
6
-
7
6
  # Adds the amount of time from modifier to the provided timestamp
8
7
  # @param timestamp [String] ISO-8601 timestamp string
9
8
  # @param modifier [String] amount of time to add to the timestamp. It must follow the syntax
@@ -18,7 +17,7 @@ module Longleaf
18
17
  else
19
18
  raise ArgumentError.new("Cannot parse time modifier #{modifier}")
20
19
  end
21
-
20
+
22
21
  datetime = Time.iso8601(timestamp)
23
22
  case unit
24
23
  when 'second'
@@ -36,16 +35,16 @@ module Longleaf
36
35
  when 'year'
37
36
  unit_modifier = 365 * 24 * 3600
38
37
  end
39
-
38
+
40
39
  modified_time = datetime + (value * unit_modifier)
41
40
  modified_time.iso8601
42
41
  end
43
-
42
+
44
43
  # Get a timestamp in the format expected for service timestamps.
45
44
  # @param timestamp [Time] the time to format. Defaults to now.
46
45
  # @return [String] the time formatted as iso8601
47
46
  def self.formatted_timestamp(timestamp = Time.now)
48
- timestamp.iso8601.to_s
47
+ timestamp.utc.iso8601(3).to_s
49
48
  end
50
49
  end
51
- end
50
+ end
@@ -0,0 +1,101 @@
1
+ require 'longleaf/models/system_config_fields'
2
+ require 'longleaf/services/metadata_persistence_manager'
3
+ require 'longleaf/errors'
4
+
5
+ module Longleaf
6
+ # Manager configures and provides access to a metadata index if one is specified
7
+ class IndexManager
8
+ SYS_FIELDS ||= Longleaf::SystemConfigFields
9
+
10
+ # @param config [Hash] The system configuration as a hash
11
+ # @param app_config_manager [ApplicationConfigManager] the application config
12
+ def initialize(config, app_config_manager)
13
+ @config = config
14
+ @app_config_manager = app_config_manager
15
+ init_index_driver if @config&.key?(SYS_FIELDS::MD_INDEX)
16
+ end
17
+
18
+ # @return true if the system is configured to use a metadata index
19
+ def using_index?
20
+ !@index_driver.nil?
21
+ end
22
+
23
+ # Index the provided file_rec and its metadata
24
+ #
25
+ # @param file_rec [FileRecord] file record to index
26
+ def index(file_rec)
27
+ @index_driver.index(file_rec)
28
+ end
29
+
30
+ # Remove an entry from the index
31
+ # @param remove_me The record to remove from the index
32
+ def remove(remove_me)
33
+ @index_driver.remove(remove_me)
34
+ end
35
+
36
+ def clear_index(older_than = nil)
37
+ @index_driver.clear_index(older_than)
38
+ end
39
+
40
+ # @return true if the index should be reindexed
41
+ def index_stale?
42
+ @index_driver.is_stale?
43
+ end
44
+
45
+ # Setup initial structure of index implementation
46
+ def setup_index
47
+ @index_driver.setup_index
48
+ end
49
+
50
+ def update_index_state
51
+ @index_driver.update_index_state
52
+ end
53
+
54
+ # Retrieves a set of file paths which have one or more services which need to run.
55
+ #
56
+ # @param file_selector [FileSelector] selector for paths to search for files
57
+ # @param stale_datetime [DateTime] find file_paths with services needing to be run before this value
58
+ # @return [Array] array of file paths that need one or more services run, in ascending order by
59
+ # timestamp.
60
+ def paths_with_stale_services(file_selector, stale_datetime)
61
+ @index_driver.paths_with_stale_services(file_selector, stale_datetime)
62
+ end
63
+
64
+ # Retrieves a page of paths for registered files.
65
+ # @param file_selector [FileSelector] selector for what paths to search for files
66
+ # @return [Array] array of file paths that are registered
67
+ def registered_paths(file_selector)
68
+ @index_driver.registered_paths(file_selector)
69
+ end
70
+
71
+ def each_registered_path(file_selector, older_than: nil, &block)
72
+ @index_driver.each_registered_path(file_selector, older_than: older_than, &block)
73
+ end
74
+
75
+ private
76
+ def init_index_driver
77
+ index_conf = @config[SYS_FIELDS::MD_INDEX]
78
+ adapter = index_conf[SYS_FIELDS::MD_INDEX_ADAPTER]&.downcase
79
+
80
+ raise ConfigurationError.new('Must specify an adapter for the metadata index') if adapter.nil?
81
+
82
+ adapter = adapter.to_sym
83
+
84
+ case adapter
85
+ when :postgres, :mysql, :mysql2, :sqlite, :amalgalite
86
+ page_size = index_conf[SYS_FIELDS::MD_INDEX_PAGE_SIZE]&.to_int
87
+
88
+ connection = index_conf[SYS_FIELDS::MD_INDEX_CONNECTION]
89
+ raise ConfigurationError.new("Must specify connection details for index adapter of type '#{adapter}'") if connection.nil?
90
+
91
+ require 'longleaf/indexing/sequel_index_driver'
92
+ @index_driver = SequelIndexDriver.new(@app_config_manager,
93
+ adapter,
94
+ connection,
95
+ page_size: page_size)
96
+ else
97
+ raise ConfigurationError.new("Unknown index adapter '#{adapter}' specified.") if adapter.nil?
98
+ end
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,324 @@
1
+ require 'sequel'
2
+ require 'digest/md5'
3
+ require 'longleaf/events/event_names'
4
+ require 'longleaf/candidates/file_selector'
5
+ require 'longleaf/version'
6
+ require 'longleaf/models/system_config_fields'
7
+ require 'longleaf/logging'
8
+
9
+ module Longleaf
10
+ # Driver for interacting with an RDBMS-based metadata index using the Sequel ORM gem.
11
+ # Users must create the database and credentials for connecting to it in advance,
12
+ # if using a database application that requires creation of databases (ie, not sqlite).
13
+ # The default database name is 'longleaf_metadata_index' but may be overridden.
14
+ #
15
+ # See the Sequel documentation for details about accepted connection parameters:
16
+ # https://github.com/jeremyevans/sequel/blob/master/doc/opening_databases.rdoc
17
+ class SequelIndexDriver
18
+ include Longleaf::Logging
19
+ INDEX_DB_NAME ||= 'longleaf_metadata_index'
20
+ PRESERVE_TBL ||= "preserve_service_times".to_sym
21
+ INDEX_STATE_TBL ||= "index_state".to_sym
22
+ DEFAULT_PAGE_SIZE ||= 1000
23
+ TIMESTAMP_FORMAT ||= '%Y-%m-%d %H:%M:%S.%3N'
24
+
25
+ # Initialize the index driver
26
+ #
27
+ # @param app_config [ApplicationConfigManager] the application configuration manager
28
+ # @param adapter [String] name of the database adapter to use.
29
+ # @param conn_details Details about the configuration and connection to the database used for the index.
30
+ # If a string is provided, it will be used as the connection URL and must identify the adapter.
31
+ # If a hash is provided, it is used as the parameters for the database connection.
32
+ # @param page_size [Integer] number of results to retrieve per query when getting candidates
33
+ def initialize(app_config, adapter, conn_details, page_size: nil)
34
+ Sequel.default_timezone = :utc
35
+ @app_config = app_config
36
+ @adapter = adapter
37
+ @conn_details = conn_details
38
+ # Digest of the app config file so we can tell if it changes
39
+ @config_md5 = app_config.config_md5
40
+ @page_size = page_size.nil? || page_size <= 0 ? DEFAULT_PAGE_SIZE : page_size
41
+
42
+ if @conn_details.is_a?(Hash)
43
+ # Add in the adapter name
44
+ @conn_details['adapter'] = adapter unless @conn_details.key?('adapter')
45
+ # Add in default database name if none was specified
46
+ @conn_details['database'] = INDEX_DB_NAME unless @conn_details.key?('database')
47
+ end
48
+ end
49
+
50
+ # Returns true if the application configuration does not match the configuration used for
51
+ # the last reindex.
52
+ def is_stale?
53
+ db_conn[INDEX_STATE_TBL].where(config_md5: @config_md5).count == 0
54
+ end
55
+
56
+ # Index the provided file_rec and its metadata
57
+ #
58
+ # @param file_rec [FileRecord] file record to index
59
+ def index(file_rec)
60
+ file_path = file_rec.path
61
+ md_rec = file_rec.metadata_record
62
+ storage_loc = file_rec.storage_location
63
+ service_manager = @app_config.service_manager
64
+
65
+ # Produce a list of service definitions which should apply to the file
66
+ expected_services = service_manager.list_service_definitions(
67
+ location: storage_loc.name)
68
+
69
+ first_timestamp = first_service_execution_timestamp(expected_services, md_rec)
70
+ delay_until_timestamp = delay_until_timestamp(md_rec)
71
+
72
+ first_timestamp = convert_iso8601_to_timestamp(first_timestamp)
73
+ delay_until_timestamp = convert_iso8601_to_timestamp(delay_until_timestamp)
74
+ now_stamp = Time.now.utc.strftime(TIMESTAMP_FORMAT)
75
+
76
+ if @adapter == :mysql || @adapter == :mysql2
77
+ preserve_tbl.on_duplicate_key_update
78
+ .insert(file_path: file_path,
79
+ storage_location: storage_loc.name,
80
+ service_time: first_timestamp,
81
+ delay_until_time: delay_until_timestamp,
82
+ updated: now_stamp)
83
+ else
84
+ preserve_tbl.insert_conflict(target: :file_path,
85
+ update: {
86
+ storage_location: storage_loc.name,
87
+ service_time: first_timestamp,
88
+ delay_until_time: delay_until_timestamp,
89
+ updated: now_stamp } )
90
+ .insert(file_path: file_path,
91
+ storage_location: storage_loc.name,
92
+ service_time: first_timestamp,
93
+ delay_until_time: delay_until_timestamp,
94
+ updated: now_stamp)
95
+ end
96
+ end
97
+
98
+ # Find the earliest service execution time for any services expected to be run for the specified file.
99
+ #
100
+ # @param expected_services [Array] list of ServiceDefinition objects expected for specified file.
101
+ # @param md_rec [MetadataRecord] metadata record for the file being evaluated
102
+ # @return The timestamp of the earliest service execution time for the file described by md_rec, in iso8601 format.
103
+ # Returns nil if no services are expected, all services have already run and do not have a next occurrence, or
104
+ # the file is deregistered.
105
+ def first_service_execution_timestamp(expected_services, md_rec)
106
+ current_time = Time.now.utc.iso8601(3)
107
+ if md_rec.deregistered?
108
+ return nil
109
+ end
110
+
111
+ service_times = Array.new
112
+
113
+ present_services = md_rec.list_services
114
+
115
+ expected_services.each do |service_def|
116
+ service_name = service_def.name
117
+ # Service has never run, set execution time to now
118
+ if !present_services.include?(service_name)
119
+ service_times << current_time
120
+ next
121
+ end
122
+
123
+ service_rec = md_rec.service(service_name)
124
+
125
+ # Service either needs a run or has no timestamp, so execution time of now
126
+ if service_rec.run_needed || service_rec.timestamp.nil?
127
+ service_times << current_time
128
+ next
129
+ end
130
+
131
+ # Calculate the next time this service should run based on frequency
132
+ frequency = service_def.frequency
133
+ unless frequency.nil?
134
+ service_timestamp = service_rec.timestamp
135
+ service_times << ServiceDateHelper.add_to_timestamp(service_timestamp, frequency)
136
+ next
137
+ end
138
+ end
139
+ # Return the lowest service execution time
140
+ service_times.min
141
+ end
142
+
143
+ # @return The first failure timestamp for any service, or nil if there were none.
144
+ def delay_until_timestamp(md_rec)
145
+ md_rec.list_services.each do |service_name|
146
+ service_rec = md_rec.service(service_name)
147
+ return service_rec.failure_timestamp unless service_rec.failure_timestamp.nil?
148
+ end
149
+ # return lowest possible date
150
+ return minimum_timestamp
151
+ end
152
+
153
+ # Remove an entry from the index
154
+ # @param remove_me The record to remove from the index. May be a FileRecord or a String.
155
+ def remove(remove_me)
156
+ if remove_me.is_a?(FileRecord)
157
+ path = remove_me.path
158
+ else
159
+ path = remove_me
160
+ end
161
+
162
+ result = preserve_tbl.where(file_path: path).delete
163
+ if result == 0
164
+ logger.warn("Could not remove #{path} from the index, path was not present.")
165
+ end
166
+ end
167
+
168
+ # Remove all entries from the index
169
+ # @param older_than [Time] Optional. If provided, only entries that were last indexed
170
+ # before the provided time will be deleted.
171
+ def clear_index(older_than = nil)
172
+ if older_than.nil?
173
+ preserve_tbl.delete
174
+ else
175
+ older_than_timestamp = older_than.utc.strftime(TIMESTAMP_FORMAT)
176
+ preserve_tbl.where { updated < older_than_timestamp }.delete
177
+ end
178
+ end
179
+
180
+ # Initialize the index's database using the provided configuration
181
+ def setup_index
182
+ # Create the table for tracking when files will need preservation services run on them.
183
+ case @adapter
184
+ when :mysql, :mysql2
185
+ # mysql does not support 'text' fields as primary keys
186
+ db_conn.create_table!(PRESERVE_TBL) do
187
+ String :file_path, primary_key: true, size: 768
188
+ column :storage_location, 'varchar(128)'
189
+ column :service_time, 'timestamp(3)', { :null => true }
190
+ column :delay_until_time, 'timestamp(3)'
191
+ column :updated, 'timestamp(3)'
192
+ end
193
+ else
194
+ db_conn.create_table!(PRESERVE_TBL) do
195
+ String :file_path, primary_key: true, text: true
196
+ column :storage_location, 'varchar(128)'
197
+ column :service_time, 'timestamp(3)', { :null => true }
198
+ column :delay_until_time, 'timestamp(3)'
199
+ column :updated, 'timestamp(3)'
200
+ end
201
+ end
202
+
203
+ # Setup database indexes
204
+ case @adapter
205
+ when :postgres
206
+ db_conn.run("CREATE INDEX service_times_file_path_text_index ON preserve_service_times (file_path text_pattern_ops)")
207
+ when :sqlite, :amalgalite
208
+ db_conn.run("CREATE INDEX service_times_file_path_text_index ON preserve_service_times (file_path collate nocase)")
209
+ end
210
+ db_conn.run("CREATE INDEX service_times_storage_location_index ON preserve_service_times (storage_location)")
211
+
212
+ # Create table for tracking the state of the index
213
+ db_conn.create_table!(INDEX_STATE_TBL) do
214
+ String :config_md5
215
+ DateTime :last_reindexed
216
+ String :longleaf_version
217
+ end
218
+
219
+ # Prepopulate the index state information
220
+ update_index_state
221
+ end
222
+
223
+ # Updates the state information for the index to indicate that the index has been refreshed
224
+ # or is in sync with the application's configuration.
225
+ def update_index_state
226
+ index_state_tbl = db_conn[INDEX_STATE_TBL]
227
+ index_state_tbl.delete
228
+ index_state_tbl.insert(
229
+ config_md5: @config_md5,
230
+ last_reindexed: Time.now.utc,
231
+ longleaf_version: Longleaf::VERSION)
232
+ end
233
+
234
+ # Retrieves a page of file paths which have one or more services which need to run.
235
+ # @param file_selector [FileSelector] selector for what paths to search for files
236
+ # @param stale_datetime [DateTime] find file_paths with services needing to be run before this value
237
+ # @return [Array] array of file paths that need one or more services run.
238
+ def paths_with_stale_services(file_selector, stale_datetime)
239
+ if @preserve_dataset.nil?
240
+ @preserve_dataset = db_conn
241
+ .from(PRESERVE_TBL)
242
+ .exclude(service_time: nil)
243
+ .limit(@page_size)
244
+ .order(Sequel.asc(:service_time))
245
+ end
246
+
247
+ # retrieve and return a page of results
248
+ ds = add_path_restrictions(@preserve_dataset, file_selector)
249
+ .where { service_time <= stale_datetime }
250
+ .where { delay_until_time < stale_datetime }
251
+ .select_map(:file_path)
252
+ end
253
+
254
+ # Retrieves a page of paths for registered files.
255
+ # @param file_selector [FileSelector] selector for what paths to search for files
256
+ # @return [Array] array of file paths that are registered
257
+ def registered_paths(file_selector)
258
+ # retrieve and return a page of results
259
+ add_path_restrictions(registered_dataset, file_selector)
260
+ .select_map(:file_path)
261
+ end
262
+
263
+ # Calls the provided block once per registered file path.
264
+ # Must be passed a block.
265
+ # @param file_selector [FileSelector] selector for what paths to search for files
266
+ # @param older_than [Time] Optional. If provided, only files that were last
267
+ # indexed before this timestamp will be returned.
268
+ def each_registered_path(file_selector, older_than: nil, &block)
269
+ dataset = add_path_restrictions(registered_dataset, file_selector)
270
+ .select(:file_path)
271
+ if !older_than.nil?
272
+ older_than_timestamp = older_than.utc.strftime(TIMESTAMP_FORMAT)
273
+ dataset = dataset.where { updated < older_than_timestamp }
274
+ end
275
+ # Yield to the provided block once per row returned
276
+ dataset.paged_each(:rows_per_fetch => @page_size) do |row|
277
+ block.call(row[:file_path])
278
+ end
279
+ end
280
+
281
+ private
282
+ def db_conn
283
+ @connection = Sequel.connect(@conn_details) if @connection.nil?
284
+ @connection
285
+ end
286
+
287
+ def preserve_tbl
288
+ @preserve_tbl = db_conn[PRESERVE_TBL] if @preserve_tbl.nil?
289
+ @preserve_tbl
290
+ end
291
+
292
+ def add_path_restrictions(dataset, file_selector)
293
+ if file_selector.specificity == FileSelector::SPECIFICITY_STORAGE_LOCATION
294
+ dataset.where(storage_location: file_selector.storage_locations)
295
+ else
296
+ # Reformat all selected paths into LIKE partial string matches
297
+ path_conds = file_selector.target_paths.map { |path| path.end_with?('/') ? path + '%' : path }
298
+ dataset.where(Sequel.like(:file_path, *path_conds))
299
+ end
300
+ end
301
+
302
+ def convert_iso8601_to_timestamp(iso8601)
303
+ return nil if iso8601.nil?
304
+ Time.iso8601(iso8601).strftime(TIMESTAMP_FORMAT)
305
+ end
306
+
307
+ def minimum_timestamp
308
+ if @min_timestamp.nil?
309
+ @min_timestamp = ServiceDateHelper.formatted_timestamp(Time.at(0).utc)
310
+ end
311
+ @min_timestamp
312
+ end
313
+
314
+ def registered_dataset
315
+ if @registered_dataset.nil?
316
+ @registered_dataset = db_conn
317
+ .from(PRESERVE_TBL)
318
+ .limit(@page_size)
319
+ .order(Sequel.asc(:service_time))
320
+ end
321
+ @registered_dataset
322
+ end
323
+ end
324
+ end