longleaf 0.2.0.pre.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.circleci/config.yml +84 -0
- data/.gitignore +4 -2
- data/.rubocop.yml +42 -2
- data/.rubocop_todo.yml +390 -311
- data/.yardopts +1 -0
- data/Gemfile +16 -1
- data/README.md +67 -13
- data/Rakefile +6 -0
- data/bin/setup +16 -1
- data/docs/aboutlongleaf.md +28 -0
- data/docs/extra.css +32 -0
- data/docs/img/change-file.png +0 -0
- data/docs/img/ll-example-preserved.png +0 -0
- data/docs/index.md +19 -0
- data/docs/install.md +66 -0
- data/docs/ll-example/config-example-relative.yml +33 -0
- data/docs/ll-example/files-dir/LLexample-PDF.pdf +0 -0
- data/docs/ll-example/files-dir/LLexample-TOCHANGE.txt +15 -0
- data/docs/ll-example/files-dir/LLexample-tokeep.txt +10 -0
- data/docs/ll-example/metadata-dir/.gitkeep +0 -0
- data/docs/ll-example/replica-files/.gitkeep +0 -0
- data/docs/ll-example/replica-metadata/.gitkeep +0 -0
- data/docs/quickstart.md +270 -0
- data/docs/rdocs/Longleaf.html +135 -0
- data/docs/rdocs/Longleaf/AppFields.html +178 -0
- data/docs/rdocs/Longleaf/ApplicationConfigDeserializer.html +631 -0
- data/docs/rdocs/Longleaf/ApplicationConfigManager.html +610 -0
- data/docs/rdocs/Longleaf/ApplicationConfigValidator.html +238 -0
- data/docs/rdocs/Longleaf/CLI.html +909 -0
- data/docs/rdocs/Longleaf/ChecksumMismatchError.html +151 -0
- data/docs/rdocs/Longleaf/ConfigBuilder.html +1339 -0
- data/docs/rdocs/Longleaf/ConfigurationError.html +143 -0
- data/docs/rdocs/Longleaf/ConfigurationValidator.html +227 -0
- data/docs/rdocs/Longleaf/DeregisterCommand.html +420 -0
- data/docs/rdocs/Longleaf/DeregisterEvent.html +453 -0
- data/docs/rdocs/Longleaf/DeregistrationError.html +151 -0
- data/docs/rdocs/Longleaf/DigestHelper.html +419 -0
- data/docs/rdocs/Longleaf/EventError.html +147 -0
- data/docs/rdocs/Longleaf/EventNames.html +163 -0
- data/docs/rdocs/Longleaf/EventStatusTracking.html +656 -0
- data/docs/rdocs/Longleaf/FileCheckService.html +540 -0
- data/docs/rdocs/Longleaf/FileHelpers.html +520 -0
- data/docs/rdocs/Longleaf/FileRecord.html +716 -0
- data/docs/rdocs/Longleaf/FileSelector.html +901 -0
- data/docs/rdocs/Longleaf/FixityCheckService.html +691 -0
- data/docs/rdocs/Longleaf/IndexManager.html +1155 -0
- data/docs/rdocs/Longleaf/InvalidDigestAlgorithmError.html +143 -0
- data/docs/rdocs/Longleaf/InvalidStoragePathError.html +143 -0
- data/docs/rdocs/Longleaf/Logging.html +405 -0
- data/docs/rdocs/Longleaf/Logging/RedirectingLogger.html +1213 -0
- data/docs/rdocs/Longleaf/LongleafError.html +139 -0
- data/docs/rdocs/Longleaf/MDFields.html +193 -0
- data/docs/rdocs/Longleaf/MetadataBuilder.html +787 -0
- data/docs/rdocs/Longleaf/MetadataDeserializer.html +537 -0
- data/docs/rdocs/Longleaf/MetadataError.html +143 -0
- data/docs/rdocs/Longleaf/MetadataPersistenceManager.html +539 -0
- data/docs/rdocs/Longleaf/MetadataRecord.html +1411 -0
- data/docs/rdocs/Longleaf/MetadataSerializer.html +786 -0
- data/docs/rdocs/Longleaf/PreservationServiceError.html +147 -0
- data/docs/rdocs/Longleaf/PreserveCommand.html +410 -0
- data/docs/rdocs/Longleaf/PreserveEvent.html +491 -0
- data/docs/rdocs/Longleaf/RegisterCommand.html +428 -0
- data/docs/rdocs/Longleaf/RegisterEvent.html +628 -0
- data/docs/rdocs/Longleaf/RegisteredFileSelector.html +446 -0
- data/docs/rdocs/Longleaf/RegistrationError.html +151 -0
- data/docs/rdocs/Longleaf/ReindexCommand.html +576 -0
- data/docs/rdocs/Longleaf/RsyncReplicationService.html +1180 -0
- data/docs/rdocs/Longleaf/SequelIndexDriver.html +1978 -0
- data/docs/rdocs/Longleaf/ServiceCandidateFilesystemIterator.html +572 -0
- data/docs/rdocs/Longleaf/ServiceCandidateIndexIterator.html +532 -0
- data/docs/rdocs/Longleaf/ServiceCandidateLocator.html +333 -0
- data/docs/rdocs/Longleaf/ServiceClassCache.html +725 -0
- data/docs/rdocs/Longleaf/ServiceDateHelper.html +425 -0
- data/docs/rdocs/Longleaf/ServiceDefinition.html +683 -0
- data/docs/rdocs/Longleaf/ServiceDefinitionManager.html +371 -0
- data/docs/rdocs/Longleaf/ServiceDefinitionValidator.html +269 -0
- data/docs/rdocs/Longleaf/ServiceFields.html +173 -0
- data/docs/rdocs/Longleaf/ServiceManager.html +1229 -0
- data/docs/rdocs/Longleaf/ServiceMappingManager.html +410 -0
- data/docs/rdocs/Longleaf/ServiceMappingValidator.html +347 -0
- data/docs/rdocs/Longleaf/ServiceRecord.html +821 -0
- data/docs/rdocs/Longleaf/StorageLocation.html +985 -0
- data/docs/rdocs/Longleaf/StorageLocationManager.html +729 -0
- data/docs/rdocs/Longleaf/StorageLocationUnavailableError.html +143 -0
- data/docs/rdocs/Longleaf/StorageLocationValidator.html +373 -0
- data/docs/rdocs/Longleaf/StoragePathValidator.html +253 -0
- data/docs/rdocs/Longleaf/SystemConfigBuilder.html +441 -0
- data/docs/rdocs/Longleaf/SystemConfigFields.html +163 -0
- data/docs/rdocs/Longleaf/ValidateConfigCommand.html +451 -0
- data/docs/rdocs/Longleaf/ValidateMetadataCommand.html +408 -0
- data/docs/rdocs/_index.html +660 -0
- data/docs/rdocs/class_list.html +51 -0
- data/docs/rdocs/css/common.css +1 -0
- data/docs/rdocs/css/full_list.css +58 -0
- data/docs/rdocs/css/style.css +496 -0
- data/docs/rdocs/file.README.html +165 -0
- data/docs/rdocs/file_list.html +56 -0
- data/docs/rdocs/frames.html +17 -0
- data/docs/rdocs/index.html +165 -0
- data/docs/rdocs/js/app.js +303 -0
- data/docs/rdocs/js/full_list.js +216 -0
- data/docs/rdocs/js/jquery.js +4 -0
- data/docs/rdocs/method_list.html +2051 -0
- data/docs/rdocs/top-level-namespace.html +110 -0
- data/lib/longleaf/candidates/file_selector.rb +47 -15
- data/lib/longleaf/candidates/registered_file_selector.rb +67 -0
- data/lib/longleaf/candidates/service_candidate_filesystem_iterator.rb +29 -35
- data/lib/longleaf/candidates/service_candidate_index_iterator.rb +84 -0
- data/lib/longleaf/candidates/service_candidate_locator.rb +9 -4
- data/lib/longleaf/cli.rb +162 -80
- data/lib/longleaf/commands/deregister_command.rb +12 -11
- data/lib/longleaf/commands/preserve_command.rb +13 -8
- data/lib/longleaf/commands/register_command.rb +9 -6
- data/lib/longleaf/commands/reindex_command.rb +92 -0
- data/lib/longleaf/commands/validate_config_command.rb +27 -6
- data/lib/longleaf/commands/validate_metadata_command.rb +11 -9
- data/lib/longleaf/errors.rb +12 -12
- data/lib/longleaf/events/deregister_event.rb +13 -15
- data/lib/longleaf/events/event_status_tracking.rb +7 -7
- data/lib/longleaf/events/preserve_event.rb +24 -14
- data/lib/longleaf/events/register_event.rb +21 -35
- data/lib/longleaf/helpers/digest_helper.rb +4 -4
- data/lib/longleaf/helpers/service_date_helper.rb +5 -6
- data/lib/longleaf/indexing/index_manager.rb +101 -0
- data/lib/longleaf/indexing/sequel_index_driver.rb +324 -0
- data/lib/longleaf/logging.rb +4 -4
- data/lib/longleaf/logging/redirecting_logger.rb +20 -20
- data/lib/longleaf/models/app_fields.rb +2 -1
- data/lib/longleaf/models/file_record.rb +10 -6
- data/lib/longleaf/models/md_fields.rb +1 -1
- data/lib/longleaf/models/metadata_record.rb +22 -12
- data/lib/longleaf/models/service_definition.rb +3 -3
- data/lib/longleaf/models/service_fields.rb +1 -1
- data/lib/longleaf/models/service_record.rb +6 -5
- data/lib/longleaf/models/storage_location.rb +26 -7
- data/lib/longleaf/models/system_config_fields.rb +9 -0
- data/lib/longleaf/preservation_services/file_check_service.rb +58 -0
- data/lib/longleaf/preservation_services/fixity_check_service.rb +16 -14
- data/lib/longleaf/preservation_services/rsync_replication_service.rb +32 -31
- data/lib/longleaf/services/application_config_deserializer.rb +55 -18
- data/lib/longleaf/services/application_config_manager.rb +16 -4
- data/lib/longleaf/services/application_config_validator.rb +1 -2
- data/lib/longleaf/services/configuration_validator.rb +6 -4
- data/lib/longleaf/services/metadata_deserializer.rb +40 -38
- data/lib/longleaf/services/metadata_persistence_manager.rb +46 -0
- data/lib/longleaf/services/metadata_serializer.rb +23 -22
- data/lib/longleaf/services/service_class_cache.rb +15 -15
- data/lib/longleaf/services/service_definition_manager.rb +5 -6
- data/lib/longleaf/services/service_definition_validator.rb +5 -6
- data/lib/longleaf/services/service_manager.rb +37 -17
- data/lib/longleaf/services/service_mapping_manager.rb +9 -9
- data/lib/longleaf/services/service_mapping_validator.rb +9 -10
- data/lib/longleaf/services/storage_location_manager.rb +22 -8
- data/lib/longleaf/services/storage_location_validator.rb +11 -8
- data/lib/longleaf/services/storage_path_validator.rb +1 -1
- data/lib/longleaf/specs/config_builder.rb +30 -17
- data/lib/longleaf/specs/custom_matchers.rb +1 -1
- data/lib/longleaf/specs/file_helpers.rb +15 -14
- data/lib/longleaf/specs/metadata_builder.rb +91 -0
- data/lib/longleaf/specs/system_config_builder.rb +27 -0
- data/lib/longleaf/version.rb +1 -1
- data/longleaf.gemspec +17 -7
- data/mkdocs.yml +20 -0
- metadata +233 -22
@@ -10,7 +10,7 @@ module Longleaf
|
|
10
10
|
# Event to register a file with longleaf
|
11
11
|
class RegisterEvent
|
12
12
|
include Longleaf::EventStatusTracking
|
13
|
-
|
13
|
+
|
14
14
|
# @param file_rec [FileRecord] file record
|
15
15
|
# @param app_manager [ApplicationConfigManager] the application configuration
|
16
16
|
# @param force [boolean] if true, then already registered files will be re-registered
|
@@ -21,75 +21,61 @@ module Longleaf
|
|
21
21
|
raise ArgumentError.new('Must provide an ApplicationConfigManager') if app_manager.nil?
|
22
22
|
raise ArgumentError.new('Parameter app_manager must be an ApplicationConfigManager') \
|
23
23
|
unless app_manager.is_a?(ApplicationConfigManager)
|
24
|
-
|
24
|
+
|
25
25
|
@app_manager = app_manager
|
26
26
|
@file_rec = file_rec
|
27
27
|
@force = force
|
28
28
|
@checksums = checksums
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
# Perform a registration event on the given file
|
32
|
-
# @raise RegistrationError if a file cannot be registered
|
32
|
+
# @raise RegistrationError if a file cannot be registered
|
33
33
|
def perform
|
34
34
|
begin
|
35
35
|
# Only need to re-register file if the force flag is provided
|
36
36
|
if @file_rec.metadata_present? && !@force
|
37
37
|
raise RegistrationError.new("Unable to register '#{@file_rec.path}', it is already registered.")
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
# create metadata record
|
41
|
-
md_rec = MetadataRecord.new(registered: Time.now.utc.iso8601)
|
41
|
+
md_rec = MetadataRecord.new(registered: Time.now.utc.iso8601(3))
|
42
42
|
@file_rec.metadata_record = md_rec
|
43
|
-
|
43
|
+
|
44
44
|
# retain significant details from former record
|
45
45
|
if @file_rec.metadata_present?
|
46
46
|
retain_existing_properties
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
populate_file_properties
|
50
|
-
|
50
|
+
|
51
51
|
md_rec.checksums.merge!(@checksums) unless @checksums.nil?
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
MetadataSerializer::write(metadata: md_rec,
|
57
|
-
file_path: @file_rec.metadata_path,
|
58
|
-
digest_algs: @file_rec.storage_location.metadata_digests)
|
59
|
-
|
52
|
+
|
53
|
+
# persist the metadata
|
54
|
+
@app_manager.md_manager.persist(@file_rec)
|
55
|
+
|
60
56
|
record_success(EventNames::REGISTER, @file_rec.path)
|
61
57
|
rescue RegistrationError => err
|
62
58
|
record_failure(EventNames::REGISTER, @file_rec.path, err.message)
|
63
59
|
rescue InvalidStoragePathError => err
|
64
60
|
record_failure(EventNames::REGISTER, @file_rec.path, err.message)
|
65
61
|
end
|
66
|
-
|
62
|
+
|
67
63
|
return_status
|
68
64
|
end
|
69
|
-
|
65
|
+
|
70
66
|
private
|
71
67
|
def populate_file_properties
|
72
68
|
md_rec = @file_rec.metadata_record
|
73
|
-
|
69
|
+
|
74
70
|
# Set file properties
|
75
|
-
md_rec.last_modified = File.mtime(@file_rec.path).utc.iso8601
|
71
|
+
md_rec.last_modified = File.mtime(@file_rec.path).utc.iso8601(3)
|
76
72
|
md_rec.file_size = File.size(@file_rec.path)
|
77
73
|
end
|
78
|
-
|
79
|
-
def populate_services
|
80
|
-
md_rec = @file_rec.metadata_record
|
81
|
-
|
82
|
-
service_manager = @app_manager.service_manager
|
83
|
-
service_names = service_manager.list_services(location: @file_rec.storage_location.name)
|
84
|
-
|
85
|
-
# Add service section
|
86
|
-
service_names.each { |serv_name| md_rec.add_service(serv_name) }
|
87
|
-
end
|
88
|
-
|
74
|
+
|
89
75
|
# Copy a subset of properties from an existing metadata record to the new record
|
90
76
|
def retain_existing_properties
|
91
77
|
md_rec = @file_rec.metadata_record
|
92
|
-
|
78
|
+
|
93
79
|
old_md = MetadataDeserializer.deserialize(file_path: @file_rec.metadata_path,
|
94
80
|
digest_algs: @file_rec.storage_location.metadata_digests)
|
95
81
|
# Copy custom properties
|
@@ -97,7 +83,7 @@ module Longleaf
|
|
97
83
|
# Copy stale-replicas flag per service
|
98
84
|
old_md.list_services.each do |serv_name|
|
99
85
|
serv_rec = old_md.service(serv_name)
|
100
|
-
|
86
|
+
|
101
87
|
stale_replicas = serv_rec.stale_replicas
|
102
88
|
if stale_replicas
|
103
89
|
new_service = md_rec.service(serv_name)
|
@@ -106,4 +92,4 @@ module Longleaf
|
|
106
92
|
end
|
107
93
|
end
|
108
94
|
end
|
109
|
-
end
|
95
|
+
end
|
@@ -5,7 +5,7 @@ module Longleaf
|
|
5
5
|
# Helper methods for generating digests
|
6
6
|
class DigestHelper
|
7
7
|
KNOWN_DIGESTS ||= ['md5', 'sha1', 'sha2', 'sha256', 'sha384', 'sha512', 'rmd160']
|
8
|
-
|
8
|
+
|
9
9
|
# @param algs Either a string containing one or an array containing zero or more digest
|
10
10
|
# algorithm names.
|
11
11
|
# @raise [InvalidDigestAlgorithmError] thrown if any of the digest algorithms listed are not
|
@@ -19,11 +19,11 @@ module Longleaf
|
|
19
19
|
else
|
20
20
|
unknown = algs.select { |alg| !KNOWN_DIGESTS.include?(alg) }
|
21
21
|
unless unknown.empty?
|
22
|
-
raise InvalidDigestAlgorithmError.new("Unknown digest algorithm(s): #{unknown
|
22
|
+
raise InvalidDigestAlgorithmError.new("Unknown digest algorithm(s): #{unknown}")
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
26
|
-
|
26
|
+
|
27
27
|
# Get a Digest class for the specified algorithm
|
28
28
|
# @param alg [String] name of the digest algorithm
|
29
29
|
# @return [Digest] A digest class for the requested algorithm
|
@@ -47,4 +47,4 @@ module Longleaf
|
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
50
|
-
end
|
50
|
+
end
|
@@ -3,7 +3,6 @@ require 'time'
|
|
3
3
|
module Longleaf
|
4
4
|
# Helper methods for interacting with dates/timestamps on services
|
5
5
|
class ServiceDateHelper
|
6
|
-
|
7
6
|
# Adds the amount of time from modifier to the provided timestamp
|
8
7
|
# @param timestamp [String] ISO-8601 timestamp string
|
9
8
|
# @param modifier [String] amount of time to add to the timestamp. It must follow the syntax
|
@@ -18,7 +17,7 @@ module Longleaf
|
|
18
17
|
else
|
19
18
|
raise ArgumentError.new("Cannot parse time modifier #{modifier}")
|
20
19
|
end
|
21
|
-
|
20
|
+
|
22
21
|
datetime = Time.iso8601(timestamp)
|
23
22
|
case unit
|
24
23
|
when 'second'
|
@@ -36,16 +35,16 @@ module Longleaf
|
|
36
35
|
when 'year'
|
37
36
|
unit_modifier = 365 * 24 * 3600
|
38
37
|
end
|
39
|
-
|
38
|
+
|
40
39
|
modified_time = datetime + (value * unit_modifier)
|
41
40
|
modified_time.iso8601
|
42
41
|
end
|
43
|
-
|
42
|
+
|
44
43
|
# Get a timestamp in the format expected for service timestamps.
|
45
44
|
# @param timestamp [Time] the time to format. Defaults to now.
|
46
45
|
# @return [String] the time formatted as iso8601
|
47
46
|
def self.formatted_timestamp(timestamp = Time.now)
|
48
|
-
timestamp.iso8601.to_s
|
47
|
+
timestamp.utc.iso8601(3).to_s
|
49
48
|
end
|
50
49
|
end
|
51
|
-
end
|
50
|
+
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'longleaf/models/system_config_fields'
|
2
|
+
require 'longleaf/services/metadata_persistence_manager'
|
3
|
+
require 'longleaf/errors'
|
4
|
+
|
5
|
+
module Longleaf
  # Configures and provides access to a metadata index, if one is specified in the
  # system configuration. Every index operation is delegated to an adapter specific
  # driver, which is only created when the configuration contains an index section.
  # When no index is configured the driver is nil, so callers should check
  # {#using_index?} before invoking any of the delegating methods.
  class IndexManager
    SYS_FIELDS ||= Longleaf::SystemConfigFields

    # @param config [Hash] The system configuration as a hash. May be nil.
    # @param app_config_manager [ApplicationConfigManager] the application config
    # @raise [ConfigurationError] if an index section is present but is missing its adapter
    #   or connection details, or names an adapter that is not supported
    def initialize(config, app_config_manager)
      @config = config
      @app_config_manager = app_config_manager
      init_index_driver if @config&.key?(SYS_FIELDS::MD_INDEX)
    end

    # @return true if the system is configured to use a metadata index
    def using_index?
      !@index_driver.nil?
    end

    # Index the provided file_rec and its metadata
    #
    # @param file_rec [FileRecord] file record to index
    def index(file_rec)
      @index_driver.index(file_rec)
    end

    # Remove an entry from the index
    # @param remove_me The record to remove from the index
    def remove(remove_me)
      @index_driver.remove(remove_me)
    end

    # Clear entries from the index.
    # @param older_than Optional cutoff which is passed through to the index driver.
    def clear_index(older_than = nil)
      @index_driver.clear_index(older_than)
    end

    # @return true if the index should be reindexed
    def index_stale?
      @index_driver.is_stale?
    end

    # Setup initial structure of index implementation
    def setup_index
      @index_driver.setup_index
    end

    # Ask the index driver to refresh its stored state information.
    def update_index_state
      @index_driver.update_index_state
    end

    # Retrieves a set of file paths which have one or more services which need to run.
    #
    # @param file_selector [FileSelector] selector for paths to search for files
    # @param stale_datetime [DateTime] find file_paths with services needing to be run before this value
    # @return [Array] array of file paths that need one or more services run, in ascending order by
    #    timestamp.
    def paths_with_stale_services(file_selector, stale_datetime)
      @index_driver.paths_with_stale_services(file_selector, stale_datetime)
    end

    # Retrieves a page of paths for registered files.
    # @param file_selector [FileSelector] selector for what paths to search for files
    # @return [Array] array of file paths that are registered
    def registered_paths(file_selector)
      @index_driver.registered_paths(file_selector)
    end

    # Passes the provided block through to the index driver, which calls it with registered paths.
    # @param file_selector [FileSelector] selector for what paths to search for files
    # @param older_than Optional cutoff which is passed through to the index driver.
    def each_registered_path(file_selector, older_than: nil, &block)
      @index_driver.each_registered_path(file_selector, older_than: older_than, &block)
    end

    private

    # Builds the index driver described by the index section of the system configuration.
    # @raise [ConfigurationError] if the adapter or connection details are missing, or the
    #   adapter is not one of the supported types
    def init_index_driver
      # A present but empty index section is handled like one with no adapter
      index_conf = @config[SYS_FIELDS::MD_INDEX] || {}
      adapter = index_conf[SYS_FIELDS::MD_INDEX_ADAPTER]&.downcase

      raise ConfigurationError.new('Must specify an adapter for the metadata index') if adapter.nil?

      adapter = adapter.to_sym

      case adapter
      when :postgres, :mysql, :mysql2, :sqlite, :amalgalite
        page_size = index_conf[SYS_FIELDS::MD_INDEX_PAGE_SIZE]&.to_int

        connection = index_conf[SYS_FIELDS::MD_INDEX_CONNECTION]
        raise ConfigurationError.new("Must specify connection details for index adapter of type '#{adapter}'") if connection.nil?

        # Loaded lazily so the driver's dependencies are only needed when an index is configured
        require 'longleaf/indexing/sequel_index_driver'
        @index_driver = SequelIndexDriver.new(@app_config_manager,
            adapter,
            connection,
            page_size: page_size)
      else
        # adapter can never be nil at this point, so an unsupported value must always be rejected
        raise ConfigurationError.new("Unknown index adapter '#{adapter}' specified.")
      end
    end
  end
end
|
@@ -0,0 +1,324 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'digest/md5'
|
3
|
+
require 'longleaf/events/event_names'
|
4
|
+
require 'longleaf/candidates/file_selector'
|
5
|
+
require 'longleaf/version'
|
6
|
+
require 'longleaf/models/system_config_fields'
|
7
|
+
require 'longleaf/logging'
|
8
|
+
|
9
|
+
module Longleaf
  # Driver for interacting with RDBM based metadata index using the Sequel ORM gem.
  # Users must create the database and credentials for connecting to it in advance,
  # if using a database application that requires creation of databases (ie, not sqlite).
  # The default database name is 'longleaf_metadata_index' but may be overridden.
  #
  # See the Sequel documentation for details about accepted connection parameters:
  # https://github.com/jeremyevans/sequel/blob/master/doc/opening_databases.rdoc
  class SequelIndexDriver
    include Longleaf::Logging

    INDEX_DB_NAME ||= 'longleaf_metadata_index'
    PRESERVE_TBL ||= :preserve_service_times
    INDEX_STATE_TBL ||= :index_state
    DEFAULT_PAGE_SIZE ||= 1000
    # strftime pattern (millisecond precision) for timestamps written to or compared against the index
    TIMESTAMP_FORMAT ||= '%Y-%m-%d %H:%M:%S.%3N'

    # Initialize the index driver
    #
    # @param app_config [ApplicationConfigManager] the application configuration manager
    # @param adapter [String] name of the database adapter to use.
    # @param conn_details Details about the configuration and connection to the database used for the index.
    #   If a string is provided, it will be used as the connection URL and must identify the adapter.
    #   If a hash is provided, it used as the parameters for the database connection.
    # @param page_size [Integer] number of results to retrieve per query when getting candidates
    def initialize(app_config, adapter, conn_details, page_size: nil)
      Sequel.default_timezone = :utc
      @app_config = app_config
      @adapter = adapter
      # Digest of the app config file so we can tell if it changes
      @config_md5 = app_config.config_md5
      @page_size = page_size.nil? || page_size <= 0 ? DEFAULT_PAGE_SIZE : page_size

      if conn_details.is_a?(Hash)
        # Work on a copy so that defaults are not written into the caller's configuration hash
        @conn_details = conn_details.dup
        # Add in the adapter name
        @conn_details['adapter'] = adapter unless @conn_details.key?('adapter')
        # Add in default database name if none was specified
        @conn_details['database'] = INDEX_DB_NAME unless @conn_details.key?('database')
      else
        @conn_details = conn_details
      end
    end

    # Returns true if the application configuration does not match the configuration used for
    # the last reindex.
    def is_stale?
      db_conn[INDEX_STATE_TBL].where(config_md5: @config_md5).count == 0
    end

    # Index the provided file_rec and its metadata, replacing any existing entry for the same path.
    #
    # @param file_rec [FileRecord] file record to index
    def index(file_rec)
      md_rec = file_rec.metadata_record
      storage_loc = file_rec.storage_location

      # Produce a list of service definitions which should apply to the file
      expected_services = @app_config.service_manager.list_service_definitions(
          location: storage_loc.name)

      service_time = convert_iso8601_to_timestamp(
          first_service_execution_timestamp(expected_services, md_rec))
      delay_time = convert_iso8601_to_timestamp(delay_until_timestamp(md_rec))

      # Columns which get overwritten when the path is already present in the index
      update_values = {
        storage_location: storage_loc.name,
        service_time: service_time,
        delay_until_time: delay_time,
        updated: Time.now.utc.strftime(TIMESTAMP_FORMAT)
      }
      insert_values = { file_path: file_rec.path }.merge(update_values)

      if @adapter == :mysql || @adapter == :mysql2
        preserve_tbl.on_duplicate_key_update.insert(insert_values)
      else
        preserve_tbl.insert_conflict(target: :file_path, update: update_values)
            .insert(insert_values)
      end
    end

    # Find the earliest service execution time for any services expected to be run for the specified file.
    #
    # @param expected_services [Array] list of ServiceDefinition objects expected for specified file.
    # @param md_rec [MetadataRecord] metadata record for the file being evaluated
    # @return The timestamp of the earliest service execution time for the file described by md_rec, in iso8601 format.
    #   Returns nil if no services are expected, all services have already run and do not have a next occurrence, or
    #   the file is deregistered.
    def first_service_execution_timestamp(expected_services, md_rec)
      return nil if md_rec.deregistered?

      current_time = Time.now.utc.iso8601(3)
      present_services = md_rec.list_services

      service_times = []
      expected_services.each do |service_def|
        service_name = service_def.name
        # Service has never run, set execution time to now
        unless present_services.include?(service_name)
          service_times << current_time
          next
        end

        service_rec = md_rec.service(service_name)

        # Service either needs a run or has no timestamp, so execution time of now
        if service_rec.run_needed || service_rec.timestamp.nil?
          service_times << current_time
          next
        end

        # Calculate the next time this service should run based on frequency.
        # A service which already ran and has no frequency contributes no execution time.
        frequency = service_def.frequency
        unless frequency.nil?
          service_times << ServiceDateHelper.add_to_timestamp(service_rec.timestamp, frequency)
        end
      end

      # Return the lowest service execution time. The candidates are compared as parsed times,
      # since iso8601 strings with differing precision or offsets do not sort chronologically.
      service_times.min_by { |stamp| Time.iso8601(stamp) }
    end

    # @param md_rec [MetadataRecord] metadata record for the file being evaluated
    # @return The failure timestamp of the first listed service which has one, or the
    #   minimum timestamp if no service has a failure timestamp.
    def delay_until_timestamp(md_rec)
      md_rec.list_services.each do |service_name|
        failure_timestamp = md_rec.service(service_name).failure_timestamp
        return failure_timestamp unless failure_timestamp.nil?
      end
      # return lowest possible date
      minimum_timestamp
    end

    # Remove an entry from the index. A warning is logged if no entry was deleted.
    # @param remove_me The record to remove from the index. May be a FileRecord or a String.
    def remove(remove_me)
      path = remove_me.is_a?(FileRecord) ? remove_me.path : remove_me

      result = preserve_tbl.where(file_path: path).delete
      if result == 0
        logger.warn("Could not remove #{path} from the index, path was not present.")
      end
    end

    # Remove all entries from the index
    # @param older_than [Time] Optional. If provided, only entries that have not been indexed
    #    since before the provided time will be deleted.
    def clear_index(older_than = nil)
      if older_than.nil?
        preserve_tbl.delete
      else
        older_than_timestamp = older_than.utc.strftime(TIMESTAMP_FORMAT)
        preserve_tbl.where { updated < older_than_timestamp }.delete
      end
    end

    # Initialize the index's database using the provided configuration.
    # Both tables are created with create_table!, which replaces any existing table.
    def setup_index
      # mysql does not support 'text' fields as primary keys
      mysql = @adapter == :mysql || @adapter == :mysql2
      path_column_opts = if mysql
                           { primary_key: true, size: 768 }
                         else
                           { primary_key: true, text: true }
                         end

      # Create the table for tracking when files will need preservation services run on them.
      db_conn.create_table!(PRESERVE_TBL) do
        String :file_path, path_column_opts
        column :storage_location, 'varchar(128)'
        column :service_time, 'timestamp(3)', { :null => true }
        column :delay_until_time, 'timestamp(3)'
        column :updated, 'timestamp(3)'
      end

      # Setup database indexes
      case @adapter
      when :postgres
        db_conn.run("CREATE INDEX service_times_file_path_text_index ON preserve_service_times (file_path text_pattern_ops)")
      when :sqlite, :amalgalite
        db_conn.run("CREATE INDEX service_times_file_path_text_index ON preserve_service_times (file_path collate nocase)")
      end
      db_conn.run("CREATE INDEX service_times_storage_location_index ON preserve_service_times (storage_location)")

      # Create table for tracking the state of the index
      db_conn.create_table!(INDEX_STATE_TBL) do
        String :config_md5
        DateTime :last_reindexed
        String :longleaf_version
      end

      # Prepopulate the index state information
      update_index_state
    end

    # Updates the state information for the index to indicate that the index has been refreshed
    # or is in sync with the application's configuration.
    def update_index_state
      index_state_tbl = db_conn[INDEX_STATE_TBL]
      # Only a single state row is kept, so clear out any previous state first
      index_state_tbl.delete
      index_state_tbl.insert(
          config_md5: @config_md5,
          last_reindexed: Time.now.utc,
          longleaf_version: Longleaf::VERSION)
    end

    # Retrieves page of file paths which have one or more services which need to run.
    # @param file_selector [FileSelector] selector for what paths to search for files
    # @param stale_datetime [DateTime] find file_paths with services needing to be run before this value
    # @return [Array] array of file paths that need one or more services run.
    def paths_with_stale_services(file_selector, stale_datetime)
      @preserve_dataset ||= db_conn
          .from(PRESERVE_TBL)
          .exclude(service_time: nil)
          .limit(@page_size)
          .order(Sequel.asc(:service_time))

      # retrieve and return a page of results
      add_path_restrictions(@preserve_dataset, file_selector)
          .where { service_time <= stale_datetime }
          .where { delay_until_time < stale_datetime }
          .select_map(:file_path)
    end

    # Retrieves a page of paths for registered files.
    # @param file_selector [FileSelector] selector for what paths to search for files
    # @return [Array] array of file paths that are registered
    def registered_paths(file_selector)
      # retrieve and return a page of results
      add_path_restrictions(registered_dataset, file_selector)
          .select_map(:file_path)
    end

    # Calls the provided block once for each registered file path.
    # Must be passed a block.
    # @param file_selector [FileSelector] selector for what paths to search for files
    # @param older_than [Time] Optional. If provided, only files that have not been
    #    indexed since before this timestamp will be returned.
    def each_registered_path(file_selector, older_than: nil, &block)
      # Deliberately not built from registered_dataset: that dataset carries a limit of one
      # page, and paged_each treats an existing limit as the total number of rows to yield.
      # The file_path tie breaker keeps the paging order deterministic.
      base_dataset = db_conn
          .from(PRESERVE_TBL)
          .order(Sequel.asc(:service_time), Sequel.asc(:file_path))
      dataset = add_path_restrictions(base_dataset, file_selector)
          .select(:file_path)
      unless older_than.nil?
        older_than_timestamp = older_than.utc.strftime(TIMESTAMP_FORMAT)
        dataset = dataset.where { updated < older_than_timestamp }
      end
      # Yield to the provided block once per row returned
      dataset.paged_each(:rows_per_fetch => @page_size) do |row|
        block.call(row[:file_path])
      end
    end

    private

    # @return the database connection, which is opened on first use
    def db_conn
      @connection ||= Sequel.connect(@conn_details)
    end

    # @return the dataset for the table of preservation service times
    def preserve_tbl
      @preserve_tbl ||= db_conn[PRESERVE_TBL]
    end

    # Restrict the dataset to the entries identified by the file selector.
    # @param dataset the dataset to add restrictions to
    # @param file_selector [FileSelector] selector for what paths to search for files
    # @return the dataset with restrictions added
    def add_path_restrictions(dataset, file_selector)
      if file_selector.specificity == FileSelector::SPECIFICITY_STORAGE_LOCATION
        dataset.where(storage_location: file_selector.storage_locations)
      else
        # Reformat all selected paths into LIKE patterns. Any LIKE wildcards which occur in
        # a path are escaped so they only match literally. Paths ending with a slash become
        # prefix matches, all other paths must match in full.
        path_conds = file_selector.target_paths.map do |path|
          escaped = dataset.escape_like(path)
          path.end_with?('/') ? escaped + '%' : escaped
        end
        dataset.where(Sequel.like(:file_path, *path_conds))
      end
    end

    # @param iso8601 [String] timestamp in iso8601 format, or nil
    # @return [String] the timestamp converted to UTC in the format used by the index,
    #   or nil if no timestamp was provided
    def convert_iso8601_to_timestamp(iso8601)
      return nil if iso8601.nil?
      # Normalize to UTC, since the input may carry any offset and all other index times are UTC
      Time.iso8601(iso8601).utc.strftime(TIMESTAMP_FORMAT)
    end

    # @return [String] formatted timestamp for the unix epoch, used as the lowest possible date
    def minimum_timestamp
      @min_timestamp ||= ServiceDateHelper.formatted_timestamp(Time.at(0).utc)
    end

    # @return dataset of one page of index entries, in ascending order by service time
    def registered_dataset
      @registered_dataset ||= db_conn
          .from(PRESERVE_TBL)
          .limit(@page_size)
          .order(Sequel.asc(:service_time))
    end
  end
end
|