elasticgraph-datastore_core 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "date"
10
+ require "elastic_graph/datastore_core/index_config_normalizer"
11
+ require "elastic_graph/datastore_core/index_definition/base"
12
+ require "elastic_graph/datastore_core/index_definition/index"
13
+ require "elastic_graph/datastore_core/index_definition/rollover_index"
14
+ require "elastic_graph/error"
15
+ require "elastic_graph/support/memoizable_data"
16
+ require "elastic_graph/support/time_set"
17
+ require "elastic_graph/support/time_util"
18
+ require "time"
19
+
20
+ module ElasticGraph
21
+ class DatastoreCore
22
+ module IndexDefinition
23
+ class RolloverIndexTemplate < Support::MemoizableData.define(
24
+ :name, :route_with, :default_sort_clauses, :current_sources, :fields_by_path, :env_index_config,
25
+ :index_args, :defined_clusters, :datastore_clients_by_name, :timestamp_field_path, :frequency
26
+ )
27
+ # `Data.define` provides all these methods:
28
+ # @dynamic name, route_with, default_sort_clauses, current_sources, fields_by_path, env_index_config,
29
+ # @dynamic index_args, defined_clusters, datastore_clients_by_name, timestamp_field_path, frequency, initialize
30
+
31
+ # `include IndexDefinition::Base` provides all these methods. Steep should be able to detect it
32
+ # but can't for some reason so we have to declare them with `@dynamic`.
33
+ # @dynamic flattened_env_setting_overrides, routing_value_for_prepared_record, has_custom_routing?, cluster_to_query, use_updates_for_indexing?
34
+ # @dynamic clusters_to_index_into, all_accessible_cluster_names, ignored_values_for_routing, searches_could_hit_incomplete_docs?
35
+ # @dynamic accessible_cluster_names_to_index_into, accessible_from_queries?, known_related_query_rollover_indices, list_counts_field_paths_for_source
36
+ include IndexDefinition::Base
37
+
38
+ def mappings_in_datastore(datastore_client)
39
+ IndexConfigNormalizer.normalize_mappings(
40
+ datastore_client.get_index_template(name).dig("template", "mappings") || {}
41
+ )
42
+ end
43
+
44
+ # We need to delete both the template and the actual indices for rollover indices
45
+ def delete_from_datastore(datastore_client)
46
+ datastore_client.delete_index_template(name)
47
+ datastore_client.delete_indices(index_expression_for_search)
48
+ end
49
+
50
+ # Indicates if this is a rollover index definition.
51
+ #
52
+ # Use of this is considered a mild code smell. When feasible, it's generally better to
53
+ # implement a new polymorphic API on the IndexDefinition interface, rather
54
+ # than branching on the value of this predicate.
55
+ def rollover_index_template?
56
+ true
57
+ end
58
+
59
+ # Two underscores are used to avoid collisions
60
+ # with other types (e.g. payments_2020 and payments_xyz_2020), though such collisions shouldn't
61
+ # happen anyway if types follow naming conventions.
62
+ def index_expression_for_search
63
+ index_name_with_suffix("*")
64
+ end
65
+
66
+ # Returns an index name to use for write operations. The index_definition selection is a function of
67
+ # the index_definition's rollover configuration and the record's timestamp.
68
+ def index_name_for_writes(record, timestamp_field_path: nil)
69
+ index_name_with_suffix(rollover_index_suffix_for_record(
70
+ record,
71
+ timestamp_field_path: timestamp_field_path || self.timestamp_field_path
72
+ ))
73
+ end
74
+
75
+ # Returns a list of indices related to this template. This includes both indices that are
76
+ # specified in our configuration settings (e.g. via `setting_overrides_by_timestamp` and
77
+ # `custom_time_sets`) and also indices that have been auto-created from the template.
78
+ #
79
+ # Note that there can be discrepancies between the configuration settings and the indices in
80
+ # the database. Sometimes this is planned/expected (e.g. when invoking `elasticgraph-admin`
81
+ # to configure an index newly defined in configuration) and in other cases it's not.
82
+ #
83
+ # The `only_if_exists` argument controls how a discrepancy is treated.
84
+ #
85
+ # - When `false` (the default), indices that are defined in config but do not exist in the datastore are still returned.
86
+ # This is generally what we want for indexing and cluster administration.
87
+ # - When `true`, any indices in our configuration that do not exist are ignored, and not included in the returned list.
88
+ # This is appropriate for searching the datastore: if we attempt to exclude an index which is defined in config but does
89
+ # not exist (e.g. via `-[index_name]` in the search index expression), the datastore will return an error, but we can
90
+ # safely ignore the index. Likewise, if we have an index in the datastore for which we cannot infer a timestamp range, we
91
+ # need to ignore it to avoid getting errors. Ignoring an index is safe when searching because our search logic uses a
92
+ # wildcard to match _all_ indices with the same prefix, and then excludes certain known indices that it can safely
93
+ # exclude based on their timestamp range. Ignored indices which exist will still be searched.
94
+ #
95
+ # In addition, any indices which exist, but which are not controlled by our current configuration, are ignored. Examples:
96
+ #
97
+ # - An index with a custom suffix (e.g. `__before_2019`) which has no corresponding configuration. We have no way to guess
98
+ # what the timestamp range is for such an index, and we want to completely ignore it.
99
+ # - An index for a different rollover frequency than our current configuration. For example, a `__2019-03` index,
100
+ # which must rollover monthly, would be ignored if our current rollover frequency is yearly or daily.
101
+ #
102
+ # These latter cases are quite rare but can happen when we are dealing with indices defined before an update to our
103
+ # configuration. Our searches will continue to search these indices so long as their name matches the pattern, and
104
+ # we otherwise want to ignore these indices (e.g. we don't want admin to attempt to configure them, or want our
105
+ # indexer to attempt to write to them).
106
+ def related_rollover_indices(datastore_client, only_if_exists: false)
107
+ config_indices_by_name = rollover_indices_to_pre_create.to_h { |i| [i.name, i] }
108
+
109
+ db_indices_by_name = datastore_client.list_indices_matching(index_expression_for_search).filter_map do |name|
110
+ index = concrete_rollover_index_for(name, {}, config_indices_by_name[name]&.time_set)
111
+ [name, index] if index
112
+ end.to_h
113
+
114
+ config_indices_by_name = config_indices_by_name.slice(*db_indices_by_name.keys) if only_if_exists
115
+
116
+ db_indices_by_name.merge(config_indices_by_name).values
117
+ end
118
+
119
+ # Gets a single related `RolloverIndex` for a given timestamp.
120
+ def related_rollover_index_for_timestamp(timestamp, setting_overrides = {})
121
+ # @type var record: ::Hash[::String, untyped]
122
+ # We need to use `__skip__` here because `inner_value` has different types on different
123
+ # block iterations: initially, it's a string, then it becomes a hash. Steep has trouble
124
+ # with this but it works fine.
125
+ __skip__ = record = timestamp_field_path.split(".").reverse.reduce(timestamp) do |inner_value, field_name|
126
+ {field_name => inner_value}
127
+ end
128
+
129
+ concrete_rollover_index_for(index_name_for_writes(record), setting_overrides)
130
+ end
131
+
132
+ private
133
+
134
+ def after_initialize
135
+ unless timestamp_field_path && ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY.key?(frequency)
136
+ raise SchemaError, "Rollover index config 'timestamp_field' or 'frequency' is invalid."
137
+ end
138
+ end
139
+
140
+ # Returns a list of indices that must be pre-created (rather than allowing them to be
141
+ # created lazily based on the template). This is done so that we can use different
142
+ # index settings for some indices. For example, you might want your template to be
143
+ # configured to use 5 shards, but for old months with a small data set you may only
144
+ # want to use 1 shard.
145
+ def rollover_indices_to_pre_create
146
+ @rollover_indices_to_pre_create ||= begin
147
+ indices_with_overrides = setting_overrides_by_timestamp.filter_map do |(timestamp, setting_overrides)|
148
+ related_rollover_index_for_timestamp(timestamp, setting_overrides)
149
+ end
150
+
151
+ indices_for_custom_timestamp_ranges = custom_timestamp_ranges.filter_map do |range|
152
+ concrete_rollover_index_for(
153
+ index_name_with_suffix(range.index_name_suffix),
154
+ range.setting_overrides,
155
+ range.time_set
156
+ )
157
+ end
158
+
159
+ indices_with_overrides + indices_for_custom_timestamp_ranges
160
+ end
161
+ end
162
+
163
+ def setting_overrides_by_timestamp
164
+ env_index_config.setting_overrides_by_timestamp
165
+ end
166
+
167
+ def custom_timestamp_ranges
168
+ env_index_config.custom_timestamp_ranges
169
+ end
170
+
171
+ def index_name_with_suffix(suffix)
172
+ "#{name}#{ROLLOVER_INDEX_INFIX_MARKER}#{suffix}"
173
+ end
174
+
175
+ ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY = {hourly: "%Y-%m-%d-%H", daily: "%Y-%m-%d", monthly: "%Y-%m", yearly: "%Y"}
176
+ ROLLOVER_TIME_ELEMENT_COUNTS_BY_FREQUENCY = ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY.transform_values { |format| format.split("-").size }
177
+ TIME_UNIT_BY_FREQUENCY = {hourly: :hour, daily: :day, monthly: :month, yearly: :year}
178
+
179
+ def rollover_index_suffix_for_record(record, timestamp_field_path:)
180
+ timestamp_value = ::DateTime.iso8601(
181
+ Support::HashUtil.fetch_value_at_path(record, timestamp_field_path)
182
+ ).to_time
183
+
184
+ if (matching_custom_range = env_index_config.custom_timestamp_range_for(timestamp_value))
185
+ return matching_custom_range.index_name_suffix
186
+ end
187
+
188
+ timestamp_value.strftime(ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY[frequency])
189
+ end
190
+
191
+ def concrete_rollover_index_for(index_name, setting_overrides, time_set = nil)
192
+ time_set ||= infer_time_set_from_index_name(index_name)
193
+ return nil if time_set.nil?
194
+
195
+ args = index_args.merge({
196
+ name: index_name,
197
+ env_index_config: env_index_config.without_env_overrides.with(
198
+ setting_overrides: env_index_config.setting_overrides.merge(setting_overrides)
199
+ )
200
+ })
201
+
202
+ RolloverIndex.new(Index.new(**args), time_set)
203
+ end
204
+
205
+ def infer_time_set_from_index_name(index_name)
206
+ time_args = index_name.split(ROLLOVER_INDEX_INFIX_MARKER).last.to_s.split("-")
207
+
208
+ # Verify that the index is for the same rollover frequency as we are currently configured to use.
209
+ # If not, return `nil` because we can't accurately infer the time set without the frequency aligning
210
+ # with the index itself.
211
+ #
212
+ # This can happen when we are migrating from one index frequency to another.
213
+ return nil unless time_args.size == ROLLOVER_TIME_ELEMENT_COUNTS_BY_FREQUENCY.fetch(frequency)
214
+
215
+ # Verify that the args are all numeric. If not, return `nil` because we have no idea what the
216
+ # time set for the index is.
217
+ #
218
+ # This can happen when we are migrating from one index configuration to another while also using
219
+ # custom timestamp ranges (e.g. to have a `__before_2020` index).
220
+ return nil if time_args.any? { |arg| /\A\d+\z/ !~ arg }
221
+
222
+ # Steep can't type the dynamic nature of `*time_args` so we have to use `__skip__` here.
223
+ # @type var lower_bound: ::Time
224
+ __skip__ = lower_bound = ::Time.utc(*time_args)
225
+ upper_bound = Support::TimeUtil.advance_one_unit(lower_bound, TIME_UNIT_BY_FREQUENCY.fetch(frequency))
226
+
227
+ Support::TimeSet.of_range(gte: lower_bound, lt: upper_bound)
228
+ end
229
+ end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,51 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/datastore_core/index_definition/index"
10
+ require "elastic_graph/datastore_core/index_definition/rollover_index_template"
11
+ require "elastic_graph/error"
12
+
13
+ module ElasticGraph
14
+ class DatastoreCore
15
+ # Represents the definition of a datastore index (or rollover template).
16
+ # Intended to be an entry point for working with datastore indices.
17
+ #
18
+ # This module contains common implementation logic for both the rollover and non-rollover
19
+ # case, as well as a `with` factory method.
20
+ module IndexDefinition
21
+ def self.with(name:, runtime_metadata:, config:, datastore_clients_by_name:)
22
+ if (env_index_config = config.index_definitions[name]).nil?
23
+ raise ConfigError, "Configuration does not provide an index definition for `#{name}`, " \
24
+ "but it is required so we can identify the datastore cluster(s) to query and index into."
25
+ end
26
+
27
+ common_args = {
28
+ name: name,
29
+ route_with: runtime_metadata.route_with,
30
+ default_sort_clauses: runtime_metadata.default_sort_fields.map(&:to_query_clause),
31
+ current_sources: runtime_metadata.current_sources,
32
+ fields_by_path: runtime_metadata.fields_by_path,
33
+ env_index_config: env_index_config,
34
+ defined_clusters: config.clusters.keys.to_set,
35
+ datastore_clients_by_name: datastore_clients_by_name
36
+ }
37
+
38
+ if (rollover = runtime_metadata.rollover)
39
+ RolloverIndexTemplate.new(
40
+ timestamp_field_path: rollover.timestamp_field_path,
41
+ frequency: rollover.frequency,
42
+ index_args: common_args,
43
+ **common_args
44
+ )
45
+ else
46
+ Index.new(**common_args)
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,100 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/datastore_core/config"
10
+ require "elastic_graph/schema_artifacts/from_disk"
11
+ require "elastic_graph/support/logger"
12
+
13
+ module ElasticGraph
14
+ # The entry point into this library. Create an instance of this class to get access to
15
+ # the public interfaces provided by this library.
16
+ class DatastoreCore
17
+ # @dynamic config, schema_artifacts, logger, client_customization_block
18
+ attr_reader :config, :schema_artifacts, :logger, :client_customization_block
19
+
20
+ def self.from_parsed_yaml(parsed_yaml, for_context:, &client_customization_block)
21
+ new(
22
+ config: DatastoreCore::Config.from_parsed_yaml(parsed_yaml),
23
+ logger: Support::Logger.from_parsed_yaml(parsed_yaml),
24
+ schema_artifacts: SchemaArtifacts.from_parsed_yaml(parsed_yaml, for_context: for_context),
25
+ client_customization_block: client_customization_block
26
+ )
27
+ end
28
+
29
+ def initialize(
30
+ config:,
31
+ logger:,
32
+ schema_artifacts:,
33
+ clients_by_name: nil,
34
+ client_customization_block: nil
35
+ )
36
+ @config = config
37
+ @logger = logger
38
+ @schema_artifacts = schema_artifacts
39
+ @clients_by_name = clients_by_name
40
+ @client_customization_block = client_customization_block
41
+ end
42
+
43
+ # Exposes the datastore index definitions as a map, keyed by index definition name.
44
+ def index_definitions_by_name
45
+ @index_definitions_by_name ||= begin
46
+ require "elastic_graph/datastore_core/index_definition"
47
+ schema_artifacts.runtime_metadata.index_definitions_by_name.to_h do |name, index_def_metadata|
48
+ index_def = IndexDefinition.with(
49
+ name: name,
50
+ runtime_metadata: index_def_metadata,
51
+ config: config,
52
+ datastore_clients_by_name: clients_by_name
53
+ )
54
+
55
+ [name, index_def]
56
+ end
57
+ end
58
+ end
59
+
60
+ # Exposes the datastore index definitions as a map, keyed by GraphQL type.
61
+ # Note: the GraphQL type name is also used in non-GraphQL contexts (e.g. it is
62
+ # used in events processed by elasticgraph-indexer), so we expose this here instead
63
+ # of from elasticgraph-graphql.
64
+ def index_definitions_by_graphql_type
65
+ @index_definitions_by_graphql_type ||= schema_artifacts
66
+ .runtime_metadata
67
+ .object_types_by_name
68
+ .transform_values do |metadata|
69
+ metadata.index_definition_names.map do |name|
70
+ index_definitions_by_name.fetch(name)
71
+ end
72
+ end
73
+ end
74
+
75
+ # Exposes the datastore clients in a map, keyed by cluster name.
76
+ def clients_by_name
77
+ @clients_by_name ||= begin
78
+ if (adapter_lib = config.client_faraday_adapter&.require)
79
+ require adapter_lib
80
+ end
81
+
82
+ adapter_name = config.client_faraday_adapter&.name
83
+ client_logger = config.log_traffic ? logger : nil
84
+
85
+ config.clusters.to_h do |name, cluster_def|
86
+ client = cluster_def.backend_client_class.new(
87
+ name,
88
+ faraday_adapter: adapter_name,
89
+ url: cluster_def.url,
90
+ logger: client_logger,
91
+ retry_on_failure: config.max_client_retries,
92
+ &@client_customization_block
93
+ )
94
+
95
+ [name, client]
96
+ end
97
+ end
98
+ end
99
+ end
100
+ end