elasticgraph-datastore_core 0.18.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,232 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "date"
10
+ require "elastic_graph/datastore_core/index_config_normalizer"
11
+ require "elastic_graph/datastore_core/index_definition/base"
12
+ require "elastic_graph/datastore_core/index_definition/index"
13
+ require "elastic_graph/datastore_core/index_definition/rollover_index"
14
+ require "elastic_graph/error"
15
+ require "elastic_graph/support/memoizable_data"
16
+ require "elastic_graph/support/time_set"
17
+ require "elastic_graph/support/time_util"
18
+ require "time"
19
+
20
+ module ElasticGraph
21
+ class DatastoreCore
22
+ module IndexDefinition
23
+ class RolloverIndexTemplate < Support::MemoizableData.define(
24
+ :name, :route_with, :default_sort_clauses, :current_sources, :fields_by_path, :env_index_config,
25
+ :index_args, :defined_clusters, :datastore_clients_by_name, :timestamp_field_path, :frequency
26
+ )
27
+ # `Data.define` provides all these methods:
28
+ # @dynamic name, route_with, default_sort_clauses, current_sources, fields_by_path, env_index_config,
29
+ # @dynamic index_args, defined_clusters, datastore_clients_by_name, timestamp_field_path, frequency, initialize
30
+
31
+ # `include IndexDefinition::Base` provides all these methods. Steep should be able to detect it
32
+ # but can't for some reason so we have to declare them with `@dynamic`.
33
+ # @dynamic flattened_env_setting_overrides, routing_value_for_prepared_record, has_custom_routing?, cluster_to_query, use_updates_for_indexing?
34
+ # @dynamic clusters_to_index_into, all_accessible_cluster_names, ignored_values_for_routing, searches_could_hit_incomplete_docs?
35
+ # @dynamic accessible_cluster_names_to_index_into, accessible_from_queries?, known_related_query_rollover_indices, list_counts_field_paths_for_source
36
+ include IndexDefinition::Base
37
+
38
# Fetches this template from the datastore via `datastore_client` and returns its
# mappings after passing them through `IndexConfigNormalizer.normalize_mappings`.
# When the fetched template has no `template.mappings` entry, an empty hash is normalized instead.
def mappings_in_datastore(datastore_client)
  template = datastore_client.get_index_template(name)
  raw_mappings = template.dig("template", "mappings") || {}

  IndexConfigNormalizer.normalize_mappings(raw_mappings)
end
43
+
44
+ # We need to delete both the template and the actual indices for rollover indices
45
# Removes the index template first, and then every index matching this template's
# wildcard search expression.
def delete_from_datastore(datastore_client)
  datastore_client.delete_index_template(name)

  matching_indices_expression = index_expression_for_search
  datastore_client.delete_indices(matching_indices_expression)
end
49
+
50
+ # Indicates if this is a rollover index definition.
51
+ #
52
+ # Use of this is considered a mild code smell. When feasible, it's generally better to
53
+ # implement a new polymorphic API on the IndexDefinition interface, rather
54
+ # than branching on the value of this predicate.
55
# Always `true`: this class is the rollover flavor of an index definition.
def rollover_index_template? = true
58
+
59
+ # Two underscores used to avoid collisions
60
+ # with other types (e.g. payments_2020 and payments_xyz_2020), though regardless shouldn't
61
+ # happen if types follow naming conventions.
62
# Wildcard expression: the template name, the rollover infix marker, and `*`.
def index_expression_for_search
  "#{name}#{ROLLOVER_INDEX_INFIX_MARKER}*"
end
65
+
66
+ # Returns an index name to use for write operations. The index_definition selection is a function of
67
+ # the index_definition's rollover configuration and the record's timestamp.
68
# Builds the name of the concrete index that `record` should be written to.
#
# `timestamp_field_path` optionally overrides the path used to read the record's
# timestamp; when it is `nil` (the default) the template's own `timestamp_field_path` is used.
def index_name_for_writes(record, timestamp_field_path: nil)
  effective_path = timestamp_field_path || self.timestamp_field_path
  suffix = rollover_index_suffix_for_record(record, timestamp_field_path: effective_path)

  index_name_with_suffix(suffix)
end
74
+
75
+ # Returns a list of indices related to this template. This includes both indices that are
76
+ # specified in our configuration settings (e.g. via `setting_overrides_by_timestamp` and
77
+ # `custom_time_sets`) and also indices that have been auto-created from the template.
78
+ #
79
+ # Note that there can be discrepancies between the configuration settings and the indices in
80
+ # the database. Sometimes this is planned/expected (e.g. such as when invoking `elasticgraph-admin`
81
+ # to configure an index newly defined in configuration) and in other cases it's not.
82
+ #
83
+ # The `only_if_exists` argument controls how a discrepancy is treated.
84
+ #
85
+ # - When `false` (the default), indices that are defined in config but do not exist in the datastore are still returned.
86
+ # This is generally what we want for indexing and cluster administration.
87
+ # - When `true`, any indices in our configuration that do not exist are ignored, and not included in the returned list.
88
+ # This is appropriate for searching the datastore: if we attempt to exclude an index which is defined in config but does
89
+ # not exist (e.g. via `-[index_name]` in the search index expression), the datastore will return an error, but we can
90
+ # safely ignore the index. Likewise, if we have an index in the datastore which we cannot infer a timestamp range, we
91
+ # need to ignore it to avoid getting errors. Ignoring an index is safe when searching because our search logic uses a
92
+ # wildcard to match _all_ indices with the same prefix, and then excludes certain known indices that it can safely
93
+ # exclude based on their timestamp range. Ignored indices which exist will still be searched.
94
+ #
95
+ # In addition, any indices which exist, but which are not controlled by our current configuration, are ignored. Examples:
96
+ #
97
+ # - An index with a custom suffix (e.g. `__before_2019`) which has no corresponding configuration. We have no way to guess
98
+ # what the timestamp range is for such an index, and we want to completely ignore it.
99
+ # - An index for a different rollover frequency than our current configuration. For example, a `__2019-03` index,
100
+ # which must rollover monthly, would be ignored if our current rollover frequency is yearly or daily.
101
+ #
102
+ # These latter cases are quite rare but can happen when we are dealing with indices defined before an update to our
103
+ # configuration. Our searches will continue to search these indices so long as their name matches the pattern, and
104
+ # we otherwise want to ignore these indices (e.g. we don't want admin to attempt to configure them, or want our
105
+ # indexer to attempt to write to them).
106
# Combines the indices that exist in the datastore (and whose time set is known or can be
# inferred) with the pre-created indices defined in configuration. When `only_if_exists`
# is true, configured indices that were not found in the datastore are dropped.
# On a name collision the configured index wins over the one built from the datastore listing.
def related_rollover_indices(datastore_client, only_if_exists: false)
  configured = rollover_indices_to_pre_create.to_h { |index| [index.name, index] }

  existing = {}
  datastore_client.list_indices_matching(index_expression_for_search).each do |index_name|
    # Prefer the configured time set; otherwise it gets inferred from the index name.
    configured_time_set = configured[index_name]&.time_set
    index = concrete_rollover_index_for(index_name, {}, configured_time_set)
    existing[index_name] = index if index
  end

  configured = configured.slice(*existing.keys) if only_if_exists

  existing.merge(configured).values
end
118
+
119
+ # Gets a single related `RolloverIndex` for a given timestamp.
120
# Builds the `RolloverIndex` that a record with the given `timestamp` would be written to,
# applying `setting_overrides` on top of the configured setting overrides.
def related_rollover_index_for_timestamp(timestamp, setting_overrides = {})
  path_segments = timestamp_field_path.split(".")

  # Wrap the timestamp in nested hashes, innermost field first, so that it sits at
  # `timestamp_field_path` in a synthetic record.
  #
  # @type var record: ::Hash[::String, untyped]
  # `__skip__` is needed because the accumulator changes type between iterations
  # (a string at first, a hash afterwards), which Steep cannot type.
  __skip__ = record = path_segments.reverse_each.inject(timestamp) do |nested, segment|
    {segment => nested}
  end

  index_name = index_name_for_writes(record)
  concrete_rollover_index_for(index_name, setting_overrides)
end
131
+
132
+ private
133
+
134
# Validates the rollover configuration: a timestamp field path must be present and the
# frequency must be one of the keys of `ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY`.
# Raises `SchemaError` otherwise.
def after_initialize
  return if timestamp_field_path && ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY.key?(frequency)

  raise SchemaError, "Rollover index config 'timestamp_field' or 'frequency' is invalid."
end
139
+
140
+ # Returns a list of indices that must be pre-created (rather than allowing them to be
141
+ # created lazily based on the template). This is done so that we can use different
142
+ # index settings for some indices. For example, you might want your template to be
143
+ # configured to use 5 shards, but for old months with a small data set you may only
144
+ # want to use 1 shard.
145
# Memoized list of the indices that need to exist up front: one per entry in
# `setting_overrides_by_timestamp`, followed by one per custom timestamp range.
# Entries for which no index can be built are left out.
def rollover_indices_to_pre_create
  @rollover_indices_to_pre_create ||= [
    *setting_overrides_by_timestamp.filter_map do |(timestamp, overrides)|
      related_rollover_index_for_timestamp(timestamp, overrides)
    end,
    *custom_timestamp_ranges.filter_map do |range|
      range_index_name = index_name_with_suffix(range.index_name_suffix)
      concrete_rollover_index_for(range_index_name, range.setting_overrides, range.time_set)
    end
  ]
end
162
+
163
# Shortcut to the `setting_overrides_by_timestamp` of the environment index config.
def setting_overrides_by_timestamp = env_index_config.setting_overrides_by_timestamp
166
+
167
# Shortcut to the `custom_timestamp_ranges` of the environment index config.
def custom_timestamp_ranges = env_index_config.custom_timestamp_ranges
170
+
171
# Joins the template name, the rollover infix marker and `suffix` into an index name.
def index_name_with_suffix(suffix)
  format("%s%s%s", name, ROLLOVER_INDEX_INFIX_MARKER, suffix)
end
174
+
175
# `strftime` format used to build the index name suffix for each supported rollover frequency.
ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY = {
  hourly: "%Y-%m-%d-%H",
  daily: "%Y-%m-%d",
  monthly: "%Y-%m",
  yearly: "%Y"
}.freeze

# Number of dash-separated time elements in the suffix for each frequency (derived from the formats above).
ROLLOVER_TIME_ELEMENT_COUNTS_BY_FREQUENCY = ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY
  .transform_values { |format| format.split("-").size }
  .freeze

# Unit of time covered by a single index for each frequency.
TIME_UNIT_BY_FREQUENCY = {hourly: :hour, daily: :day, monthly: :month, yearly: :year}.freeze
178
+
179
# Computes the index name suffix for `record`: reads the ISO 8601 timestamp found at
# `timestamp_field_path`, and returns the suffix of the matching custom timestamp range if
# there is one, or otherwise the timestamp formatted according to the rollover frequency.
#
# NOTE(review): the timestamp is formatted without an explicit conversion to UTC, whereas
# `infer_time_set_from_index_name` builds its ranges with `Time.utc` -- presumably record
# timestamps are already normalized to UTC upstream; confirm.
def rollover_index_suffix_for_record(record, timestamp_field_path:)
  raw_timestamp = Support::HashUtil.fetch_value_at_path(record, timestamp_field_path)
  timestamp_value = ::DateTime.iso8601(raw_timestamp).to_time

  custom_range = env_index_config.custom_timestamp_range_for(timestamp_value)

  if custom_range
    custom_range.index_name_suffix
  else
    timestamp_value.strftime(ROLLOVER_SUFFIX_FORMATS_BY_FREQUENCY[frequency])
  end
end
190
+
191
# Builds a `RolloverIndex` named `index_name`. The given `setting_overrides` are merged on
# top of the configured setting overrides. When `time_set` is not given it is inferred from
# the index name; returns `nil` if no time set can be determined.
def concrete_rollover_index_for(index_name, setting_overrides, time_set = nil)
  resolved_time_set = time_set || infer_time_set_from_index_name(index_name)
  return nil if resolved_time_set.nil?

  concrete_env_config = env_index_config.without_env_overrides.with(
    setting_overrides: env_index_config.setting_overrides.merge(setting_overrides)
  )
  concrete_index_args = index_args.merge(name: index_name, env_index_config: concrete_env_config)

  RolloverIndex.new(Index.new(**concrete_index_args), resolved_time_set)
end
204
+
205
# Infers the time range covered by an index from the time elements in its name suffix.
#
# Returns a time set covering one rollover unit (hour/day/month/year, per `frequency`)
# starting at the UTC time encoded in the suffix, or `nil` when no time set can be inferred.
def infer_time_set_from_index_name(index_name)
  time_args = index_name.split(ROLLOVER_INDEX_INFIX_MARKER).last.to_s.split("-")

  # Verify that the index is for the same rollover frequency as we are currently configured to use.
  # If not, return `nil` because we can't accurately infer the time set without the frequency aligning
  # with the index itself.
  #
  # This can happen when we are migrating from one index frequency to another.
  return nil unless time_args.size == ROLLOVER_TIME_ELEMENT_COUNTS_BY_FREQUENCY.fetch(frequency)

  # Verify that the args are all numeric. If not, return `nil` because we have no idea what the
  # time set for the index is.
  #
  # This can happen when we are migrating from one index configuration to another while also using
  # custom timestamp ranges (e.g. to have a `__before_2020` index).
  return nil unless time_args.all? { |arg| /\A\d+\z/.match?(arg) }

  # Numeric parts can still be out of range (e.g. a `__2019-13` index): `Time.utc` raises
  # `ArgumentError` for those. Such an index has no inferable time set, so treat it like the
  # cases above instead of letting the error escape.
  #
  # Steep can't type the dynamic nature of `*time_args` so we have to use `__skip__` here.
  # @type var lower_bound: ::Time
  __skip__ = lower_bound = begin
    ::Time.utc(*time_args)
  rescue ::ArgumentError
    return nil
  end

  upper_bound = Support::TimeUtil.advance_one_unit(lower_bound, TIME_UNIT_BY_FREQUENCY.fetch(frequency))

  Support::TimeSet.of_range(gte: lower_bound, lt: upper_bound)
end
229
+ end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,51 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/datastore_core/index_definition/index"
10
+ require "elastic_graph/datastore_core/index_definition/rollover_index_template"
11
+ require "elastic_graph/error"
12
+
13
+ module ElasticGraph
14
+ class DatastoreCore
15
+ # Represents the definition of a datastore index (or rollover template).
16
+ # Intended to be an entry point for working with datastore indices.
17
+ #
18
+ # This module contains common implementation logic for both the rollover and non-rollover
19
+ # case, as well as a `with` factory method.
20
+ module IndexDefinition
21
# Factory for index definitions. Looks up the index configuration for `name` in `config`
# (raising `ConfigError` if there is none) and builds a `RolloverIndexTemplate` when the
# runtime metadata has rollover settings, or a plain `Index` otherwise.
def self.with(name:, runtime_metadata:, config:, datastore_clients_by_name:)
  env_index_config = config.index_definitions[name]

  if env_index_config.nil?
    raise ConfigError, "Configuration does not provide an index definition for `#{name}`, " \
      "but it is required so we can identify the datastore cluster(s) to query and index into."
  end

  shared_args = {
    name: name,
    route_with: runtime_metadata.route_with,
    default_sort_clauses: runtime_metadata.default_sort_fields.map(&:to_query_clause),
    current_sources: runtime_metadata.current_sources,
    fields_by_path: runtime_metadata.fields_by_path,
    env_index_config: env_index_config,
    defined_clusters: config.clusters.keys.to_set,
    datastore_clients_by_name: datastore_clients_by_name
  }

  rollover = runtime_metadata.rollover
  return Index.new(**shared_args) unless rollover

  # The template also receives the shared args as `index_args`.
  RolloverIndexTemplate.new(
    timestamp_field_path: rollover.timestamp_field_path,
    frequency: rollover.frequency,
    index_args: shared_args,
    **shared_args
  )
end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,100 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/datastore_core/config"
10
+ require "elastic_graph/schema_artifacts/from_disk"
11
+ require "elastic_graph/support/logger"
12
+
13
+ module ElasticGraph
14
+ # The entry point into this library. Create an instance of this class to get access to
15
+ # the public interfaces provided by this library.
16
+ class DatastoreCore
17
+ # @dynamic config, schema_artifacts, logger, client_customization_block
18
+ attr_reader :config, :schema_artifacts, :logger, :client_customization_block
19
+
20
# Builds a `DatastoreCore` from already-parsed YAML settings. The config, logger and schema
# artifacts are each derived from `parsed_yaml`; `for_context` is forwarded to the schema
# artifacts loader, and the optional block is stored as the client customization block.
def self.from_parsed_yaml(parsed_yaml, for_context:, &client_customization_block)
  config = DatastoreCore::Config.from_parsed_yaml(parsed_yaml)
  logger = Support::Logger.from_parsed_yaml(parsed_yaml)
  schema_artifacts = SchemaArtifacts.from_parsed_yaml(parsed_yaml, for_context: for_context)

  new(
    config: config,
    logger: logger,
    schema_artifacts: schema_artifacts,
    client_customization_block: client_customization_block
  )
end
28
+
29
# Stores the given collaborators. `clients_by_name` may be supplied up front; when it is
# left `nil`, the clients are built lazily by `#clients_by_name`.
# `client_customization_block`, if given, is passed as the block when each client is built.
def initialize(config:, logger:, schema_artifacts:, clients_by_name: nil, client_customization_block: nil)
  @config, @logger, @schema_artifacts = config, logger, schema_artifacts
  @clients_by_name = clients_by_name
  @client_customization_block = client_customization_block
end
42
+
43
+ # Exposes the datastore index definitions as a map, keyed by index definition name.
44
# Memoized map of index definition name to index definition, built from the index
# definitions listed in the schema artifacts' runtime metadata.
def index_definitions_by_name
  return @index_definitions_by_name if @index_definitions_by_name

  # Loaded lazily, on first use.
  require "elastic_graph/datastore_core/index_definition"

  metadata_by_name = schema_artifacts.runtime_metadata.index_definitions_by_name

  @index_definitions_by_name = metadata_by_name.to_h do |name, index_def_metadata|
    [
      name,
      IndexDefinition.with(
        name: name,
        runtime_metadata: index_def_metadata,
        config: config,
        datastore_clients_by_name: clients_by_name
      )
    ]
  end
end
59
+
60
+ # Exposes the datastore index definitions as a map, keyed by GraphQL type.
61
+ # Note: the GraphQL type name is also used in non-GraphQL contexts (e.g. it is
62
+ # used in events processed by elasticgraph-indexer), so we expose this here instead
63
+ # of from elasticgraph-graphql.
64
# Memoized map of object type name to the list of index definitions named by that type's
# runtime metadata. Raises `KeyError` if a type names an unknown index definition.
def index_definitions_by_graphql_type
  @index_definitions_by_graphql_type ||= begin
    types_by_name = schema_artifacts.runtime_metadata.object_types_by_name

    types_by_name.transform_values do |type_metadata|
      type_metadata.index_definition_names.map { |index_name| index_definitions_by_name.fetch(index_name) }
    end
  end
end
74
+
75
+ # Exposes the datastore clients in a map, keyed by cluster name.
76
# Memoized map of cluster name to datastore client, with one client per configured cluster.
# If clients were passed to the constructor, those are returned instead of building new ones.
def clients_by_name
  @clients_by_name ||= begin
    faraday_adapter = config.client_faraday_adapter

    # Load the library that provides the configured Faraday adapter, if one is specified.
    adapter_lib = faraday_adapter&.require
    require adapter_lib if adapter_lib

    adapter_name = faraday_adapter&.name
    # Clients only receive a logger when traffic logging is enabled.
    traffic_logger = (logger if config.log_traffic)

    config.clusters.to_h do |cluster_name, cluster_def|
      built_client = cluster_def.backend_client_class.new(
        cluster_name,
        faraday_adapter: adapter_name,
        url: cluster_def.url,
        logger: traffic_logger,
        retry_on_failure: config.max_client_retries,
        &@client_customization_block
      )

      [cluster_name, built_client]
    end
  end
end
99
+ end
100
+ end