elasticgraph-indexer 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1bb7a4a0daded4c04bfc69e3142f75606b56a3044f088d95198c7998a85acc9c
4
+ data.tar.gz: f2e2e739ad2f785675a3678d2feab5d5f056f5dd61f5886196572f4c10193df6
5
+ SHA512:
6
+ metadata.gz: be15a227ac25072f1562f8ae850a65b12d46c66fb866b882c9bd2e1ac9581a69a8e08f7f53f3d5e630b9842fbe69b0f7c88208bac915844bb185b2a217d20df0
7
+ data.tar.gz: a5e808366a2e445257f61cf75dc4d8c4921be0ede5317bd0b620db92e4684c7daef63c0d015500dada497babd61e6f4c278febccfe140ac3b8372069e34eaadf
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2024 Block, Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1 @@
1
+ # ElasticGraph::Indexer
@@ -0,0 +1,24 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require_relative "../gemspec_helper"
10
+
11
+ ElasticGraphGemspecHelper.define_elasticgraph_gem(gemspec_file: __FILE__, category: :core) do |spec, eg_version|
12
+ spec.summary = "ElasticGraph gem that provides APIs to robustly index data into a datastore."
13
+
14
+ spec.add_dependency "elasticgraph-datastore_core", eg_version
15
+ spec.add_dependency "elasticgraph-json_schema", eg_version
16
+ spec.add_dependency "elasticgraph-schema_artifacts", eg_version
17
+ spec.add_dependency "elasticgraph-support", eg_version
18
+ spec.add_dependency "hashdiff", "~> 1.1"
19
+
20
+ spec.add_development_dependency "elasticgraph-admin", eg_version
21
+ spec.add_development_dependency "elasticgraph-elasticsearch", eg_version
22
+ spec.add_development_dependency "elasticgraph-opensearch", eg_version
23
+ spec.add_development_dependency "elasticgraph-schema_definition", eg_version
24
+ end
@@ -0,0 +1,48 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/error"
10
+ require "elastic_graph/indexer/event_id"
11
+
12
+ module ElasticGraph
13
+ class Indexer
14
+ class Config < ::Data.define(
15
+ # Map of indexing latency thresholds (in milliseconds), keyed by the name of
16
+ # the indexing latency metric. When an event is indexed with an indexing latency
17
+ # exceeding the threshold, a warning with the event type, id, and version will
18
+ # be logged, so the issue can be investigated.
19
+ :latency_slo_thresholds_by_timestamp_in_ms,
20
+ # Setting that can be used to specify some derived indexing type updates that should be skipped. This
21
+ # setting should be a map keyed by the name of the derived indexing type, and the values should be sets
22
+ # of ids. This can be useful when you have a "hot spot" of a single derived document that is
23
+ # receiving a ton of updates. During a backfill (or whatever) you may want to skip the derived
24
+ # type updates.
25
+ :skip_derived_indexing_type_updates
26
+ )
27
+ def self.from_parsed_yaml(hash)
28
+ hash = hash.fetch("indexer")
29
+ extra_keys = hash.keys - EXPECTED_KEYS
30
+
31
+ unless extra_keys.empty?
32
+ raise ConfigError, "Unknown `indexer` config settings: #{extra_keys.join(", ")}"
33
+ end
34
+
35
+ new(
36
+ latency_slo_thresholds_by_timestamp_in_ms: hash.fetch("latency_slo_thresholds_by_timestamp_in_ms"),
37
+ skip_derived_indexing_type_updates: (hash["skip_derived_indexing_type_updates"] || {}).transform_values(&:to_set)
38
+ )
39
+ end
40
+
41
+ EXPECTED_KEYS = members.map(&:to_s)
42
+ end
43
+
44
+ # Steep weirdly expects them here...
45
+ # @dynamic initialize, config, datastore_core, schema_artifacts, datastore_router
46
+ # @dynamic record_preparer, processor, operation_factory
47
+ end
48
+ end
@@ -0,0 +1,408 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/constants"
10
+ require "elastic_graph/error"
11
+ require "elastic_graph/datastore_core/index_config_normalizer"
12
+ require "elastic_graph/indexer/event_id"
13
+ require "elastic_graph/indexer/hash_differ"
14
+ require "elastic_graph/indexer/indexing_failures_error"
15
+ require "elastic_graph/support/threading"
16
+
17
+ module ElasticGraph
18
+ class Indexer
19
+ # Responsible for routing datastore indexing requests to the appropriate cluster and index.
20
+ class DatastoreIndexingRouter
21
+ # In this class, we internally cache the datastore mapping for an index definition, so that we don't have to
22
+ # fetch the mapping from the datastore on each call to `bulk`. It rarely changes and ElasticGraph is designed so that
23
+ # mapping updates are applied before deploying the indexer with a new mapping.
24
+ #
25
+ # However, if an engineer forgets to apply a mapping update before deploying, they'll run into "mappings are incomplete"
26
+ errors. They can update the mapping to fix it, but the use of caching in this class could mean that the fix doesn't
27
+ # necessarily work right away. The app would have to be deployed or restarted so that the caches are cleared. That could
28
+ # be annoying.
29
+ #
30
+ # To address this issue, we're adding an expiration on the caching of the index mappings. Re-fetching the index
31
+ # mapping once every few minutes is no big deal and will allow the indexer to recover on its own after a mapping
32
+ # update has been applied without requiring a deploy or a restart.
33
+ #
34
+ # The expiration is a range so that, when we have many processes running, and they all started around the same time,
35
+ # (say, after a deploy!), they don't all expire their caches in sync, leading to spiky load on the datastore. Instead,
36
+ # the random distribution of expiration times will spread out the load.
37
+ MAPPING_CACHE_MAX_AGE_IN_MS_RANGE = (5 * 60 * 1000)..(10 * 60 * 1000)
38
+
39
+ def initialize(
40
+ datastore_clients_by_name:,
41
+ mappings_by_index_def_name:,
42
+ monotonic_clock:,
43
+ logger:
44
+ )
45
+ @datastore_clients_by_name = datastore_clients_by_name
46
+ @logger = logger
47
+ @monotonic_clock = monotonic_clock
48
+ @cached_mappings = {}
49
+
50
+ @mappings_by_index_def_name = mappings_by_index_def_name.transform_values do |mappings|
51
+ DatastoreCore::IndexConfigNormalizer.normalize_mappings(mappings)
52
+ end
53
+ end
54
+
55
+ # Proxies `client#bulk` by converting `operations` to their bulk
56
+ # form. Returns a hash between a cluster and a list of successfully applied operations on that cluster.
57
+ #
58
+ # For each operation, 1 of 4 things will happen, each of which will be treated differently:
59
+ #
60
+ # 1. The operation was successfully applied to the datastore and updated its state.
61
+ The operation will be included in the successful operations of the returned result.
62
+ # 2. The operation could not even be attempted. For example, an `Update` operation
63
+ # cannot be attempted when the source event has `nil` for the field used as the source of
64
+ # the destination type's id. The returned result will not include this operation.
65
+ # 3. The operation was a no-op due to the external version not increasing. This happens when we
66
+ # process a duplicate or out-of-order event. The operation will be included in the returned
67
+ # result's list of noop results.
68
+ # 4. The operation failed outright for some other reason. The operation will be included in the
69
+ # returned result's failure results.
70
+ #
71
+ # It is the caller's responsibility to deal with any returned failures as this method does not
72
+ # raise an exception in that case.
73
+ #
74
+ # Note: before any operations are attempted, the datastore indices are validated for consistency
75
+ # with the mappings we expect, meaning that no bulk operations will be attempted if that is not up-to-date.
76
+ def bulk(operations, refresh: false)
77
+ # Before writing these operations, verify their destination index mapping are consistent.
78
+ validate_mapping_completeness_of!(:accessible_cluster_names_to_index_into, *operations.map(&:destination_index_def).uniq)
79
+
80
+ # @type var ops_by_client: ::Hash[DatastoreCore::_Client, ::Array[_Operation]]
81
+ ops_by_client = ::Hash.new { |h, k| h[k] = [] }
82
+ # @type var unsupported_ops: ::Set[_Operation]
83
+ unsupported_ops = ::Set.new
84
+
85
+ operations.reject { |op| op.to_datastore_bulk.empty? }.each do |op|
86
+ # Note: this intentionally does not use `accessible_cluster_names_to_index_into`.
87
+ # We want to fail with clear error if any clusters are inaccessible instead of silently ignoring
88
+ # the named cluster. The `IndexingFailuresError` provides a clear error.
89
+ cluster_names = op.destination_index_def.clusters_to_index_into
90
+
91
+ cluster_names.each do |cluster_name|
92
+ if (client = @datastore_clients_by_name[cluster_name])
93
+ ops_by_client[client] << op
94
+ else
95
+ unsupported_ops << op
96
+ end
97
+ end
98
+
99
+ unsupported_ops << op if cluster_names.empty?
100
+ end
101
+
102
+ unless unsupported_ops.empty?
103
+ raise IndexingFailuresError,
104
+ "The index definitions for #{unsupported_ops.size} operations " \
105
+ "(#{unsupported_ops.map { |o| Indexer::EventID.from_event(o.event) }.join(", ")}) " \
106
+ "were configured to be inaccessible. Check the configuration, or avoid sending " \
107
+ "events of this type to this ElasticGraph indexer."
108
+ end
109
+
110
+ ops_and_results_by_cluster = Support::Threading.parallel_map(ops_by_client) do |(client, ops)|
111
+ responses = client.bulk(body: ops.flat_map(&:to_datastore_bulk), refresh: refresh).fetch("items")
112
+
113
+ # As per https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html#bulk-api-response-body,
114
+ # > `items` contains the result of each operation in the bulk request, in the order they were submitted.
115
+ # Thus, we can trust it has the same cardinality as `ops` and they can be zipped together.
116
+ ops_and_results = ops.zip(responses).map { |(op, response)| [op, op.categorize(response)] }
117
+ [client.cluster_name, ops_and_results]
118
+ end.to_h
119
+
120
+ BulkResult.new(ops_and_results_by_cluster)
121
+ end
122
+
123
+ # Return type encapsulating all of the results of the bulk call.
124
+ class BulkResult < ::Data.define(:ops_and_results_by_cluster, :noop_results, :failure_results)
125
+ def initialize(ops_and_results_by_cluster:)
126
+ results_by_category = ops_and_results_by_cluster.values
127
+ .flat_map { |ops_and_results| ops_and_results.map(&:last) }
128
+ .group_by(&:category)
129
+
130
+ super(
131
+ ops_and_results_by_cluster: ops_and_results_by_cluster,
132
+ noop_results: results_by_category[:noop] || [],
133
+ failure_results: results_by_category[:failure] || []
134
+ )
135
+ end
136
+
137
+ # Returns successful operations grouped by the cluster they were applied to. If there are any
138
+ # failures, raises an exception to alert the caller to them unless `check_failures: false` is passed.
139
+ #
140
+ # This is designed to prevent failures from silently being ignored. For example, in tests
141
+ # we often call `successful_operations` or `successful_operations_by_cluster_name` and don't
142
+ # bother checking `failure_results` (because we don't expect a failure). If there was a failure
143
+ # we want to be notified about it.
144
+ def successful_operations_by_cluster_name(check_failures: true)
145
+ if check_failures && failure_results.any?
146
+ raise IndexingFailuresError, "Got #{failure_results.size} indexing failure(s):\n\n" \
147
+ "#{failure_results.map.with_index(1) { |result, idx| "#{idx}. #{result.summary}" }.join("\n\n")}"
148
+ end
149
+
150
+ ops_and_results_by_cluster.transform_values do |ops_and_results|
151
+ ops_and_results.filter_map do |(op, result)|
152
+ op if result.category == :success
153
+ end
154
+ end
155
+ end
156
+
157
+ # Returns a flat list of successful operations. If there are any failures, raises an exception
158
+ # to alert the caller to them unless `check_failures: false` is passed.
159
+ #
160
+ # This is designed to prevent failures from silently being ignored. For example, in tests
161
+ # we often call `successful_operations` or `successful_operations_by_cluster_name` and don't
162
+ # bother checking `failure_results` (because we don't expect a failure). If there was a failure
163
+ # we want to be notified about it.
164
+ def successful_operations(check_failures: true)
165
+ successful_operations_by_cluster_name(check_failures: check_failures).values.flatten(1).uniq
166
+ end
167
+ end
168
+
169
+ # Given a list of operations (which can contain different types of operations!), queries the datastore
170
+ # to identify the source event versions stored on the corresponding documents.
171
+ #
172
+ # This was specifically designed to support dealing with malformed events. If an event is malformed we
173
+ # usually want to raise an exception, but if the document targeted by the malformed event is at a newer
174
+ # version in the index than the version number in the event, the malformed state of the event has
175
+ # already been superseded by a corrected event and we can just log a message instead. This method specifically
176
+ # supports that logic.
177
+ #
178
+ # If the datastore returns errors for any of the calls, this method will raise an exception.
179
+ # Otherwise, this method returns a nested hash:
180
+ #
181
+ # - The outer hash maps operations to an inner hash of results for that operation.
182
+ # - The inner hash maps datastore cluster/client names to the version number for that operation from the datastore cluster.
183
+ #
184
+ # Note that the returned `version` for an operation on a cluster can be `nil` (as when the document is not found,
185
+ # or for an operation type that doesn't store source versions).
186
+ #
187
+ # This nested structure is necessary because a single operation can target more than one datastore
188
+ # cluster, and a document may have different source event versions in different datastore clusters.
189
+ def source_event_versions_in_index(operations)
190
+ ops_by_client_name = operations.each_with_object(::Hash.new { |h, k| h[k] = [] }) do |op, ops_hash|
191
+ # Note: this intentionally does not use `accessible_cluster_names_to_index_into`.
192
+ # We want to fail with clear error if any clusters are inaccessible instead of silently ignoring
193
+ # the named cluster. The `IndexingFailuresError` provides a clear error.
194
+ cluster_names = op.destination_index_def.clusters_to_index_into
195
+ cluster_names.each { |cluster_name| ops_hash[cluster_name] << op }
196
+ end
197
+
198
+ client_names_and_results = Support::Threading.parallel_map(ops_by_client_name) do |(client_name, all_ops)|
199
+ ops, unversioned_ops = all_ops.partition(&:versioned?)
200
+
201
+ msearch_response =
202
+ if (client = @datastore_clients_by_name[client_name]) && ops.any?
203
+ body = ops.flat_map do |op|
204
+ # We only care about the source versions, but the way we get it varies.
205
+ include_version =
206
+ if op.destination_index_def.use_updates_for_indexing?
207
+ {_source: {includes: [
208
+ "__versions.#{op.update_target.relationship}",
209
+ # The update_data script before ElasticGraph v0.8 used __sourceVersions[type] instead of __versions[relationship].
210
+ # To be backwards-compatible we need to fetch the data at both paths.
211
+ #
212
+ # TODO: Drop this when we no longer need to maintain backwards-compatibility.
213
+ "__sourceVersions.#{op.event.fetch("type")}"
214
+ ]}}
215
+ else
216
+ {version: true, _source: false}
217
+ end
218
+
219
+ [
220
+ # Note: we intentionally search the entire index expression, not just an individual index based on a rollover timestamp.
221
+ # And we intentionally do NOT provide a routing value--we want to find the version, no matter what shard the document
222
+ # lives on.
223
+ #
224
+ # Since this `source_event_versions_in_index` is for handling malformed events, it's possible that the
225
+ # rollover timestamp or routing value on the operation is wrong and that the correct document lives in
226
+ # a different shard and index than what the operation is targeted at. We want to search across all of them
227
+ # so that we will find it, regardless of where it lives.
228
+ {index: op.destination_index_def.index_expression_for_search},
229
+ # Filter to the documents matching the id.
230
+ {query: {ids: {values: [op.doc_id]}}}.merge(include_version)
231
+ ]
232
+ end
233
+
234
+ client.msearch(body: body)
235
+ else
236
+ # The named client doesn't exist, so we don't have any versions for the docs.
237
+ {"responses" => ops.map { |op| {"hits" => {"hits" => _ = []}} }}
238
+ end
239
+
240
+ errors = msearch_response.fetch("responses").filter_map { |res| res if res["error"] }
241
+
242
+ if errors.empty?
243
+ versions_by_op = ops.zip(msearch_response.fetch("responses")).to_h do |(op, response)|
244
+ hits = response.fetch("hits").fetch("hits")
245
+
246
+ if hits.size > 1
247
+ # Got multiple results. The document is duplicated in multiple shards or indexes. Log a warning about this.
248
+ @logger.warn({
249
+ "message_type" => "IdentifyDocumentVersionsGotMultipleResults",
250
+ "index" => hits.map { |h| h["_index"] },
251
+ "routing" => hits.map { |h| h["_routing"] },
252
+ "id" => hits.map { |h| h["_id"] },
253
+ "version" => hits.map { |h| h["_version"] }
254
+ })
255
+ end
256
+
257
+ if op.destination_index_def.use_updates_for_indexing?
258
+ versions = hits.filter_map do |hit|
259
+ hit.dig("_source", "__versions", op.update_target.relationship, hit.fetch("_id")) ||
260
+ # The update_data script before ElasticGraph v0.8 used __sourceVersions[type] instead of __versions[relationship].
261
+ # To be backwards-compatible we need to fetch the data at both paths.
262
+ #
263
+ # TODO: Drop this when we no longer need to maintain backwards-compatibility.
264
+ hit.dig("_source", "__sourceVersions", op.event.fetch("type"), hit.fetch("_id"))
265
+ end
266
+
267
+ [op, versions.uniq]
268
+ else
269
+ [op, hits.map { |h| h.fetch("_version") }.uniq]
270
+ end
271
+ end
272
+
273
+ unversioned_ops_hash = unversioned_ops.to_h do |op|
274
+ [op, []] # : [_Operation, ::Array[::Integer]]
275
+ end
276
+
277
+ [client_name, :success, versions_by_op.merge(unversioned_ops_hash)]
278
+ else
279
+ [client_name, :failure, errors]
280
+ end
281
+ end
282
+
283
+ failures = client_names_and_results.flat_map do |(client_name, success_or_failure, results)|
284
+ if success_or_failure == :success
285
+ []
286
+ else
287
+ results.map do |result|
288
+ "From cluster #{client_name}: #{::JSON.generate(result, space: " ")}"
289
+ end
290
+ end
291
+ end
292
+
293
+ if failures.empty?
294
+ client_names_and_results.each_with_object(_ = {}) do |(client_name, _success_or_failure, results), accum|
295
+ results.each do |op, version|
296
+ accum[op] ||= _ = {}
297
+ accum[op][client_name] = version
298
+ end
299
+ end
300
+ else
301
+ raise IdentifyDocumentVersionsFailedError, "Got #{failures.size} failure(s) while querying the datastore " \
302
+ "for document versions:\n\n#{failures.join("\n")}"
303
+ end
304
+ end
305
+
306
+ # Queries the datastore mapping(s) for the given index definition(s) to verify that they are up-to-date
307
+ # with our schema artifacts, raising an error if the datastore mappings are missing fields that we
308
+ # expect. (Extra fields are allowed, though--we'll just ignore them).
309
+ #
310
+ # This is intended for use when you want a strong guarantee before proceeding that the indices are current,
311
+ # such as before indexing data, or after applying index updates (to "prove" that everything is how it should
312
+ # be).
313
+ #
314
+ # This correctly queries the datastore clusters specified via `index_into_clusters` in config,
315
+ # but ignores clusters specified via `query_cluster` (since this isn't intended to be used as part
316
+ # of the query flow).
317
+ #
318
+ # For a rollover template, this takes care of verifying the template itself and also any indices that originated
319
+ # from the template.
320
+ #
321
+ # Note also that this caches the datastore mappings, since this is intended to be used to verify an index
322
+ # before we index data into it, and we do not want to impose a huge performance penalty on that process (requiring
323
+ # multiple datastore requests before we index each document...). In general, the index mapping only changes
324
+ # when we make it change, and we deploy and restart ElasticGraph after any index mapping changes, so we do not
325
+ # need to worry about it mutating during the lifetime of a single process (particularly given the expense of doing
326
+ # so).
327
+ def validate_mapping_completeness_of!(index_cluster_name_method, *index_definitions)
328
+ diffs_by_cluster_and_index_name = index_definitions.reduce(_ = {}) do |accum, index_def|
329
+ accum.merge(mapping_diffs_for(index_def, index_cluster_name_method))
330
+ end
331
+
332
+ if diffs_by_cluster_and_index_name.any?
333
+ formatted_diffs = diffs_by_cluster_and_index_name.map do |(cluster_name, index_name), diff|
334
+ <<~EOS
335
+ On cluster `#{cluster_name}` and index/template `#{index_name}`:
336
+ #{diff}
337
+ EOS
338
+ end.join("\n\n")
339
+
340
+ raise ConfigError, "Datastore index mappings are incomplete compared to the current schema. " \
341
+ "The diff below uses the datastore index mapping as the base, and shows the expected mapping as a diff. " \
342
+ "\n\n#{formatted_diffs}"
343
+ end
344
+ end
345
+
346
+ private
347
+
348
+ def mapping_diffs_for(index_definition, index_cluster_name_method)
349
+ expected_mapping = @mappings_by_index_def_name.fetch(index_definition.name)
350
+
351
+ index_definition.public_send(index_cluster_name_method).flat_map do |cluster_name|
352
+ datastore_client = datastore_client_named(cluster_name)
353
+
354
+ cached_mappings_for(index_definition, datastore_client).filter_map do |index, mapping_in_index|
355
+ if (diff = HashDiffer.diff(mapping_in_index, expected_mapping, ignore_ops: [:-]))
356
+ [[cluster_name, index.name], diff]
357
+ end
358
+ end
359
+ end.to_h
360
+ end
361
+
362
+ def cached_mappings_for(index_definition, datastore_client)
363
+ key = [datastore_client, index_definition] # : [DatastoreCore::_Client, DatastoreCore::indexDefinition]
364
+ cached_mapping = @cached_mappings[key] ||= new_cached_mapping(fetch_mappings_from_datastore(index_definition, datastore_client))
365
+
366
+ return cached_mapping.mappings if @monotonic_clock.now_in_ms < cached_mapping.expires_at
367
+
368
+ begin
369
+ fetch_mappings_from_datastore(index_definition, datastore_client).tap do |mappings|
370
+ @logger.info "Mapping cache expired for #{index_definition.name}; cleared it from the cache and re-fetched the mapping."
371
+ @cached_mappings[key] = new_cached_mapping(mappings)
372
+ end
373
+ rescue => e
374
+ @logger.warn <<~EOS
375
+ Mapping cache expired for #{index_definition.name}; attempted to re-fetch it but got an error[1]. Will continue using expired mapping information for now.
376
+
377
+ [1] #{e.class}: #{e.message}
378
+ #{e.backtrace.join("\n")}
379
+ EOS
380
+
381
+ # Update the cached mapping so that the expiration is reset.
382
+ @cached_mappings[key] = new_cached_mapping(cached_mapping.mappings)
383
+
384
+ cached_mapping.mappings
385
+ end
386
+ end
387
+
388
+ def fetch_mappings_from_datastore(index_definition, datastore_client)
389
+ # We need to also check any related indices...
390
+ indices_to_check = [index_definition] + index_definition.related_rollover_indices(datastore_client)
391
+
392
+ indices_to_check.to_h do |index|
393
+ [index, index.mappings_in_datastore(datastore_client)]
394
+ end
395
+ end
396
+
397
+ def new_cached_mapping(mappings)
398
+ CachedMapping.new(mappings, @monotonic_clock.now_in_ms + rand(MAPPING_CACHE_MAX_AGE_IN_MS_RANGE).to_i)
399
+ end
400
+
401
+ def datastore_client_named(name)
402
+ @datastore_clients_by_name.fetch(name)
403
+ end
404
+
405
+ CachedMapping = ::Data.define(:mappings, :expires_at)
406
+ end
407
+ end
408
+ end
@@ -0,0 +1,32 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/error"
10
+
11
+ module ElasticGraph
12
+ class Indexer
13
+ # A unique identifier for an event ingested by the indexer. As a string, takes the form of
14
+ # "[type]:[id]@v[version]", such as "Widget:123abc@v7". This format was designed to make it
15
+ # easy to put these ids in a comma-seperated list.
16
+ EventID = ::Data.define(:type, :id, :version) do
17
+ # @implements EventID
18
+ def self.from_event(event)
19
+ new(type: event["type"], id: event["id"], version: event["version"])
20
+ end
21
+
22
+ def to_s
23
+ "#{type}:#{id}@v#{version}"
24
+ end
25
+ end
26
+
27
+ # Steep weirdly expects them here...
28
+ # @dynamic initialize, config, datastore_core, schema_artifacts, datastore_router, monotonic_clock
29
+ # @dynamic record_preparer_factory, processor, operation_factory, logger
30
+ # @dynamic self.from_parsed_yaml
31
+ end
32
+ end
@@ -0,0 +1,83 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/error"
10
+ require "elastic_graph/indexer/event_id"
11
+
12
+ module ElasticGraph
13
+ class Indexer
14
+ # Indicates an event that we attempted to process which failed for some reason. It may have
15
+ # failed due to a validation issue before we even attempted to write it to the datastore, or it
16
+ # could have failed in the datastore itself.
17
+ class FailedEventError < Error
18
+ # @dynamic main_message, event, operations
19
+
20
+ # The "main" part of the error message (without the `full_id` portion).
21
+ attr_reader :main_message
22
+
23
+ # The invalid event.
24
+ attr_reader :event
25
+
26
+ # The operations that would have been returned by the `OperationFactory` if the event was valid.
27
+ # Note that sometimes an event is so malformed that we can't build any operations for it, but
28
+ # most of the time we can.
29
+ attr_reader :operations
30
+
31
+ def self.from_failed_operation_result(result, all_operations_for_event)
32
+ new(
33
+ event: result.event,
34
+ operations: all_operations_for_event,
35
+ main_message: result.summary
36
+ )
37
+ end
38
+
39
+ def initialize(event:, operations:, main_message:)
40
+ @main_message = main_message
41
+ @event = event
42
+ @operations = operations
43
+
44
+ super("#{full_id}: #{main_message}")
45
+ end
46
+
47
+ # A filtered list of operations that have versions that can be compared against our event
48
+ # version. Not all operation types have a version (e.g. derived indexing `Update` operations don't).
49
+ def versioned_operations
50
+ @versioned_operations ||= operations.select(&:versioned?)
51
+ end
52
+
53
+ def full_id
54
+ event_id = EventID.from_event(event).to_s
55
+ if (message_id = event["message_id"])
56
+ "#{event_id} (message_id: #{message_id})"
57
+ else
58
+ event_id
59
+ end
60
+ end
61
+
62
+ def id
63
+ event["id"]
64
+ end
65
+
66
+ def op
67
+ event["op"]
68
+ end
69
+
70
+ def type
71
+ event["type"]
72
+ end
73
+
74
+ def version
75
+ event["version"]
76
+ end
77
+
78
+ def record
79
+ event["record"]
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,37 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "hashdiff"
10
+
11
+ module ElasticGraph
12
+ class Indexer
13
+ class HashDiffer
14
+ # Generates a string describing how `old` and `new` differ, similar to a git diff.
15
+ # `ignore_ops` can contain any of `:-`, `:+`, and `:~`; when provided those diff operations
16
+ # will be ignored.
17
+ def self.diff(old, new, ignore_ops: [])
18
+ ignore_op_strings = ignore_ops.map(&:to_s).to_set
19
+
20
+ diffs = ::Hashdiff.diff(old, new)
21
+ .reject { |op, path, *vals| ignore_op_strings.include?(_ = op) }
22
+
23
+ return if diffs.empty?
24
+
25
+ diffs.map do |op, path, *vals|
26
+ suffix = if vals.one?
27
+ vals.first
28
+ else
29
+ vals.map { |v| "`#{v.inspect}`" }.join(" => ")
30
+ end
31
+
32
+ "#{op} #{path}: #{suffix}"
33
+ end.join("\n")
34
+ end
35
+ end
36
+ end
37
+ end