elasticgraph-indexer 0.18.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +1 -0
- data/elasticgraph-indexer.gemspec +24 -0
- data/lib/elastic_graph/indexer/config.rb +48 -0
- data/lib/elastic_graph/indexer/datastore_indexing_router.rb +408 -0
- data/lib/elastic_graph/indexer/event_id.rb +32 -0
- data/lib/elastic_graph/indexer/failed_event_error.rb +83 -0
- data/lib/elastic_graph/indexer/hash_differ.rb +37 -0
- data/lib/elastic_graph/indexer/indexing_failures_error.rb +28 -0
- data/lib/elastic_graph/indexer/indexing_preparers/integer.rb +41 -0
- data/lib/elastic_graph/indexer/indexing_preparers/no_op.rb +19 -0
- data/lib/elastic_graph/indexer/indexing_preparers/untyped.rb +22 -0
- data/lib/elastic_graph/indexer/operation/count_accumulator.rb +166 -0
- data/lib/elastic_graph/indexer/operation/factory.rb +226 -0
- data/lib/elastic_graph/indexer/operation/result.rb +76 -0
- data/lib/elastic_graph/indexer/operation/update.rb +160 -0
- data/lib/elastic_graph/indexer/operation/upsert.rb +71 -0
- data/lib/elastic_graph/indexer/processor.rb +137 -0
- data/lib/elastic_graph/indexer/record_preparer.rb +163 -0
- data/lib/elastic_graph/indexer/spec_support/event_matcher.rb +44 -0
- data/lib/elastic_graph/indexer/test_support/converters.rb +36 -0
- data/lib/elastic_graph/indexer.rb +98 -0
- metadata +454 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1bb7a4a0daded4c04bfc69e3142f75606b56a3044f088d95198c7998a85acc9c
|
4
|
+
data.tar.gz: f2e2e739ad2f785675a3678d2feab5d5f056f5dd61f5886196572f4c10193df6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: be15a227ac25072f1562f8ae850a65b12d46c66fb866b882c9bd2e1ac9581a69a8e08f7f53f3d5e630b9842fbe69b0f7c88208bac915844bb185b2a217d20df0
|
7
|
+
data.tar.gz: a5e808366a2e445257f61cf75dc4d8c4921be0ede5317bd0b620db92e4684c7daef63c0d015500dada497babd61e6f4c278febccfe140ac3b8372069e34eaadf
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2024 Block, Inc.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# ElasticGraph::Indexer
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
#
# Use of this source code is governed by an MIT-style
# license that can be found in the LICENSE file or at
# https://opensource.org/licenses/MIT.
#
# frozen_string_literal: true

require_relative "../gemspec_helper"

# Gem specification for elasticgraph-indexer, defined via the shared project-wide
# helper so that common metadata (authors, license, versioning) stays consistent
# across all ElasticGraph gems.
ElasticGraphGemspecHelper.define_elasticgraph_gem(gemspec_file: __FILE__, category: :core) do |spec, eg_version|
  spec.summary = "ElasticGraph gem that provides APIs to robustly index data into a datastore."

  # Runtime dependencies: sibling ElasticGraph gems are locked to the same version.
  spec.add_dependency "elasticgraph-datastore_core", eg_version
  spec.add_dependency "elasticgraph-json_schema", eg_version
  spec.add_dependency "elasticgraph-schema_artifacts", eg_version
  spec.add_dependency "elasticgraph-support", eg_version
  spec.add_dependency "hashdiff", "~> 1.1"

  # Development-only dependencies (used by the test suite, not shipped).
  spec.add_development_dependency "elasticgraph-admin", eg_version
  spec.add_development_dependency "elasticgraph-elasticsearch", eg_version
  spec.add_development_dependency "elasticgraph-opensearch", eg_version
  spec.add_development_dependency "elasticgraph-schema_definition", eg_version
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/error"
|
10
|
+
require "elastic_graph/indexer/event_id"
|
11
|
+
|
12
|
+
module ElasticGraph
  class Indexer
    # Immutable configuration value object for the indexer, loaded from the
    # `indexer` section of parsed YAML config.
    class Config < ::Data.define(
      # Map of indexing latency thresholds (in milliseconds), keyed by the name of
      # the indexing latency metric. When an event is indexed with an indexing latency
      # exceeding the threshold, a warning with the event type, id, and version will
      # be logged, so the issue can be investigated.
      :latency_slo_thresholds_by_timestamp_in_ms,
      # Setting that can be used to specify some derived indexing type updates that should be skipped. This
      # setting should be a map keyed by the name of the derived indexing type, and the values should be sets
      # of ids. This can be useful when you have a "hot spot" of a single derived document that is
      # receiving a ton of updates. During a backfill (or whatever) you may want to skip the derived
      # type updates.
      :skip_derived_indexing_type_updates
    )
      # Builds a `Config` from a parsed YAML hash. Raises `ConfigError` if the
      # `indexer` section contains any unrecognized settings, so typos are caught
      # at boot rather than silently ignored.
      def self.from_parsed_yaml(hash)
        hash = hash.fetch("indexer")
        extra_keys = hash.keys - EXPECTED_KEYS

        unless extra_keys.empty?
          raise ConfigError, "Unknown `indexer` config settings: #{extra_keys.join(", ")}"
        end

        new(
          latency_slo_thresholds_by_timestamp_in_ms: hash.fetch("latency_slo_thresholds_by_timestamp_in_ms"),
          # This setting is optional; id lists are normalized to `Set`s for fast membership checks.
          skip_derived_indexing_type_updates: (hash["skip_derived_indexing_type_updates"] || {}).transform_values(&:to_set)
        )
      end

      # String forms of the member names, used above to detect unknown settings.
      EXPECTED_KEYS = members.map(&:to_s)
    end

    # Steep weirdly expects them here...
    # @dynamic initialize, config, datastore_core, schema_artifacts, datastore_router
    # @dynamic record_preparer, processor, operation_factory
  end
end
|
@@ -0,0 +1,408 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/constants"
|
10
|
+
require "elastic_graph/error"
|
11
|
+
require "elastic_graph/datastore_core/index_config_normalizer"
|
12
|
+
require "elastic_graph/indexer/event_id"
|
13
|
+
require "elastic_graph/indexer/hash_differ"
|
14
|
+
require "elastic_graph/indexer/indexing_failures_error"
|
15
|
+
require "elastic_graph/support/threading"
|
16
|
+
|
17
|
+
module ElasticGraph
  class Indexer
    # Responsible for routing datastore indexing requests to the appropriate cluster and index.
    class DatastoreIndexingRouter
      # In this class, we internally cache the datastore mapping for an index definition, so that we don't have to
      # fetch the mapping from the datastore on each call to `bulk`. It rarely changes and ElasticGraph is designed so that
      # mapping updates are applied before deploying the indexer with a new mapping.
      #
      # However, if an engineer forgets to apply a mapping update before deploying, they'll run into "mappings are incomplete"
      # errors. They can update the mapping to fix it, but the use of caching in this class could mean that the fix doesn't
      # necessarily work right away. The app would have to be deployed or restarted so that the caches are cleared. That could
      # be annoying.
      #
      # To address this issue, we're adding an expiration on the caching of the index mappings. Re-fetching the index
      # mapping once every few minutes is no big deal and will allow the indexer to recover on its own after a mapping
      # update has been applied without requiring a deploy or a restart.
      #
      # The expiration is a range so that, when we have many processes running, and they all started around the same time,
      # (say, after a deploy!), they don't all expire their caches in sync, leading to spiky load on the datastore. Instead,
      # the random distribution of expiration times will spread out the load.
      MAPPING_CACHE_MAX_AGE_IN_MS_RANGE = (5 * 60 * 1000)..(10 * 60 * 1000)

      def initialize(
        datastore_clients_by_name:,
        mappings_by_index_def_name:,
        monotonic_clock:,
        logger:
      )
        @datastore_clients_by_name = datastore_clients_by_name
        @logger = logger
        @monotonic_clock = monotonic_clock
        @cached_mappings = {}

        @mappings_by_index_def_name = mappings_by_index_def_name.transform_values do |mappings|
          DatastoreCore::IndexConfigNormalizer.normalize_mappings(mappings)
        end
      end

      # Proxies `client#bulk` by converting `operations` to their bulk
      # form. Returns a hash between a cluster and a list of successfully applied operations on that cluster.
      #
      # For each operation, 1 of 4 things will happen, each of which will be treated differently:
      #
      # 1. The operation was successfully applied to the datastore and updated its state.
      #    The operation will be included in the successful operation of the returned result.
      # 2. The operation could not even be attempted. For example, an `Update` operation
      #    cannot be attempted when the source event has `nil` for the field used as the source of
      #    the destination type's id. The returned result will not include this operation.
      # 3. The operation was a no-op due to the external version not increasing. This happens when we
      #    process a duplicate or out-of-order event. The operation will be included in the returned
      #    result's list of noop results.
      # 4. The operation failed outright for some other reason. The operation will be included in the
      #    returned result's failure results.
      #
      # It is the caller's responsibility to deal with any returned failures as this method does not
      # raise an exception in that case.
      #
      # Note: before any operations are attempted, the datastore indices are validated for consistency
      # with the mappings we expect, meaning that no bulk operations will be attempted if that is not up-to-date.
      def bulk(operations, refresh: false)
        # Before writing these operations, verify their destination index mappings are consistent.
        validate_mapping_completeness_of!(:accessible_cluster_names_to_index_into, *operations.map(&:destination_index_def).uniq)

        # @type var ops_by_client: ::Hash[DatastoreCore::_Client, ::Array[_Operation]]
        ops_by_client = ::Hash.new { |h, k| h[k] = [] }
        # @type var unsupported_ops: ::Set[_Operation]
        unsupported_ops = ::Set.new

        operations.reject { |op| op.to_datastore_bulk.empty? }.each do |op|
          # Note: this intentionally does not use `accessible_cluster_names_to_index_into`.
          # We want to fail with clear error if any clusters are inaccessible instead of silently ignoring
          # the named cluster. The `IndexingFailuresError` provides a clear error.
          cluster_names = op.destination_index_def.clusters_to_index_into

          cluster_names.each do |cluster_name|
            if (client = @datastore_clients_by_name[cluster_name])
              ops_by_client[client] << op
            else
              unsupported_ops << op
            end
          end

          unsupported_ops << op if cluster_names.empty?
        end

        unless unsupported_ops.empty?
          raise IndexingFailuresError,
            "The index definitions for #{unsupported_ops.size} operations " \
            "(#{unsupported_ops.map { |o| Indexer::EventID.from_event(o.event) }.join(", ")}) " \
            "were configured to be inaccessible. Check the configuration, or avoid sending " \
            "events of this type to this ElasticGraph indexer."
        end

        ops_and_results_by_cluster = Support::Threading.parallel_map(ops_by_client) do |(client, ops)|
          responses = client.bulk(body: ops.flat_map(&:to_datastore_bulk), refresh: refresh).fetch("items")

          # As per https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html#bulk-api-response-body,
          # > `items` contains the result of each operation in the bulk request, in the order they were submitted.
          # Thus, we can trust it has the same cardinality as `ops` and they can be zipped together.
          ops_and_results = ops.zip(responses).map { |(op, response)| [op, op.categorize(response)] }
          [client.cluster_name, ops_and_results]
        end.to_h

        BulkResult.new(ops_and_results_by_cluster)
      end

      # Return type encapsulating all of the results of the bulk call.
      class BulkResult < ::Data.define(:ops_and_results_by_cluster, :noop_results, :failure_results)
        def initialize(ops_and_results_by_cluster:)
          results_by_category = ops_and_results_by_cluster.values
            .flat_map { |ops_and_results| ops_and_results.map(&:last) }
            .group_by(&:category)

          super(
            ops_and_results_by_cluster: ops_and_results_by_cluster,
            noop_results: results_by_category[:noop] || [],
            failure_results: results_by_category[:failure] || []
          )
        end

        # Returns successful operations grouped by the cluster they were applied to. If there are any
        # failures, raises an exception to alert the caller to them unless `check_failures: false` is passed.
        #
        # This is designed to prevent failures from silently being ignored. For example, in tests
        # we often call `successful_operations` or `successful_operations_by_cluster_name` and don't
        # bother checking `failure_results` (because we don't expect a failure). If there was a failure
        # we want to be notified about it.
        def successful_operations_by_cluster_name(check_failures: true)
          if check_failures && failure_results.any?
            raise IndexingFailuresError, "Got #{failure_results.size} indexing failure(s):\n\n" \
              "#{failure_results.map.with_index(1) { |result, idx| "#{idx}. #{result.summary}" }.join("\n\n")}"
          end

          ops_and_results_by_cluster.transform_values do |ops_and_results|
            ops_and_results.filter_map do |(op, result)|
              op if result.category == :success
            end
          end
        end

        # Returns a flat list of successful operations. If there are any failures, raises an exception
        # to alert the caller to them unless `check_failures: false` is passed.
        #
        # This is designed to prevent failures from silently being ignored. For example, in tests
        # we often call `successful_operations` or `successful_operations_by_cluster_name` and don't
        # bother checking `failure_results` (because we don't expect a failure). If there was a failure
        # we want to be notified about it.
        def successful_operations(check_failures: true)
          successful_operations_by_cluster_name(check_failures: check_failures).values.flatten(1).uniq
        end
      end

      # Given a list of operations (which can contain different types of operations!), queries the datastore
      # to identify the source event versions stored on the corresponding documents.
      #
      # This was specifically designed to support dealing with malformed events. If an event is malformed we
      # usually want to raise an exception, but if the document targeted by the malformed event is at a newer
      # version in the index than the version number in the event, the malformed state of the event has
      # already been superseded by a corrected event and we can just log a message instead. This method specifically
      # supports that logic.
      #
      # If the datastore returns errors for any of the calls, this method will raise an exception.
      # Otherwise, this method returns a nested hash:
      #
      # - The outer hash maps operations to an inner hash of results for that operation.
      # - The inner hash maps datastore cluster/client names to the version number for that operation from the datastore cluster.
      #
      # Note that the returned `version` for an operation on a cluster can be `nil` (as when the document is not found,
      # or for an operation type that doesn't store source versions).
      #
      # This nested structure is necessary because a single operation can target more than one datastore
      # cluster, and a document may have different source event versions in different datastore clusters.
      def source_event_versions_in_index(operations)
        ops_by_client_name = operations.each_with_object(::Hash.new { |h, k| h[k] = [] }) do |op, ops_hash|
          # Note: this intentionally does not use `accessible_cluster_names_to_index_into`.
          # We want to fail with clear error if any clusters are inaccessible instead of silently ignoring
          # the named cluster. The `IndexingFailuresError` provides a clear error.
          cluster_names = op.destination_index_def.clusters_to_index_into
          cluster_names.each { |cluster_name| ops_hash[cluster_name] << op }
        end

        client_names_and_results = Support::Threading.parallel_map(ops_by_client_name) do |(client_name, all_ops)|
          ops, unversioned_ops = all_ops.partition(&:versioned?)

          msearch_response =
            if (client = @datastore_clients_by_name[client_name]) && ops.any?
              body = ops.flat_map do |op|
                # We only care about the source versions, but the way we get it varies.
                include_version =
                  if op.destination_index_def.use_updates_for_indexing?
                    {_source: {includes: [
                      "__versions.#{op.update_target.relationship}",
                      # The update_data script before ElasticGraph v0.8 used __sourceVersions[type] instead of __versions[relationship].
                      # To be backwards-compatible we need to fetch the data at both paths.
                      #
                      # TODO: Drop this when we no longer need to maintain backwards-compatibility.
                      "__sourceVersions.#{op.event.fetch("type")}"
                    ]}}
                  else
                    {version: true, _source: false}
                  end

                [
                  # Note: we intentionally search the entire index expression, not just an individual index based on a rollover timestamp.
                  # And we intentionally do NOT provide a routing value--we want to find the version, no matter what shard the document
                  # lives on.
                  #
                  # Since this `source_event_versions_in_index` is for handling malformed events, it's possible that the
                  # rollover timestamp or routing value on the operation is wrong and that the correct document lives in
                  # a different shard and index than what the operation is targeted at. We want to search across all of them
                  # so that we will find it, regardless of where it lives.
                  {index: op.destination_index_def.index_expression_for_search},
                  # Filter to the documents matching the id.
                  {query: {ids: {values: [op.doc_id]}}}.merge(include_version)
                ]
              end

              client.msearch(body: body)
            else
              # The named client doesn't exist, so we don't have any versions for the docs.
              {"responses" => ops.map { |op| {"hits" => {"hits" => _ = []}} }}
            end

          errors = msearch_response.fetch("responses").filter_map { |res| res if res["error"] }

          if errors.empty?
            versions_by_op = ops.zip(msearch_response.fetch("responses")).to_h do |(op, response)|
              hits = response.fetch("hits").fetch("hits")

              if hits.size > 1
                # Got multiple results. The document is duplicated in multiple shards or indexes. Log a warning about this.
                @logger.warn({
                  "message_type" => "IdentifyDocumentVersionsGotMultipleResults",
                  "index" => hits.map { |h| h["_index"] },
                  "routing" => hits.map { |h| h["_routing"] },
                  "id" => hits.map { |h| h["_id"] },
                  "version" => hits.map { |h| h["_version"] }
                })
              end

              if op.destination_index_def.use_updates_for_indexing?
                versions = hits.filter_map do |hit|
                  hit.dig("_source", "__versions", op.update_target.relationship, hit.fetch("_id")) ||
                    # The update_data script before ElasticGraph v0.8 used __sourceVersions[type] instead of __versions[relationship].
                    # To be backwards-compatible we need to fetch the data at both paths.
                    #
                    # TODO: Drop this when we no longer need to maintain backwards-compatibility.
                    hit.dig("_source", "__sourceVersions", op.event.fetch("type"), hit.fetch("_id"))
                end

                [op, versions.uniq]
              else
                [op, hits.map { |h| h.fetch("_version") }.uniq]
              end
            end

            unversioned_ops_hash = unversioned_ops.to_h do |op|
              [op, []] # : [_Operation, ::Array[::Integer]]
            end

            [client_name, :success, versions_by_op.merge(unversioned_ops_hash)]
          else
            [client_name, :failure, errors]
          end
        end

        failures = client_names_and_results.flat_map do |(client_name, success_or_failure, results)|
          if success_or_failure == :success
            []
          else
            results.map do |result|
              "From cluster #{client_name}: #{::JSON.generate(result, space: " ")}"
            end
          end
        end

        if failures.empty?
          client_names_and_results.each_with_object(_ = {}) do |(client_name, _success_or_failure, results), accum|
            results.each do |op, version|
              accum[op] ||= _ = {}
              accum[op][client_name] = version
            end
          end
        else
          raise IdentifyDocumentVersionsFailedError, "Got #{failures.size} failure(s) while querying the datastore " \
            "for document versions:\n\n#{failures.join("\n")}"
        end
      end

      # Queries the datastore mapping(s) for the given index definition(s) to verify that they are up-to-date
      # with our schema artifacts, raising an error if the datastore mappings are missing fields that we
      # expect. (Extra fields are allowed, though--we'll just ignore them).
      #
      # This is intended for use when you want a strong guarantee before proceeding that the indices are current,
      # such as before indexing data, or after applying index updates (to "prove" that everything is how it should
      # be).
      #
      # This correctly queries the datastore clusters specified via `index_into_clusters` in config,
      # but ignores clusters specified via `query_cluster` (since this isn't intended to be used as part
      # of the query flow).
      #
      # For a rollover template, this takes care of verifying the template itself and also any indices that originated
      # from the template.
      #
      # Note also that this caches the datastore mappings, since this is intended to be used to verify an index
      # before we index data into it, and we do not want to impose a huge performance penalty on that process (requiring
      # multiple datastore requests before we index each document...). In general, the index mapping only changes
      # when we make it change, and we deploy and restart ElasticGraph after any index mapping changes, so we do not
      # need to worry about it mutating during the lifetime of a single process (particularly given the expense of doing
      # so).
      def validate_mapping_completeness_of!(index_cluster_name_method, *index_definitions)
        diffs_by_cluster_and_index_name = index_definitions.reduce(_ = {}) do |accum, index_def|
          accum.merge(mapping_diffs_for(index_def, index_cluster_name_method))
        end

        if diffs_by_cluster_and_index_name.any?
          formatted_diffs = diffs_by_cluster_and_index_name.map do |(cluster_name, index_name), diff|
            <<~EOS
              On cluster `#{cluster_name}` and index/template `#{index_name}`:
              #{diff}
            EOS
          end.join("\n\n")

          raise ConfigError, "Datastore index mappings are incomplete compared to the current schema. " \
            "The diff below uses the datastore index mapping as the base, and shows the expected mapping as a diff. " \
            "\n\n#{formatted_diffs}"
        end
      end

      private

      # Computes mapping diffs (expected vs actual) for one index definition across all of
      # its clusters, returning a hash keyed by `[cluster_name, index_name]`.
      def mapping_diffs_for(index_definition, index_cluster_name_method)
        expected_mapping = @mappings_by_index_def_name.fetch(index_definition.name)

        index_definition.public_send(index_cluster_name_method).flat_map do |cluster_name|
          datastore_client = datastore_client_named(cluster_name)

          cached_mappings_for(index_definition, datastore_client).filter_map do |index, mapping_in_index|
            # `ignore_ops: [:-]` allows extra fields in the datastore mapping; we only care about missing ones.
            if (diff = HashDiffer.diff(mapping_in_index, expected_mapping, ignore_ops: [:-]))
              [[cluster_name, index.name], diff]
            end
          end
        end.to_h
      end

      # Returns the (possibly cached) datastore mappings for an index definition, re-fetching
      # once the randomized expiration has passed. See `MAPPING_CACHE_MAX_AGE_IN_MS_RANGE` above.
      def cached_mappings_for(index_definition, datastore_client)
        key = [datastore_client, index_definition] # : [DatastoreCore::_Client, DatastoreCore::indexDefinition]
        cached_mapping = @cached_mappings[key] ||= new_cached_mapping(fetch_mappings_from_datastore(index_definition, datastore_client))

        return cached_mapping.mappings if @monotonic_clock.now_in_ms < cached_mapping.expires_at

        begin
          fetch_mappings_from_datastore(index_definition, datastore_client).tap do |mappings|
            @logger.info "Mapping cache expired for #{index_definition.name}; cleared it from the cache and re-fetched the mapping."
            @cached_mappings[key] = new_cached_mapping(mappings)
          end
        rescue => e
          @logger.warn <<~EOS
            Mapping cache expired for #{index_definition.name}; attempted to re-fetch it but got an error[1]. Will continue using expired mapping information for now.

            [1] #{e.class}: #{e.message}
            #{e.backtrace.join("\n")}
          EOS

          # Update the cached mapping so that the expiration is reset.
          @cached_mappings[key] = new_cached_mapping(cached_mapping.mappings)

          cached_mapping.mappings
        end
      end

      # Fetches current mappings for the index definition and any related rollover indices.
      def fetch_mappings_from_datastore(index_definition, datastore_client)
        # We need to also check any related indices...
        indices_to_check = [index_definition] + index_definition.related_rollover_indices(datastore_client)

        indices_to_check.to_h do |index|
          [index, index.mappings_in_datastore(datastore_client)]
        end
      end

      # Wraps mappings in a cache entry whose expiration is randomized within the configured range.
      def new_cached_mapping(mappings)
        CachedMapping.new(mappings, @monotonic_clock.now_in_ms + rand(MAPPING_CACHE_MAX_AGE_IN_MS_RANGE).to_i)
      end

      def datastore_client_named(name)
        @datastore_clients_by_name.fetch(name)
      end

      CachedMapping = ::Data.define(:mappings, :expires_at)
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/error"
|
10
|
+
|
11
|
+
module ElasticGraph
  class Indexer
    # A unique identifier for an event ingested by the indexer. As a string, takes the form of
    # "[type]:[id]@v[version]", such as "Widget:123abc@v7". This format was designed to make it
    # easy to put these ids in a comma-separated list.
    EventID = ::Data.define(:type, :id, :version) do
      # @implements EventID

      # Builds an `EventID` from an event payload hash (string keys: "type", "id", "version").
      def self.from_event(event)
        new(type: event["type"], id: event["id"], version: event["version"])
      end

      def to_s
        "#{type}:#{id}@v#{version}"
      end
    end

    # Steep weirdly expects them here...
    # @dynamic initialize, config, datastore_core, schema_artifacts, datastore_router, monotonic_clock
    # @dynamic record_preparer_factory, processor, operation_factory, logger
    # @dynamic self.from_parsed_yaml
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/error"
|
10
|
+
require "elastic_graph/indexer/event_id"
|
11
|
+
|
12
|
+
module ElasticGraph
  class Indexer
    # Indicates an event that we attempted to process which failed for some reason. It may have
    # failed due to a validation issue before we even attempted to write it to the datastore, or it
    # could have failed in the datastore itself.
    class FailedEventError < Error
      # @dynamic main_message, event, operations

      # The "main" part of the error message (without the `full_id` portion).
      attr_reader :main_message

      # The invalid event.
      attr_reader :event

      # The operations that would have been returned by the `OperationFactory` if the event was valid.
      # Note that sometimes an event is so malformed that we can't build any operations for it, but
      # most of the time we can.
      attr_reader :operations

      def self.from_failed_operation_result(result, all_operations_for_event)
        new(
          event: result.event,
          operations: all_operations_for_event,
          main_message: result.summary
        )
      end

      def initialize(event:, operations:, main_message:)
        @main_message = main_message
        @event = event
        @operations = operations

        super("#{full_id}: #{main_message}")
      end

      # A filtered list of operations that have versions that can be compared against our event
      # version. Not all operation types have a version (e.g. derived indexing `Update` operations don't).
      def versioned_operations
        @versioned_operations ||= operations.select(&:versioned?)
      end

      # The event's id string, including the `message_id` (when available) so that the
      # failure can be traced back to a specific message on the queue.
      def full_id
        event_id = EventID.from_event(event).to_s
        if (message_id = event["message_id"])
          "#{event_id} (message_id: #{message_id})"
        else
          event_id
        end
      end

      def id
        event["id"]
      end

      def op
        event["op"]
      end

      def type
        event["type"]
      end

      def version
        event["version"]
      end

      def record
        event["record"]
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "hashdiff"
|
10
|
+
|
11
|
+
module ElasticGraph
  class Indexer
    class HashDiffer
      # Generates a string describing how `old` and `new` differ, similar to a git diff.
      # `ignore_ops` can contain any of `:-`, `:+`, and `:~`; when provided those diff operations
      # will be ignored. Returns `nil` when there are no (non-ignored) differences.
      def self.diff(old, new, ignore_ops: [])
        ignore_op_strings = ignore_ops.map(&:to_s).to_set

        # Hashdiff yields tuples of [op, path, value] (or [op, path, old, new] for `~`).
        diffs = ::Hashdiff.diff(old, new)
          .reject { |op, path, *vals| ignore_op_strings.include?(_ = op) }

        return if diffs.empty?

        diffs.map do |op, path, *vals|
          suffix = if vals.one?
            vals.first
          else
            # A `~` change has both old and new values; render them as `old` => `new`.
            vals.map { |v| "`#{v.inspect}`" }.join(" => ")
          end

          "#{op} #{path}: #{suffix}"
        end.join("\n")
      end
    end
  end
end
|