elasticgraph-indexer 0.18.0.0
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +1 -0
- data/elasticgraph-indexer.gemspec +24 -0
- data/lib/elastic_graph/indexer/config.rb +48 -0
- data/lib/elastic_graph/indexer/datastore_indexing_router.rb +408 -0
- data/lib/elastic_graph/indexer/event_id.rb +32 -0
- data/lib/elastic_graph/indexer/failed_event_error.rb +83 -0
- data/lib/elastic_graph/indexer/hash_differ.rb +37 -0
- data/lib/elastic_graph/indexer/indexing_failures_error.rb +28 -0
- data/lib/elastic_graph/indexer/indexing_preparers/integer.rb +41 -0
- data/lib/elastic_graph/indexer/indexing_preparers/no_op.rb +19 -0
- data/lib/elastic_graph/indexer/indexing_preparers/untyped.rb +22 -0
- data/lib/elastic_graph/indexer/operation/count_accumulator.rb +166 -0
- data/lib/elastic_graph/indexer/operation/factory.rb +226 -0
- data/lib/elastic_graph/indexer/operation/result.rb +76 -0
- data/lib/elastic_graph/indexer/operation/update.rb +160 -0
- data/lib/elastic_graph/indexer/operation/upsert.rb +71 -0
- data/lib/elastic_graph/indexer/processor.rb +137 -0
- data/lib/elastic_graph/indexer/record_preparer.rb +163 -0
- data/lib/elastic_graph/indexer/spec_support/event_matcher.rb +44 -0
- data/lib/elastic_graph/indexer/test_support/converters.rb +36 -0
- data/lib/elastic_graph/indexer.rb +98 -0
- metadata +454 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 1bb7a4a0daded4c04bfc69e3142f75606b56a3044f088d95198c7998a85acc9c
+  data.tar.gz: f2e2e739ad2f785675a3678d2feab5d5f056f5dd61f5886196572f4c10193df6
+SHA512:
+  metadata.gz: be15a227ac25072f1562f8ae850a65b12d46c66fb866b882c9bd2e1ac9581a69a8e08f7f53f3d5e630b9842fbe69b0f7c88208bac915844bb185b2a217d20df0
+  data.tar.gz: a5e808366a2e445257f61cf75dc4d8c4921be0ede5317bd0b620db92e4684c7daef63c0d015500dada497babd61e6f4c278febccfe140ac3b8372069e34eaadf
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2024 Block, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1 @@
+# ElasticGraph::Indexer
data/elasticgraph-indexer.gemspec
ADDED
@@ -0,0 +1,24 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+
+require_relative "../gemspec_helper"
+
+ElasticGraphGemspecHelper.define_elasticgraph_gem(gemspec_file: __FILE__, category: :core) do |spec, eg_version|
+  spec.summary = "ElasticGraph gem that provides APIs to robustly index data into a datastore."
+
+  spec.add_dependency "elasticgraph-datastore_core", eg_version
+  spec.add_dependency "elasticgraph-json_schema", eg_version
+  spec.add_dependency "elasticgraph-schema_artifacts", eg_version
+  spec.add_dependency "elasticgraph-support", eg_version
+  spec.add_dependency "hashdiff", "~> 1.1"
+
+  spec.add_development_dependency "elasticgraph-admin", eg_version
+  spec.add_development_dependency "elasticgraph-elasticsearch", eg_version
+  spec.add_development_dependency "elasticgraph-opensearch", eg_version
+  spec.add_development_dependency "elasticgraph-schema_definition", eg_version
+end
data/lib/elastic_graph/indexer/config.rb
ADDED
@@ -0,0 +1,48 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+
+require "elastic_graph/error"
+require "elastic_graph/indexer/event_id"
+
+module ElasticGraph
+  class Indexer
+    class Config < ::Data.define(
+      # Map of indexing latency thresholds (in milliseconds), keyed by the name of
+      # the indexing latency metric. When an event is indexed with an indexing latency
+      # exceeding the threshold, a warning with the event type, id, and version will
+      # be logged, so the issue can be investigated.
+      :latency_slo_thresholds_by_timestamp_in_ms,
+      # Setting that can be used to specify some derived indexing type updates that should be skipped. This
+      # setting should be a map keyed by the name of the derived indexing type, and the values should be sets
+      # of ids. This can be useful when you have a "hot spot" of a single derived document that is
+      # receiving a ton of updates. During a backfill (or whatever) you may want to skip the derived
+      # type updates.
+      :skip_derived_indexing_type_updates
+    )
+      def self.from_parsed_yaml(hash)
+        hash = hash.fetch("indexer")
+        extra_keys = hash.keys - EXPECTED_KEYS
+
+        unless extra_keys.empty?
+          raise ConfigError, "Unknown `indexer` config settings: #{extra_keys.join(", ")}"
+        end
+
+        new(
+          latency_slo_thresholds_by_timestamp_in_ms: hash.fetch("latency_slo_thresholds_by_timestamp_in_ms"),
+          skip_derived_indexing_type_updates: (hash["skip_derived_indexing_type_updates"] || {}).transform_values(&:to_set)
+        )
+      end
+
+      EXPECTED_KEYS = members.map(&:to_s)
+    end
+
+    # Steep weirdly expects them here...
+    # @dynamic initialize, config, datastore_core, schema_artifacts, datastore_router
+    # @dynamic record_preparer, processor, operation_factory
+  end
+end
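For orientation, here is a rough sketch of how this config class might be loaded. The two top-level keys come from `from_parsed_yaml` above; the specific threshold and type names are made up for illustration:

```ruby
require "set"
require "yaml"
require "elastic_graph/indexer/config"

# Hypothetical settings; only the two keys defined by `Config` are allowed,
# and any extra key raises ConfigError.
parsed = YAML.safe_load(<<~YAML)
  indexer:
    latency_slo_thresholds_by_timestamp_in_ms:
      ingested_at: 30000
    skip_derived_indexing_type_updates:
      WidgetCurrency: ["USD"]
YAML

config = ElasticGraph::Indexer::Config.from_parsed_yaml(parsed)
config.skip_derived_indexing_type_updates # => {"WidgetCurrency" => #<Set: {"USD"}>}
```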
data/lib/elastic_graph/indexer/datastore_indexing_router.rb
ADDED
@@ -0,0 +1,408 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+
+require "elastic_graph/constants"
+require "elastic_graph/error"
+require "elastic_graph/datastore_core/index_config_normalizer"
+require "elastic_graph/indexer/event_id"
+require "elastic_graph/indexer/hash_differ"
+require "elastic_graph/indexer/indexing_failures_error"
+require "elastic_graph/support/threading"
+
+module ElasticGraph
+  class Indexer
+    # Responsible for routing datastore indexing requests to the appropriate cluster and index.
+    class DatastoreIndexingRouter
+      # In this class, we internally cache the datastore mapping for an index definition, so that we don't have to
+      # fetch the mapping from the datastore on each call to `bulk`. It rarely changes and ElasticGraph is designed so that
+      # mapping updates are applied before deploying the indexer with a new mapping.
+      #
+      # However, if an engineer forgets to apply a mapping update before deploying, they'll run into "mappings are incomplete"
+      # errors. They can update the mapping to fix it, but the use of caching in this class could mean that the fix doesn't
+      # necessarily work right away. The app would have to be deployed or restarted so that the caches are cleared. That could
+      # be annoying.
+      #
+      # To address this issue, we're adding an expiration on the caching of the index mappings. Re-fetching the index
+      # mapping once every few minutes is no big deal and will allow the indexer to recover on its own after a mapping
+      # update has been applied without requiring a deploy or a restart.
+      #
+      # The expiration is a range so that, when we have many processes running and they all started around the same time
+      # (say, after a deploy!), they don't all expire their caches in sync, leading to spiky load on the datastore. Instead,
+      # the random distribution of expiration times will spread out the load.
+      MAPPING_CACHE_MAX_AGE_IN_MS_RANGE = (5 * 60 * 1000)..(10 * 60 * 1000)
+
+      def initialize(
+        datastore_clients_by_name:,
+        mappings_by_index_def_name:,
+        monotonic_clock:,
+        logger:
+      )
+        @datastore_clients_by_name = datastore_clients_by_name
+        @logger = logger
+        @monotonic_clock = monotonic_clock
+        @cached_mappings = {}
+
+        @mappings_by_index_def_name = mappings_by_index_def_name.transform_values do |mappings|
+          DatastoreCore::IndexConfigNormalizer.normalize_mappings(mappings)
+        end
+      end
+
+      # Proxies `client#bulk` by converting `operations` to their bulk
+      # form. Returns a hash mapping each cluster to the list of successfully applied operations on that cluster.
+      #
+      # For each operation, 1 of 4 things will happen, each of which will be treated differently:
+      #
+      # 1. The operation was successfully applied to the datastore and updated its state.
+      #    The operation will be included in the successful operations of the returned result.
+      # 2. The operation could not even be attempted. For example, an `Update` operation
+      #    cannot be attempted when the source event has `nil` for the field used as the source of
+      #    the destination type's id. The returned result will not include this operation.
+      # 3. The operation was a no-op due to the external version not increasing. This happens when we
+      #    process a duplicate or out-of-order event. The operation will be included in the returned
+      #    result's list of noop results.
+      # 4. The operation failed outright for some other reason. The operation will be included in the
+      #    returned result's failure results.
+      #
+      # It is the caller's responsibility to deal with any returned failures as this method does not
+      # raise an exception in that case.
+      #
+      # Note: before any operations are attempted, the datastore indices are validated for consistency
+      # with the mappings we expect, meaning that no bulk operations will be attempted if the mappings are not up-to-date.
+      def bulk(operations, refresh: false)
+        # Before writing these operations, verify that their destination index mappings are consistent.
+        validate_mapping_completeness_of!(:accessible_cluster_names_to_index_into, *operations.map(&:destination_index_def).uniq)
+
+        # @type var ops_by_client: ::Hash[DatastoreCore::_Client, ::Array[_Operation]]
+        ops_by_client = ::Hash.new { |h, k| h[k] = [] }
+        # @type var unsupported_ops: ::Set[_Operation]
+        unsupported_ops = ::Set.new
+
+        operations.reject { |op| op.to_datastore_bulk.empty? }.each do |op|
+          # Note: this intentionally does not use `accessible_cluster_names_to_index_into`.
+          # We want to fail with a clear error if any clusters are inaccessible instead of silently ignoring
+          # the named cluster. The `IndexingFailuresError` provides a clear error.
+          cluster_names = op.destination_index_def.clusters_to_index_into
+
+          cluster_names.each do |cluster_name|
+            if (client = @datastore_clients_by_name[cluster_name])
+              ops_by_client[client] << op
+            else
+              unsupported_ops << op
+            end
+          end
+
+          unsupported_ops << op if cluster_names.empty?
+        end
+
+        unless unsupported_ops.empty?
+          raise IndexingFailuresError,
+            "The index definitions for #{unsupported_ops.size} operations " \
+            "(#{unsupported_ops.map { |o| Indexer::EventID.from_event(o.event) }.join(", ")}) " \
+            "were configured to be inaccessible. Check the configuration, or avoid sending " \
+            "events of this type to this ElasticGraph indexer."
+        end

+        ops_and_results_by_cluster = Support::Threading.parallel_map(ops_by_client) do |(client, ops)|
+          responses = client.bulk(body: ops.flat_map(&:to_datastore_bulk), refresh: refresh).fetch("items")
+
+          # As per https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html#bulk-api-response-body,
+          # > `items` contains the result of each operation in the bulk request, in the order they were submitted.
+          # Thus, we can trust it has the same cardinality as `ops` and they can be zipped together.
+          ops_and_results = ops.zip(responses).map { |(op, response)| [op, op.categorize(response)] }
+          [client.cluster_name, ops_and_results]
+        end.to_h
+
+        BulkResult.new(ops_and_results_by_cluster)
+      end
+
+      # Return type encapsulating all of the results of the bulk call.
+      class BulkResult < ::Data.define(:ops_and_results_by_cluster, :noop_results, :failure_results)
+        def initialize(ops_and_results_by_cluster:)
+          results_by_category = ops_and_results_by_cluster.values
+            .flat_map { |ops_and_results| ops_and_results.map(&:last) }
+            .group_by(&:category)
+
+          super(
+            ops_and_results_by_cluster: ops_and_results_by_cluster,
+            noop_results: results_by_category[:noop] || [],
+            failure_results: results_by_category[:failure] || []
+          )
+        end
+
+        # Returns successful operations grouped by the cluster they were applied to. If there are any
+        # failures, raises an exception to alert the caller to them unless `check_failures: false` is passed.
+        #
+        # This is designed to prevent failures from silently being ignored. For example, in tests
+        # we often call `successful_operations` or `successful_operations_by_cluster_name` and don't
+        # bother checking `failure_results` (because we don't expect a failure). If there was a failure
+        # we want to be notified about it.
+        def successful_operations_by_cluster_name(check_failures: true)
+          if check_failures && failure_results.any?
+            raise IndexingFailuresError, "Got #{failure_results.size} indexing failure(s):\n\n" \
+              "#{failure_results.map.with_index(1) { |result, idx| "#{idx}. #{result.summary}" }.join("\n\n")}"
+          end
+
+          ops_and_results_by_cluster.transform_values do |ops_and_results|
+            ops_and_results.filter_map do |(op, result)|
+              op if result.category == :success
+            end
+          end
+        end
+
+        # Returns a flat list of successful operations. If there are any failures, raises an exception
+        # to alert the caller to them unless `check_failures: false` is passed.
+        #
+        # This is designed to prevent failures from silently being ignored. For example, in tests
+        # we often call `successful_operations` or `successful_operations_by_cluster_name` and don't
+        # bother checking `failure_results` (because we don't expect a failure). If there was a failure
+        # we want to be notified about it.
+        def successful_operations(check_failures: true)
+          successful_operations_by_cluster_name(check_failures: check_failures).values.flatten(1).uniq
+        end
+      end
+
+      # Given a list of operations (which can contain different types of operations!), queries the datastore
+      # to identify the source event versions stored on the corresponding documents.
+      #
+      # This was specifically designed to support dealing with malformed events. If an event is malformed we
+      # usually want to raise an exception, but if the document targeted by the malformed event is at a newer
+      # version in the index than the version number in the event, the malformed state of the event has
+      # already been superseded by a corrected event and we can just log a message instead. This method specifically
+      # supports that logic.
+      #
+      # If the datastore returns errors for any of the calls, this method will raise an exception.
+      # Otherwise, this method returns a nested hash:
+      #
+      # - The outer hash maps operations to an inner hash of results for that operation.
+      # - The inner hash maps datastore cluster/client names to the version number for that operation from the datastore cluster.
+      #
+      # Note that the returned `version` for an operation on a cluster can be `nil` (as when the document is not found,
+      # or for an operation type that doesn't store source versions).
+      #
+      # This nested structure is necessary because a single operation can target more than one datastore
+      # cluster, and a document may have different source event versions in different datastore clusters.
+      def source_event_versions_in_index(operations)
+        ops_by_client_name = operations.each_with_object(::Hash.new { |h, k| h[k] = [] }) do |op, ops_hash|
+          # Note: this intentionally does not use `accessible_cluster_names_to_index_into`.
+          # We want to fail with a clear error if any clusters are inaccessible instead of silently ignoring
+          # the named cluster. The `IndexingFailuresError` provides a clear error.
+          cluster_names = op.destination_index_def.clusters_to_index_into
+          cluster_names.each { |cluster_name| ops_hash[cluster_name] << op }
+        end
+
+        client_names_and_results = Support::Threading.parallel_map(ops_by_client_name) do |(client_name, all_ops)|
+          ops, unversioned_ops = all_ops.partition(&:versioned?)
+
+          msearch_response =
+            if (client = @datastore_clients_by_name[client_name]) && ops.any?
+              body = ops.flat_map do |op|
+                # We only care about the source versions, but the way we get it varies.
+                include_version =
+                  if op.destination_index_def.use_updates_for_indexing?
+                    {_source: {includes: [
+                      "__versions.#{op.update_target.relationship}",
+                      # The update_data script before ElasticGraph v0.8 used __sourceVersions[type] instead of __versions[relationship].
+                      # To be backwards-compatible we need to fetch the data at both paths.
+                      #
+                      # TODO: Drop this when we no longer need to maintain backwards-compatibility.
+                      "__sourceVersions.#{op.event.fetch("type")}"
+                    ]}}
+                  else
+                    {version: true, _source: false}
+                  end
+
+                [
+                  # Note: we intentionally search the entire index expression, not just an individual index based on a rollover timestamp.
+                  # And we intentionally do NOT provide a routing value--we want to find the version, no matter what shard the document
+                  # lives on.
+                  #
+                  # Since `source_event_versions_in_index` is for handling malformed events, it's possible that the
+                  # rollover timestamp or routing value on the operation is wrong and that the correct document lives in
+                  # a different shard and index than what the operation is targeted at. We want to search across all of them
+                  # so that we will find it, regardless of where it lives.
+                  {index: op.destination_index_def.index_expression_for_search},
+                  # Filter to the documents matching the id.
+                  {query: {ids: {values: [op.doc_id]}}}.merge(include_version)
+                ]
+              end
+
+              client.msearch(body: body)
+            else
+              # The named client doesn't exist, so we don't have any versions for the docs.
+              {"responses" => ops.map { |op| {"hits" => {"hits" => _ = []}} }}
+            end
+
+          errors = msearch_response.fetch("responses").filter_map { |res| res if res["error"] }
+
+          if errors.empty?
+            versions_by_op = ops.zip(msearch_response.fetch("responses")).to_h do |(op, response)|
+              hits = response.fetch("hits").fetch("hits")
+
+              if hits.size > 1
+                # Got multiple results. The document is duplicated in multiple shards or indexes. Log a warning about this.
+                @logger.warn({
+                  "message_type" => "IdentifyDocumentVersionsGotMultipleResults",
+                  "index" => hits.map { |h| h["_index"] },
+                  "routing" => hits.map { |h| h["_routing"] },
+                  "id" => hits.map { |h| h["_id"] },
+                  "version" => hits.map { |h| h["_version"] }
+                })
+              end
+
+              if op.destination_index_def.use_updates_for_indexing?
+                versions = hits.filter_map do |hit|
+                  hit.dig("_source", "__versions", op.update_target.relationship, hit.fetch("_id")) ||
+                    # The update_data script before ElasticGraph v0.8 used __sourceVersions[type] instead of __versions[relationship].
+                    # To be backwards-compatible we need to fetch the data at both paths.
+                    #
+                    # TODO: Drop this when we no longer need to maintain backwards-compatibility.
+                    hit.dig("_source", "__sourceVersions", op.event.fetch("type"), hit.fetch("_id"))
+                end
+
+                [op, versions.uniq]
+              else
+                [op, hits.map { |h| h.fetch("_version") }.uniq]
+              end
+            end
+
+            unversioned_ops_hash = unversioned_ops.to_h do |op|
+              [op, []] # : [_Operation, ::Array[::Integer]]
+            end
+
+            [client_name, :success, versions_by_op.merge(unversioned_ops_hash)]
+          else
+            [client_name, :failure, errors]
+          end
+        end
+
+        failures = client_names_and_results.flat_map do |(client_name, success_or_failure, results)|
+          if success_or_failure == :success
+            []
+          else
+            results.map do |result|
+              "From cluster #{client_name}: #{::JSON.generate(result, space: " ")}"
+            end
+          end
+        end
+
+        if failures.empty?
+          client_names_and_results.each_with_object(_ = {}) do |(client_name, _success_or_failure, results), accum|
+            results.each do |op, version|
+              accum[op] ||= _ = {}
+              accum[op][client_name] = version
+            end
+          end
+        else
+          raise IdentifyDocumentVersionsFailedError, "Got #{failures.size} failure(s) while querying the datastore " \
+            "for document versions:\n\n#{failures.join("\n")}"
+        end
+      end
+
+      # Queries the datastore mapping(s) for the given index definition(s) to verify that they are up-to-date
+      # with our schema artifacts, raising an error if the datastore mappings are missing fields that we
+      # expect. (Extra fields are allowed, though--we'll just ignore them).
+      #
+      # This is intended for use when you want a strong guarantee before proceeding that the indices are current,
+      # such as before indexing data, or after applying index updates (to "prove" that everything is how it should
+      # be).
+      #
+      # This correctly queries the datastore clusters specified via `index_into_clusters` in config,
+      # but ignores clusters specified via `query_cluster` (since this isn't intended to be used as part
+      # of the query flow).
+      #
+      # For a rollover template, this takes care of verifying the template itself and also any indices that originated
+      # from the template.
+      #
+      # Note also that this caches the datastore mappings, since this is intended to be used to verify an index
+      # before we index data into it, and we do not want to impose a huge performance penalty on that process (requiring
+      # multiple datastore requests before we index each document...). In general, the index mapping only changes
+      # when we make it change, and we deploy and restart ElasticGraph after any index mapping changes, so we do not
+      # need to worry about it mutating during the lifetime of a single process (particularly given the expense of doing
+      # so).
+      def validate_mapping_completeness_of!(index_cluster_name_method, *index_definitions)
+        diffs_by_cluster_and_index_name = index_definitions.reduce(_ = {}) do |accum, index_def|
+          accum.merge(mapping_diffs_for(index_def, index_cluster_name_method))
+        end
+
+        if diffs_by_cluster_and_index_name.any?
+          formatted_diffs = diffs_by_cluster_and_index_name.map do |(cluster_name, index_name), diff|
+            <<~EOS
+              On cluster `#{cluster_name}` and index/template `#{index_name}`:
+              #{diff}
+            EOS
+          end.join("\n\n")
+
+          raise ConfigError, "Datastore index mappings are incomplete compared to the current schema. " \
+            "The diff below uses the datastore index mapping as the base, and shows the expected mapping as a diff. " \
+            "\n\n#{formatted_diffs}"
+        end
+      end
+
+      private
+
+      def mapping_diffs_for(index_definition, index_cluster_name_method)
+        expected_mapping = @mappings_by_index_def_name.fetch(index_definition.name)
+
+        index_definition.public_send(index_cluster_name_method).flat_map do |cluster_name|
+          datastore_client = datastore_client_named(cluster_name)
+
+          cached_mappings_for(index_definition, datastore_client).filter_map do |index, mapping_in_index|
+            if (diff = HashDiffer.diff(mapping_in_index, expected_mapping, ignore_ops: [:-]))
+              [[cluster_name, index.name], diff]
+            end
+          end
+        end.to_h
+      end
+
+      def cached_mappings_for(index_definition, datastore_client)
+        key = [datastore_client, index_definition] # : [DatastoreCore::_Client, DatastoreCore::indexDefinition]
+        cached_mapping = @cached_mappings[key] ||= new_cached_mapping(fetch_mappings_from_datastore(index_definition, datastore_client))
+
+        return cached_mapping.mappings if @monotonic_clock.now_in_ms < cached_mapping.expires_at
+
+        begin
+          fetch_mappings_from_datastore(index_definition, datastore_client).tap do |mappings|
+            @logger.info "Mapping cache expired for #{index_definition.name}; cleared it from the cache and re-fetched the mapping."
+            @cached_mappings[key] = new_cached_mapping(mappings)
+          end
+        rescue => e
+          @logger.warn <<~EOS
+            Mapping cache expired for #{index_definition.name}; attempted to re-fetch it but got an error[1]. Will continue using expired mapping information for now.
+
+            [1] #{e.class}: #{e.message}
+            #{e.backtrace.join("\n")}
+          EOS
+
+          # Update the cached mapping so that the expiration is reset.
+          @cached_mappings[key] = new_cached_mapping(cached_mapping.mappings)
+
+          cached_mapping.mappings
+        end
+      end
+
+      def fetch_mappings_from_datastore(index_definition, datastore_client)
+        # We need to also check any related indices...
+        indices_to_check = [index_definition] + index_definition.related_rollover_indices(datastore_client)
+
+        indices_to_check.to_h do |index|
+          [index, index.mappings_in_datastore(datastore_client)]
+        end
+      end
+
+      def new_cached_mapping(mappings)
+        CachedMapping.new(mappings, @monotonic_clock.now_in_ms + rand(MAPPING_CACHE_MAX_AGE_IN_MS_RANGE).to_i)
+      end
+
+      def datastore_client_named(name)
+        @datastore_clients_by_name.fetch(name)
+      end
+
+      CachedMapping = ::Data.define(:mappings, :expires_at)
+    end
+  end
+end
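To make the four-outcome contract of `bulk` concrete, here is a hypothetical caller-side sketch; the `router` and `operations` objects are assumed to be wired up elsewhere, and only the methods documented above are used:

```ruby
# `router` is a DatastoreIndexingRouter; `operations` were built from events elsewhere.
result = router.bulk(operations, refresh: true)

# Raises IndexingFailuresError if any operation failed outright:
successful_ops = result.successful_operations

# Or inspect each category explicitly, opting out of the failure check:
result.successful_operations(check_failures: false)
result.noop_results    # duplicate / out-of-order events (external version didn't increase)
result.failure_results # the caller must handle these; `bulk` itself doesn't raise for them
```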
data/lib/elastic_graph/indexer/event_id.rb
ADDED
@@ -0,0 +1,32 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+
+require "elastic_graph/error"
+
+module ElasticGraph
+  class Indexer
+    # A unique identifier for an event ingested by the indexer. As a string, takes the form of
+    # "[type]:[id]@v[version]", such as "Widget:123abc@v7". This format was designed to make it
+    # easy to put these ids in a comma-separated list.
+    EventID = ::Data.define(:type, :id, :version) do
+      # @implements EventID
+      def self.from_event(event)
+        new(type: event["type"], id: event["id"], version: event["version"])
+      end
+
+      def to_s
+        "#{type}:#{id}@v#{version}"
+      end
+    end
+
+    # Steep weirdly expects them here...
+    # @dynamic initialize, config, datastore_core, schema_artifacts, datastore_router, monotonic_clock
+    # @dynamic record_preparer_factory, processor, operation_factory, logger
+    # @dynamic self.from_parsed_yaml
+  end
+end
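Usage follows directly from the definition above; given an event payload hash:

```ruby
require "elastic_graph/indexer/event_id"

event = {"type" => "Widget", "id" => "123abc", "version" => 7}

event_id = ElasticGraph::Indexer::EventID.from_event(event)
event_id.to_s # => "Widget:123abc@v7"
```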
data/lib/elastic_graph/indexer/failed_event_error.rb
ADDED
@@ -0,0 +1,83 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+
+require "elastic_graph/error"
+require "elastic_graph/indexer/event_id"
+
+module ElasticGraph
+  class Indexer
+    # Indicates an event that we attempted to process which failed for some reason. It may have
+    # failed due to a validation issue before we even attempted to write it to the datastore, or it
+    # could have failed in the datastore itself.
+    class FailedEventError < Error
+      # @dynamic main_message, event, operations
+
+      # The "main" part of the error message (without the `full_id` portion).
+      attr_reader :main_message
+
+      # The invalid event.
+      attr_reader :event
+
+      # The operations that would have been returned by the `OperationFactory` if the event was valid.
+      # Note that sometimes an event is so malformed that we can't build any operations for it, but
+      # most of the time we can.
+      attr_reader :operations
+
+      def self.from_failed_operation_result(result, all_operations_for_event)
+        new(
+          event: result.event,
+          operations: all_operations_for_event,
+          main_message: result.summary
+        )
+      end
+
+      def initialize(event:, operations:, main_message:)
+        @main_message = main_message
+        @event = event
+        @operations = operations
+
+        super("#{full_id}: #{main_message}")
+      end
+
+      # A filtered list of operations that have versions that can be compared against our event
+      # version. Not all operation types have a version (e.g. derived indexing `Update` operations don't).
+      def versioned_operations
+        @versioned_operations ||= operations.select(&:versioned?)
+      end
+
+      def full_id
+        event_id = EventID.from_event(event).to_s
+        if (message_id = event["message_id"])
+          "#{event_id} (message_id: #{message_id})"
+        else
+          event_id
+        end
+      end
+
+      def id
+        event["id"]
+      end
+
+      def op
+        event["op"]
+      end
+
+      def type
+        event["type"]
+      end
+
+      def version
+        event["version"]
+      end
+
+      def record
+        event["record"]
+      end
+    end
+  end
+end
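As a sketch of how this error might be handled downstream (the `process_event` entry point here is hypothetical; only the accessors come from the class above):

```ruby
begin
  process_event(event) # hypothetical entry point that raises FailedEventError
rescue ElasticGraph::Indexer::FailedEventError => e
  # `full_id` renders as "[type]:[id]@v[version]", plus the message_id when present.
  logger.error("Could not index #{e.full_id}: #{e.main_message}")
  logger.error("Record: #{e.record.inspect}") if e.record
end
```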
data/lib/elastic_graph/indexer/hash_differ.rb
ADDED
@@ -0,0 +1,37 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+
+require "hashdiff"
+
+module ElasticGraph
+  class Indexer
+    class HashDiffer
+      # Generates a string describing how `old` and `new` differ, similar to a git diff.
+      # `ignore_ops` can contain any of `:-`, `:+`, and `:~`; when provided those diff operations
+      # will be ignored.
+      def self.diff(old, new, ignore_ops: [])
+        ignore_op_strings = ignore_ops.map(&:to_s).to_set
+
+        diffs = ::Hashdiff.diff(old, new)
+          .reject { |op, path, *vals| ignore_op_strings.include?(_ = op) }
+
+        return if diffs.empty?
+
+        diffs.map do |op, path, *vals|
+          suffix = if vals.one?
+            vals.first
+          else
+            vals.map { |v| "`#{v.inspect}`" }.join(" => ")
+          end
+
+          "#{op} #{path}: #{suffix}"
+        end.join("\n")
+      end
+    end
+  end
+end
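A quick illustration of the output format (the mappings here are made up; the `ignore_ops: [:-]` usage mirrors how the router tolerates extra fields already present in the datastore):

```ruby
require "elastic_graph/indexer/hash_differ"

old_mapping = {"properties" => {"name" => {"type" => "keyword"}}}
new_mapping = {"properties" => {"name" => {"type" => "keyword"}, "cost" => {"type" => "integer"}}}

# Removals (`-`) are ignored, so only the missing `cost` field is reported:
puts ElasticGraph::Indexer::HashDiffer.diff(old_mapping, new_mapping, ignore_ops: [:-])
# Prints something like: + properties.cost: {"type"=>"integer"}
```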