elasticgraph-indexer 0.18.0.0

@@ -0,0 +1,160 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/constants"
+ require "elastic_graph/error"
+ require "elastic_graph/indexer/event_id"
+ require "elastic_graph/indexer/operation/count_accumulator"
+ require "elastic_graph/indexer/operation/result"
+ require "elastic_graph/support/hash_util"
+ require "elastic_graph/support/memoizable_data"
+
+ module ElasticGraph
+   class Indexer
+     module Operation
+       class Update < Support::MemoizableData.define(:event, :prepared_record, :destination_index_def, :update_target, :doc_id, :destination_index_mapping)
+         # @dynamic event, destination_index_def, doc_id
+
+         def self.operations_for(
+           event:,
+           destination_index_def:,
+           record_preparer:,
+           update_target:,
+           destination_index_mapping:
+         )
+           return [] if update_target.for_normal_indexing? && !destination_index_def.use_updates_for_indexing?
+
+           prepared_record = record_preparer.prepare_for_index(event["type"], event["record"] || {"id" => event["id"]})
+
+           Support::HashUtil
+             .fetch_leaf_values_at_path(prepared_record, update_target.id_source)
+             .reject { |id| id.to_s.strip.empty? }
+             .uniq
+             .map { |doc_id| new(event, prepared_record, destination_index_def, update_target, doc_id, destination_index_mapping) }
+         end
+
+         def to_datastore_bulk
+           @to_datastore_bulk ||= [{update: metadata}, update_request]
+         end
+
+         def categorize(response)
+           update = response.fetch("update")
+           status = update.fetch("status")
+
+           if noop_result?(response)
+             noop_error_message = message_from_thrown_painless_exception(update)
+               &.delete_prefix(UPDATE_WAS_NOOP_MESSAGE_PREAMBLE)
+
+             Result.noop_of(self, noop_error_message)
+           elsif (200..299).cover?(status)
+             Result.success_of(self)
+           else
+             error = update.fetch("error")
+
+             further_detail =
+               if (more_detail = error["caused_by"])
+                 # Usually the type/reason details are nested an extra level (`caused_by.caused_by`), but sometimes
+                 # they're not. I think they're nested when the script itself throws an exception, whereas they're
+                 # unnested when the datastore is unable to run the script.
+                 more_detail = more_detail["caused_by"] if more_detail.key?("caused_by")
+                 " (#{more_detail["type"]}: #{more_detail["reason"]})"
+               else
+                 "; full response: #{::JSON.pretty_generate(response)}"
+               end
+
+             Result.failure_of(self, "#{update_target.script_id}(applied to `#{doc_id}`): #{error.fetch("reason")}#{further_detail}")
+           end
+         end
+
+         def type
+           :update
+         end
+
+         def description
+           if update_target.type == event.fetch("type")
+             "#{update_target.type} update"
+           else
+             "#{update_target.type} update (from #{event.fetch("type")})"
+           end
+         end
+
+         def inspect
+           "#<#{self.class.name} event=#{EventID.from_event(event)} target=#{update_target.type}>"
+         end
+         alias_method :to_s, :inspect
+
+         def versioned?
+           # We do not track source event versions when applying derived indexing updates, but we do for
+           # normal indexing updates, so if the update target is for normal indexing it's a versioned operation.
+           update_target.for_normal_indexing?
+         end
+
+         private
+
+         # The number of retries of the update script we'll have the datastore attempt on concurrent modification conflicts.
+         CONFLICT_RETRIES = 5
+
+         def metadata
+           {
+             _index: destination_index_def.index_name_for_writes(prepared_record, timestamp_field_path: update_target.rollover_timestamp_value_source),
+             _id: doc_id,
+             routing: destination_index_def.routing_value_for_prepared_record(
+               prepared_record,
+               route_with_path: update_target.routing_value_source,
+               id_path: update_target.id_source
+             ),
+             retry_on_conflict: CONFLICT_RETRIES
+           }.compact
+         end
+
+         def update_request
+           {
+             script: {id: update_target.script_id, params: script_params},
+             # We use a scripted upsert instead of formatting an upsert document because it makes
+             # for simpler code. To create the upsert document, we'd have to convert the param
+             # values to their "upsert form"--for example, for an `append_only_set` field, the param
+             # value is generally a single scalar value while in an upsert document it would need to
+             # be a list. By using `scripted_upsert`, we can always just pass the params in a consistent
+             # way, and rely on the script to handle the case where it is creating a brand new document.
+             scripted_upsert: true,
+             upsert: {}
+           }
+         end
+
+         def noop_result?(response)
+           update = response.fetch("update")
+           error_message = message_from_thrown_painless_exception(update).to_s
+           error_message.start_with?(UPDATE_WAS_NOOP_MESSAGE_PREAMBLE) || update["result"] == "noop"
+         end
+
+         def message_from_thrown_painless_exception(update)
+           update.dig("error", "caused_by", "caused_by", "reason")
+         end
+
+         def script_params
+           initial_params = update_target.params_for(
+             doc_id: doc_id,
+             event: event,
+             prepared_record: prepared_record
+           )
+
+           # The normal indexing script uses `__counts`. Other indexing scripts (e.g. the ones generated
+           # for derived indexing) do not use `__counts`, so there's no point in spending effort on computing
+           # it. Plus, the logic below raises an exception in that case, so it's important we avoid it.
+           return initial_params unless update_target.for_normal_indexing?
+
+           CountAccumulator.merge_list_counts_into(
+             initial_params,
+             mapping: destination_index_mapping,
+             list_counts_field_paths_for_source: destination_index_def.list_counts_field_paths_for_source(update_target.relationship.to_s)
+           )
+         end
+       end
+     end
+   end
+ end
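For readers less familiar with the bulk API, the `[{update: metadata}, update_request]` pair returned by `to_datastore_bulk` is serialized as two newline-delimited JSON entries of a `_bulk` request. A minimal sketch of that shape follows; the index name, document id, routing value, script id, and params are invented for illustration, not values the gem produces:

example_update_bulk_pair = [
  {update: {_index: "widgets", _id: "widget-1", routing: "widget-1", retry_on_conflict: 5}},
  {
    script: {id: "example_update_script_id", params: {"id" => "widget-1"}},
    scripted_upsert: true, # the script builds the document even on first insert...
    upsert: {}             # ...so the upsert body can stay empty
  }
]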
@@ -0,0 +1,71 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/indexer/operation/result"
+ require "elastic_graph/support/hash_util"
+ require "elastic_graph/support/memoizable_data"
+
+ module ElasticGraph
+   class Indexer
+     module Operation
+       Upsert = Support::MemoizableData.define(:event, :destination_index_def, :record_preparer) do
+         # @implements Upsert
+
+         def to_datastore_bulk
+           @to_datastore_bulk ||= [{index: metadata}, prepared_record]
+         end
+
+         def categorize(response)
+           index = response.fetch("index")
+           status = index.fetch("status")
+
+           case status
+           when 200..299
+             Result.success_of(self)
+           when 409
+             Result.noop_of(self, index.fetch("error").fetch("reason"))
+           else
+             Result.failure_of(self, index.fetch("error").fetch("reason"))
+           end
+         end
+
+         def doc_id
+           @doc_id ||= event.fetch("id")
+         end
+
+         def type
+           :upsert
+         end
+
+         def description
+           "#{event.fetch("type")} upsert"
+         end
+
+         def versioned?
+           true
+         end
+
+         private
+
+         def metadata
+           @metadata ||= {
+             _index: destination_index_def.index_name_for_writes(prepared_record),
+             _id: doc_id,
+             version: event.fetch("version"),
+             version_type: "external",
+             routing: destination_index_def.routing_value_for_prepared_record(prepared_record)
+           }.compact
+         end
+
+         def prepared_record
+           @prepared_record ||= record_preparer.prepare_for_index(event.fetch("type"), event.fetch("record"))
+         end
+       end
+     end
+   end
+ end
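By contrast, an `Upsert` operation contributes an `index` action whose metadata carries the event's `version` with `version_type: "external"`, so the datastore only applies the write when the incoming version is greater than the stored one; a stale write comes back as a 409, which `categorize` reports as a noop rather than a failure. A hedged sketch of the bulk pair, with invented values:

example_upsert_bulk_pair = [
  {index: {_index: "widgets", _id: "widget-1", version: 7, version_type: "external", routing: "widget-1"}},
  {"id" => "widget-1", "name" => "Example Widget"} # the prepared record itself
]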
@@ -0,0 +1,137 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/error"
+ require "elastic_graph/indexer/event_id"
+ require "elastic_graph/indexer/indexing_failures_error"
+ require "time"
+
+ module ElasticGraph
+   class Indexer
+     class Processor
+       def initialize(
+         datastore_router:,
+         operation_factory:,
+         logger:,
+         indexing_latency_slo_thresholds_by_timestamp_in_ms:,
+         clock: ::Time
+       )
+         @datastore_router = datastore_router
+         @operation_factory = operation_factory
+         @clock = clock
+         @logger = logger
+         @indexing_latency_slo_thresholds_by_timestamp_in_ms = indexing_latency_slo_thresholds_by_timestamp_in_ms
+       end
+
+       # Processes the given events, writing them to the datastore. If any events are invalid, an
+       # exception will be raised indicating why the events were invalid, but the valid events will
+       # still be written to the datastore. No attempt is made to provide atomic "all or nothing"
+       # behavior.
+       def process(events, refresh_indices: false)
+         failures = process_returning_failures(events, refresh_indices: refresh_indices)
+         return if failures.empty?
+         raise IndexingFailuresError.for(failures: failures, events: events)
+       end
+
+       # Like `process`, but returns failures instead of raising an exception.
+       # The caller is responsible for handling the failures.
+       def process_returning_failures(events, refresh_indices: false)
+         factory_results_by_event = events.to_h { |event| [event, @operation_factory.build(event)] }
+
+         factory_results = factory_results_by_event.values
+
+         bulk_result = @datastore_router.bulk(factory_results.flat_map(&:operations), refresh: refresh_indices)
+         successful_operations = bulk_result.successful_operations(check_failures: false)
+
+         calculate_latency_metrics(successful_operations, bulk_result.noop_results)
+
+         all_failures =
+           factory_results.map(&:failed_event_error).compact +
+           bulk_result.failure_results.map do |result|
+             all_operations_for_event = factory_results_by_event.fetch(result.event).operations
+             FailedEventError.from_failed_operation_result(result, all_operations_for_event.to_set)
+           end
+
+         categorize_failures(all_failures, events)
+       end
+
+       private
+
+       def categorize_failures(failures, events)
+         source_event_versions_by_cluster_by_op = @datastore_router.source_event_versions_in_index(
+           failures.flat_map { |f| f.versioned_operations.to_a }
+         )
+
+         superseded_failures, outstanding_failures = failures.partition do |failure|
+           failure.versioned_operations.size > 0 && failure.versioned_operations.all? do |op|
+             # Under normal conditions, we expect to get back only one version per operation per cluster.
+             # However, when a field used for routing or index rollover has mutated, we can wind up with
+             # multiple copies of the document in different indexes or shards. `source_event_versions_in_index`
+             # returns a list of found versions.
+             #
+             # We only need to consider the largest version when deciding if a failure has been superseded or not.
+             # An event with a larger version is considered to be a full replacement for an earlier event for the
+             # same entity, so if we've processed an event for the same entity with a larger version, we can consider
+             # the failure superseded.
+             max_version_per_cluster = source_event_versions_by_cluster_by_op.fetch(op).values.map(&:max)
+
+             # We only consider an event to be superseded if the document version in the datastore
+             # for all its versioned operations is greater than the version of the failing event.
+             max_version_per_cluster.all? { |v| v && v > failure.version }
+           end
+         end
+
+         if superseded_failures.any?
+           superseded_ids = superseded_failures.map { |f| EventID.from_event(f.event).to_s }
+           @logger.warn(
+             "Ignoring #{superseded_ids.size} malformed event(s) because they have been superseded " \
+             "by corrected events targeting the same id: #{superseded_ids.join(", ")}."
+           )
+         end
+
+         outstanding_failures
+       end
+
+       def calculate_latency_metrics(successful_operations, noop_results)
+         current_time = @clock.now
+         successful_events = successful_operations.map(&:event).to_set
+         noop_events = noop_results.map(&:event).to_set
+         all_operations_events = successful_events + noop_events
+
+         all_operations_events.each do |event|
+           latencies_in_ms_from = {} # : Hash[String, Integer]
+           slo_results = {} # : Hash[String, String]
+
+           latency_timestamps = event.fetch("latency_timestamps", _ = {})
+           latency_timestamps.each do |ts_name, ts_value|
+             metric_value = ((current_time - Time.iso8601(ts_value)) * 1000).round
+
+             latencies_in_ms_from[ts_name] = metric_value
+
+             if (threshold = @indexing_latency_slo_thresholds_by_timestamp_in_ms[ts_name])
+               slo_results[ts_name] = (metric_value >= threshold) ? "bad" : "good"
+             end
+           end
+
+           result = successful_events.include?(event) ? "success" : "noop"
+
+           @logger.info({
+             "message_type" => "ElasticGraphIndexingLatencies",
+             "message_id" => event["message_id"],
+             "event_type" => event.fetch("type"),
+             "event_id" => EventID.from_event(event).to_s,
+             JSON_SCHEMA_VERSION_KEY => event.fetch(JSON_SCHEMA_VERSION_KEY),
+             "latencies_in_ms_from" => latencies_in_ms_from,
+             "slo_results" => slo_results,
+             "result" => result
+           })
+         end
+       end
+     end
+   end
+ end
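To make the `calculate_latency_metrics` output concrete, here is a sketch of the structured log entry it emits for each successful or noop event. All values are invented, the "originated_at" timestamp name is hypothetical, and this assumes `JSON_SCHEMA_VERSION_KEY` resolves to "json_schema_version":

{
  "message_type" => "ElasticGraphIndexingLatencies",
  "message_id" => "msg-123",                            # copied from the event, if present
  "event_type" => "Widget",
  "event_id" => "widget-event-id",                      # placeholder for EventID.from_event(event).to_s
  "json_schema_version" => 1,
  "latencies_in_ms_from" => {"originated_at" => 2340},  # one entry per named timestamp in `latency_timestamps`
  "slo_results" => {"originated_at" => "good"},         # only for timestamps with a configured threshold
  "result" => "success"                                 # or "noop"
}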
@@ -0,0 +1,163 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/error"
+
+ module ElasticGraph
+   class Indexer
+     class RecordPreparer
+       # Provides the ability to get a `RecordPreparer` for a specific JSON schema version.
+       class Factory
+         def initialize(schema_artifacts)
+           @schema_artifacts = schema_artifacts
+
+           scalar_types_by_name = schema_artifacts.runtime_metadata.scalar_types_by_name
+           indexing_preparer_by_scalar_type_name = ::Hash.new do |hash, type_name|
+             hash[type_name] = scalar_types_by_name[type_name]&.load_indexing_preparer&.extension_class
+           end
+
+           @preparers_by_json_schema_version = ::Hash.new do |hash, version|
+             hash[version] = RecordPreparer.new(
+               indexing_preparer_by_scalar_type_name,
+               build_type_metas_from(@schema_artifacts.json_schemas_for(version))
+             )
+           end
+         end
+
+         # Gets the `RecordPreparer` for the given JSON schema version.
+         def for_json_schema_version(json_schema_version)
+           @preparers_by_json_schema_version[json_schema_version]
+         end
+
+         # Gets the `RecordPreparer` for the latest JSON schema version. Intended primarily
+         # for use in tests for convenience.
+         def for_latest_json_schema_version
+           for_json_schema_version(@schema_artifacts.latest_json_schema_version)
+         end
+
+         private
+
+         def build_type_metas_from(json_schemas)
+           json_schemas.fetch("$defs").filter_map do |type, type_def|
+             next if type == EVENT_ENVELOPE_JSON_SCHEMA_NAME
+
+             properties = type_def.fetch("properties") do
+               {} # : ::Hash[::String, untyped]
+             end # : ::Hash[::String, untyped]
+
+             required_fields = type_def.fetch("required") do
+               [] # : ::Array[::String]
+             end # : ::Array[::String]
+
+             eg_meta_by_field_name = properties.filter_map do |prop_name, prop|
+               eg_meta = prop["ElasticGraph"]
+               [prop_name, eg_meta] if eg_meta
+             end.to_h
+
+             TypeMetadata.new(
+               name: type,
+               requires_typename: required_fields.include?("__typename"),
+               eg_meta_by_field_name: eg_meta_by_field_name
+             )
+           end
+         end
+       end
+
+       # An alternate `RecordPreparer` implementation that implements the identity function:
+       # it just echoes back the record it is given.
+       #
+       # This is intended only for use where a `RecordPreparer` is required but the data is not
+       # ultimately going to be sent to the datastore. For example, when an event is invalid, we
+       # still build operations for it, and the operations require a `RecordPreparer`, but we do
+       # not send them to the datastore.
+       module Identity
+         def self.prepare_for_index(type_name, record)
+           record
+         end
+       end
+
+       def initialize(indexing_preparer_by_scalar_type_name, type_metas)
+         @indexing_preparer_by_scalar_type_name = indexing_preparer_by_scalar_type_name
+         @eg_meta_by_field_name_by_concrete_type = type_metas.to_h do |meta|
+           [meta.name, meta.eg_meta_by_field_name]
+         end
+
+         @types_requiring_typename = type_metas.filter_map do |meta|
+           meta.name if meta.requires_typename
+         end.to_set
+       end
+
+       # Prepares the given payload for being indexed into the named index.
+       # This allows any value or field name conversion to happen before we index
+       # the data, to support the few cases where we expect differences between
+       # the payload received by the ElasticGraph indexer, and the payload we
+       # send to the datastore.
+       #
+       # As part of preparing the data, we also drop any `record` fields that
+       # are not defined in our schema. This allows us to handle events that target
+       # multiple indices (e.g. v1 and v2) for the same type. The event can contain
+       # the set union of fields and this will take care of dropping any unsupported
+       # fields before we attempt to index the record.
+       #
+       # Note: this method does not mutate the given `record`. Instead it returns a
+       # copy with any updates applied to it.
+       def prepare_for_index(type_name, record)
+         prepare_value_for_indexing(record, type_name)
+       end
+
+       private
+
+       def prepare_value_for_indexing(value, type_name)
+         type_name = type_name.delete_suffix("!")
+
+         return nil if value.nil?
+
+         if (preparer = @indexing_preparer_by_scalar_type_name[type_name])
+           return (_ = preparer).prepare_for_indexing(value)
+         end
+
+         case value
+         when ::Array
+           element_type_name = type_name.delete_prefix("[").delete_suffix("]")
+           value.map { |v| prepare_value_for_indexing(v, element_type_name) }
+         when ::Hash
+           # `@eg_meta_by_field_name_by_concrete_type` does not have abstract types in it (e.g. type unions).
+           # Instead, it'll have each concrete subtype in it.
+           #
+           # If `type_name` is an abstract type, we need to look at the `__typename` field to see
+           # what the concrete subtype is. `__typename` is required on abstract types and indicates the concrete subtype.
+           eg_meta_by_field_name = @eg_meta_by_field_name_by_concrete_type.fetch(value["__typename"] || type_name)
+
+           value.filter_map do |field_name, field_value|
+             if field_name == "__typename"
+               # We only want to include `__typename` if we're dealing with a type that requires it.
+               # (This is the case for an abstract type, so we can tell which subtype we have.)
+               [field_name, field_value] if @types_requiring_typename.include?(type_name)
+             elsif (eg_meta = eg_meta_by_field_name[field_name])
+               [eg_meta.fetch("nameInIndex"), prepare_value_for_indexing(field_value, eg_meta.fetch("type"))]
+             end
+           end.to_h
+         else
+           # We won't have a registered preparer for enum types, since those aren't dumped in
+           # runtime metadata `scalar_types_by_name`, and we can just return the value as-is in
+           # this case.
+           value
+         end
+       end
+
+       TypeMetadata = ::Data.define(
+         # The name of the type this metadata object is for.
+         :name,
+         # Indicates if this type requires a `__typename` field.
+         :requires_typename,
+         # The per-field ElasticGraph metadata, keyed by field name.
+         :eg_meta_by_field_name
+       )
+     end
+   end
+ end
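As a small illustration of the transformation `prepare_for_index` performs, suppose (hypothetically) a "Widget" type whose JSON schema defines `name` (indexed as `name`) and `cost` (indexed as `costAmount`) but not `color`. Roughly:

input = {"name" => "Foo", "cost" => 100, "color" => "red"}
record_preparer.prepare_for_index("Widget", input)
# => {"name" => "Foo", "costAmount" => 100}
# `cost` was renamed via its `nameInIndex`, and the unknown `color` field was dropped.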
@@ -0,0 +1,44 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "json"
+
+ # Defines an RSpec matcher that can be used to validate ElasticGraph events.
+ ::RSpec::Matchers.define :be_a_valid_elastic_graph_event do |for_indexer:|
+   match do |event|
+     result = for_indexer
+       .operation_factory
+       .with(configure_record_validator: block_arg)
+       .build(event)
+
+     @validation_failure = result.failed_event_error
+     !@validation_failure
+   end
+
+   description do
+     "be a valid ElasticGraph event"
+   end
+
+   failure_message do |event|
+     <<~EOS
+       expected the event[1] to #{description}, but it was invalid[2].
+
+       [1] #{::JSON.pretty_generate(event)}
+
+       [2] #{@validation_failure.message}
+     EOS
+   end
+
+   failure_message_when_negated do |event|
+     <<~EOS
+       expected the event[1] not to #{description}, but it was valid.
+
+       [1] #{::JSON.pretty_generate(event)}
+     EOS
+   end
+ end
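Typical usage of this matcher in a host application's spec suite might look like the following; the `indexer` and `event` objects are assumed to come from that suite's own setup:

RSpec.describe "published events" do
  it "only publishes valid ElasticGraph events" do
    expect(event).to be_a_valid_elastic_graph_event(for_indexer: indexer)
  end
end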
@@ -0,0 +1,36 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/constants"
+ require "elastic_graph/support/hash_util"
+ require "json"
+
+ module ElasticGraph
+   class Indexer
+     module TestSupport
+       module Converters
+         # Helper method, used for testing and generating fake data, that converts a factory record into an event.
+         def self.upsert_event_for(record)
+           {
+             "op" => "upsert",
+             "id" => record.fetch("id"),
+             "type" => record.fetch("__typename"),
+             "version" => record.fetch("__version"),
+             "record" => record.except("__typename", "__version", "__json_schema_version"),
+             JSON_SCHEMA_VERSION_KEY => record.fetch("__json_schema_version")
+           }
+         end
+
+         # Helper method to create an array of events given an array of records.
+         def self.upsert_events_for_records(records)
+           records.map { |record| upsert_event_for(Support::HashUtil.stringify_keys(record)) }
+         end
+       end
+     end
+   end
+ end
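For example, given a factory-style record carrying the envelope fields, `upsert_event_for` splits them out like so (values invented; this assumes `JSON_SCHEMA_VERSION_KEY` resolves to "json_schema_version"):

record = {
  "id" => "widget-1",
  "__typename" => "Widget",
  "__version" => 1,
  "__json_schema_version" => 1,
  "name" => "Example Widget"
}

ElasticGraph::Indexer::TestSupport::Converters.upsert_event_for(record)
# => {
#      "op" => "upsert",
#      "id" => "widget-1",
#      "type" => "Widget",
#      "version" => 1,
#      "record" => {"id" => "widget-1", "name" => "Example Widget"},
#      "json_schema_version" => 1
#    }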