elasticgraph-indexer 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/constants"
+ require "elastic_graph/error"
+ require "elastic_graph/indexer/event_id"
+ require "elastic_graph/indexer/operation/count_accumulator"
+ require "elastic_graph/indexer/operation/result"
+ require "elastic_graph/support/hash_util"
+ require "elastic_graph/support/memoizable_data"
+
+ module ElasticGraph
+   class Indexer
+     module Operation
+       class Update < Support::MemoizableData.define(:event, :prepared_record, :destination_index_def, :update_target, :doc_id, :destination_index_mapping)
+         # @dynamic event, destination_index_def, doc_id
+
+         def self.operations_for(
+           event:,
+           destination_index_def:,
+           record_preparer:,
+           update_target:,
+           destination_index_mapping:
+         )
+           return [] if update_target.for_normal_indexing? && !destination_index_def.use_updates_for_indexing?
+
+           prepared_record = record_preparer.prepare_for_index(event["type"], event["record"] || {"id" => event["id"]})
+
+           Support::HashUtil
+             .fetch_leaf_values_at_path(prepared_record, update_target.id_source)
+             .reject { |id| id.to_s.strip.empty? }
+             .uniq
+             .map { |doc_id| new(event, prepared_record, destination_index_def, update_target, doc_id, destination_index_mapping) }
+         end
+
+         def to_datastore_bulk
+           @to_datastore_bulk ||= [{update: metadata}, update_request]
+         end
+
+         def categorize(response)
+           update = response.fetch("update")
+           status = update.fetch("status")
+
+           if noop_result?(response)
+             noop_error_message = message_from_thrown_painless_exception(update)
+               &.delete_prefix(UPDATE_WAS_NOOP_MESSAGE_PREAMBLE)
+
+             Result.noop_of(self, noop_error_message)
+           elsif (200..299).cover?(status)
+             Result.success_of(self)
+           else
+             error = update.fetch("error")
+
+             further_detail =
+               if (more_detail = error["caused_by"])
+                 # Usually the type/reason details are nested an extra level (`caused_by.caused_by`), but sometimes
+                 # it's not. I think it's nested when the script itself throws an exception, whereas it's unnested
+                 # when the datastore is unable to run the script.
+                 more_detail = more_detail["caused_by"] if more_detail.key?("caused_by")
+                 " (#{more_detail["type"]}: #{more_detail["reason"]})"
+               else
+                 "; full response: #{::JSON.pretty_generate(response)}"
+               end
+
+             Result.failure_of(self, "#{update_target.script_id}(applied to `#{doc_id}`): #{error.fetch("reason")}#{further_detail}")
+           end
+         end
+
+         def type
+           :update
+         end
+
+         def description
+           if update_target.type == event.fetch("type")
+             "#{update_target.type} update"
+           else
+             "#{update_target.type} update (from #{event.fetch("type")})"
+           end
+         end
+
+         def inspect
+           "#<#{self.class.name} event=#{EventID.from_event(event)} target=#{update_target.type}>"
+         end
+         alias_method :to_s, :inspect
+
+         def versioned?
+           # We do not track source event versions when applying derived indexing updates, but we do for
+           # normal indexing updates, so if the update target is for normal indexing it's a versioned operation.
+           update_target.for_normal_indexing?
+         end
+
+         private
+
+         # The number of retries of the update script we'll have the datastore attempt on concurrent modification conflicts.
+         CONFLICT_RETRIES = 5
+
+         def metadata
+           {
+             _index: destination_index_def.index_name_for_writes(prepared_record, timestamp_field_path: update_target.rollover_timestamp_value_source),
+             _id: doc_id,
+             routing: destination_index_def.routing_value_for_prepared_record(
+               prepared_record,
+               route_with_path: update_target.routing_value_source,
+               id_path: update_target.id_source
+             ),
+             retry_on_conflict: CONFLICT_RETRIES
+           }.compact
+         end
+
+         def update_request
+           {
+             script: {id: update_target.script_id, params: script_params},
+             # We use a scripted upsert instead of formatting an upsert document because it makes
+             # for simpler code. To create the upsert document, we'd have to convert the param
+             # values to their "upsert form"--for example, for an `append_only_set` field, the param
+             # value is generally a single scalar value while in an upsert document it would need to
+             # be a list. By using `scripted_upsert`, we can always just pass the params in a consistent
+             # way, and rely on the script to handle the case where it is creating a brand new document.
+             scripted_upsert: true,
+             upsert: {}
+           }
+         end
+
+         def noop_result?(response)
+           update = response.fetch("update")
+           error_message = message_from_thrown_painless_exception(update).to_s
+           error_message.start_with?(UPDATE_WAS_NOOP_MESSAGE_PREAMBLE) || update["result"] == "noop"
+         end
+
+         def message_from_thrown_painless_exception(update)
+           update.dig("error", "caused_by", "caused_by", "reason")
+         end
+
+         def script_params
+           initial_params = update_target.params_for(
+             doc_id: doc_id,
+             event: event,
+             prepared_record: prepared_record
+           )
+
+           # The normal indexing script uses `__counts`. Other indexing scripts (e.g. the ones generated
+           # for derived indexing) do not use `__counts` so there's no point in spending effort on computing
+           # it. Plus, the logic below raises an exception in that case, so it's important we avoid it.
+           return initial_params unless update_target.for_normal_indexing?
+
+           CountAccumulator.merge_list_counts_into(
+             initial_params,
+             mapping: destination_index_mapping,
+             list_counts_field_paths_for_source: destination_index_def.list_counts_field_paths_for_source(update_target.relationship.to_s)
+           )
+         end
+       end
+     end
+   end
+ end
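
For context on the shape this operation produces: `to_datastore_bulk` returns the metadata/request pair that the datastore bulk API expects for a scripted update. A minimal sketch of the resulting payload follows; the index name, script id, document id, and params are invented for illustration and do not come from the gem.

# Hypothetical shape of Update#to_datastore_bulk output (all literal values invented).
[
  {update: {_index: "widgets", _id: "widget-1", routing: "widget-1", retry_on_conflict: 5}},
  {
    script: {id: "update_widget_script", params: {"id" => "widget-1"}},
    scripted_upsert: true, # lets the same script path handle brand-new documents too
    upsert: {}
  }
]

Note that `retry_on_conflict: 5` mirrors `CONFLICT_RETRIES` above, and `scripted_upsert: true` is what the comment in `update_request` is describing.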
@@ -0,0 +1,71 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/indexer/operation/result"
+ require "elastic_graph/support/hash_util"
+ require "elastic_graph/support/memoizable_data"
+
+ module ElasticGraph
+   class Indexer
+     module Operation
+       Upsert = Support::MemoizableData.define(:event, :destination_index_def, :record_preparer) do
+         # @implements Upsert
+
+         def to_datastore_bulk
+           @to_datastore_bulk ||= [{index: metadata}, prepared_record]
+         end
+
+         def categorize(response)
+           index = response.fetch("index")
+           status = index.fetch("status")
+
+           case status
+           when 200..299
+             Result.success_of(self)
+           when 409
+             Result.noop_of(self, index.fetch("error").fetch("reason"))
+           else
+             Result.failure_of(self, index.fetch("error").fetch("reason"))
+           end
+         end
+
+         def doc_id
+           @doc_id ||= event.fetch("id")
+         end
+
+         def type
+           :upsert
+         end
+
+         def description
+           "#{event.fetch("type")} upsert"
+         end
+
+         def versioned?
+           true
+         end
+
+         private
+
+         def metadata
+           @metadata ||= {
+             _index: destination_index_def.index_name_for_writes(prepared_record),
+             _id: doc_id,
+             version: event.fetch("version"),
+             version_type: "external",
+             routing: destination_index_def.routing_value_for_prepared_record(prepared_record)
+           }.compact
+         end
+
+         def prepared_record
+           @prepared_record ||= record_preparer.prepare_for_index(event.fetch("type"), event.fetch("record"))
+         end
+       end
+     end
+   end
+ end
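
The `version`/`version_type: "external"` metadata is what gives upserts their last-write-wins semantics: the datastore applies the write only when the supplied version exceeds the stored one, and reports a conflict (HTTP 409, categorized above as a noop) otherwise. A minimal sketch of the bulk pair, with invented values:

# Hypothetical shape of Upsert#to_datastore_bulk output (all literal values invented).
[
  {index: {_index: "widgets", _id: "widget-1", version: 3, version_type: "external", routing: "widget-1"}},
  {"id" => "widget-1", "name" => "Widget"} # the prepared record becomes the document body
]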
@@ -0,0 +1,137 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/error"
+ require "elastic_graph/indexer/event_id"
+ require "elastic_graph/indexer/indexing_failures_error"
+ require "time"
+
+ module ElasticGraph
+   class Indexer
+     class Processor
+       def initialize(
+         datastore_router:,
+         operation_factory:,
+         logger:,
+         indexing_latency_slo_thresholds_by_timestamp_in_ms:,
+         clock: ::Time
+       )
+         @datastore_router = datastore_router
+         @operation_factory = operation_factory
+         @clock = clock
+         @logger = logger
+         @indexing_latency_slo_thresholds_by_timestamp_in_ms = indexing_latency_slo_thresholds_by_timestamp_in_ms
+       end
+
+       # Processes the given events, writing them to the datastore. If any events are invalid, an
+       # exception will be raised indicating why the events were invalid, but the valid events will
+       # still be written to the datastore. No attempt is made to provide atomic "all or nothing"
+       # behavior.
+       def process(events, refresh_indices: false)
+         failures = process_returning_failures(events, refresh_indices: refresh_indices)
+         return if failures.empty?
+         raise IndexingFailuresError.for(failures: failures, events: events)
+       end
+
+       # Like `process`, but returns failures instead of raising an exception.
+       # The caller is responsible for handling the failures.
+       def process_returning_failures(events, refresh_indices: false)
+         factory_results_by_event = events.to_h { |event| [event, @operation_factory.build(event)] }
+
+         factory_results = factory_results_by_event.values
+
+         bulk_result = @datastore_router.bulk(factory_results.flat_map(&:operations), refresh: refresh_indices)
+         successful_operations = bulk_result.successful_operations(check_failures: false)
+
+         calculate_latency_metrics(successful_operations, bulk_result.noop_results)
+
+         all_failures =
+           factory_results.map(&:failed_event_error).compact +
+           bulk_result.failure_results.map do |result|
+             all_operations_for_event = factory_results_by_event.fetch(result.event).operations
+             FailedEventError.from_failed_operation_result(result, all_operations_for_event.to_set)
+           end
+
+         categorize_failures(all_failures, events)
+       end
+
+       private
+
+       def categorize_failures(failures, events)
+         source_event_versions_by_cluster_by_op = @datastore_router.source_event_versions_in_index(
+           failures.flat_map { |f| f.versioned_operations.to_a }
+         )
+
+         superseded_failures, outstanding_failures = failures.partition do |failure|
+           failure.versioned_operations.size > 0 && failure.versioned_operations.all? do |op|
+             # Under normal conditions, we expect to get back only one version per operation per cluster.
+             # However, when a field used for routing or index rollover has mutated, we can wind up with
+             # multiple copies of the document in different indexes or shards. `source_event_versions_in_index`
+             # returns a list of found versions.
+             #
+             # We only need to consider the largest version when deciding if a failure has been superseded or not.
+             # An event with a larger version is considered to be a full replacement for an earlier event for the
+             # same entity, so if we've processed an event for the same entity with a larger version, we can consider
+             # the failure superseded.
+             max_version_per_cluster = source_event_versions_by_cluster_by_op.fetch(op).values.map(&:max)
+
+             # We only consider an event to be superseded if the document version in the datastore
+             # for all its versioned operations is greater than the version of the failing event.
+             max_version_per_cluster.all? { |v| v && v > failure.version }
+           end
+         end
+
+         if superseded_failures.any?
+           superseded_ids = superseded_failures.map { |f| EventID.from_event(f.event).to_s }
+           @logger.warn(
+             "Ignoring #{superseded_ids.size} malformed event(s) because they have been superseded " \
+             "by corrected events targeting the same id: #{superseded_ids.join(", ")}."
+           )
+         end
+
+         outstanding_failures
+       end
+
+       def calculate_latency_metrics(successful_operations, noop_results)
+         current_time = @clock.now
+         successful_events = successful_operations.map(&:event).to_set
+         noop_events = noop_results.map(&:event).to_set
+         all_operations_events = successful_events + noop_events
+
+         all_operations_events.each do |event|
+           latencies_in_ms_from = {} # : Hash[String, Integer]
+           slo_results = {} # : Hash[String, String]
+
+           latency_timestamps = event.fetch("latency_timestamps", _ = {})
+           latency_timestamps.each do |ts_name, ts_value|
+             metric_value = ((current_time - Time.iso8601(ts_value)) * 1000).round
+
+             latencies_in_ms_from[ts_name] = metric_value
+
+             if (threshold = @indexing_latency_slo_thresholds_by_timestamp_in_ms[ts_name])
+               slo_results[ts_name] = (metric_value >= threshold) ? "bad" : "good"
+             end
+           end
+
+           result = successful_events.include?(event) ? "success" : "noop"
+
+           @logger.info({
+             "message_type" => "ElasticGraphIndexingLatencies",
+             "message_id" => event["message_id"],
+             "event_type" => event.fetch("type"),
+             "event_id" => EventID.from_event(event).to_s,
+             JSON_SCHEMA_VERSION_KEY => event.fetch(JSON_SCHEMA_VERSION_KEY),
+             "latencies_in_ms_from" => latencies_in_ms_from,
+             "slo_results" => slo_results,
+             "result" => result
+           })
+         end
+       end
+     end
+   end
+ end
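
To make the latency math concrete, here is a worked example of the per-timestamp calculation in `calculate_latency_metrics`, assuming a single hypothetical `latency_timestamps` entry and a 1000 ms SLO threshold (both inputs invented for illustration):

# Worked example of the latency/SLO calculation (hypothetical inputs).
require "time"

current_time = ::Time.iso8601("2024-01-01T00:00:01.500Z") # what `@clock.now` might return
ts_value = "2024-01-01T00:00:00Z"                         # one value from `latency_timestamps`
threshold = 1000                                          # configured SLO threshold, in ms

metric_value = ((current_time - ::Time.iso8601(ts_value)) * 1000).round # => 1500
slo_result = (metric_value >= threshold) ? "bad" : "good"               # => "bad"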
@@ -0,0 +1,163 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/error"
+
+ module ElasticGraph
+   class Indexer
+     class RecordPreparer
+       # Provides the ability to get a `RecordPreparer` for a specific JSON schema version.
+       class Factory
+         def initialize(schema_artifacts)
+           @schema_artifacts = schema_artifacts
+
+           scalar_types_by_name = schema_artifacts.runtime_metadata.scalar_types_by_name
+           indexing_preparer_by_scalar_type_name = ::Hash.new do |hash, type_name|
+             hash[type_name] = scalar_types_by_name[type_name]&.load_indexing_preparer&.extension_class
+           end
+
+           @preparers_by_json_schema_version = ::Hash.new do |hash, version|
+             hash[version] = RecordPreparer.new(
+               indexing_preparer_by_scalar_type_name,
+               build_type_metas_from(@schema_artifacts.json_schemas_for(version))
+             )
+           end
+         end
+
+         # Gets the `RecordPreparer` for the given JSON schema version.
+         def for_json_schema_version(json_schema_version)
+           @preparers_by_json_schema_version[json_schema_version]
+         end
+
+         # Gets the `RecordPreparer` for the latest JSON schema version. Intended primarily
+         # for use in tests for convenience.
+         def for_latest_json_schema_version
+           for_json_schema_version(@schema_artifacts.latest_json_schema_version)
+         end
+
+         private
+
+         def build_type_metas_from(json_schemas)
+           json_schemas.fetch("$defs").filter_map do |type, type_def|
+             next if type == EVENT_ENVELOPE_JSON_SCHEMA_NAME
+
+             properties = type_def.fetch("properties") do
+               {} # : ::Hash[::String, untyped]
+             end # : ::Hash[::String, untyped]
+
+             required_fields = type_def.fetch("required") do
+               [] # : ::Array[::String]
+             end # : ::Array[::String]
+
+             eg_meta_by_field_name = properties.filter_map do |prop_name, prop|
+               eg_meta = prop["ElasticGraph"]
+               [prop_name, eg_meta] if eg_meta
+             end.to_h
+
+             TypeMetadata.new(
+               name: type,
+               requires_typename: required_fields.include?("__typename"),
+               eg_meta_by_field_name: eg_meta_by_field_name
+             )
+           end
+         end
+       end
+
+       # An alternate `RecordPreparer` implementation that implements the identity function:
+       # it just echoes back the record it is given.
+       #
+       # This is intended only for use where a `RecordPreparer` is required but the data is not
+       # ultimately going to be sent to the datastore. For example, when an event is invalid, we
+       # still build operations for it, and the operations require a `RecordPreparer`, but we do
+       # not send them to the datastore.
+       module Identity
+         def self.prepare_for_index(type_name, record)
+           record
+         end
+       end
+
+       def initialize(indexing_preparer_by_scalar_type_name, type_metas)
+         @indexing_preparer_by_scalar_type_name = indexing_preparer_by_scalar_type_name
+         @eg_meta_by_field_name_by_concrete_type = type_metas.to_h do |meta|
+           [meta.name, meta.eg_meta_by_field_name]
+         end
+
+         @types_requiring_typename = type_metas.filter_map do |meta|
+           meta.name if meta.requires_typename
+         end.to_set
+       end
+
+       # Prepares the given payload for being indexed into the named index.
+       # This allows any value or field name conversion to happen before we index
+       # the data, to support the few cases where we expect differences between
+       # the payload received by the ElasticGraph indexer, and the payload we
+       # send to the datastore.
+       #
+       # As part of preparing the data, we also drop any `record` fields that
+       # are not defined in our schema. This allows us to handle events that target
+       # multiple indices (e.g. v1 and v2) for the same type. The event can contain
+       # the set union of fields and this will take care of dropping any unsupported
+       # fields before we attempt to index the record.
+       #
+       # Note: this method does not mutate the given `record`. Instead it returns a
+       # copy with any updates applied to it.
+       def prepare_for_index(type_name, record)
+         prepare_value_for_indexing(record, type_name)
+       end
+
+       private
+
+       def prepare_value_for_indexing(value, type_name)
+         type_name = type_name.delete_suffix("!")
+
+         return nil if value.nil?
+
+         if (preparer = @indexing_preparer_by_scalar_type_name[type_name])
+           return (_ = preparer).prepare_for_indexing(value)
+         end
+
+         case value
+         when ::Array
+           element_type_name = type_name.delete_prefix("[").delete_suffix("]")
+           value.map { |v| prepare_value_for_indexing(v, element_type_name) }
+         when ::Hash
+           # `@eg_meta_by_field_name_by_concrete_type` does not have abstract types in it (e.g. type unions).
+           # Instead, it'll have each concrete subtype in it.
+           #
+           # If `type_name` is an abstract type, we need to look at the `__typename` field to see
+           # what the concrete subtype is. `__typename` is required on abstract types and indicates that.
+           eg_meta_by_field_name = @eg_meta_by_field_name_by_concrete_type.fetch(value["__typename"] || type_name)
+
+           value.filter_map do |field_name, field_value|
+             if field_name == "__typename"
+               # We only want to include __typename if we're dealing with a type that requires it.
+               # (This is the case for an abstract type, so it can differentiate between which subtype we have.)
+               [field_name, field_value] if @types_requiring_typename.include?(type_name)
+             elsif (eg_meta = eg_meta_by_field_name[field_name])
+               [eg_meta.fetch("nameInIndex"), prepare_value_for_indexing(field_value, eg_meta.fetch("type"))]
+             end
+           end.to_h
+         else
+           # We won't have a registered preparer for enum types, since those aren't dumped in
+           # runtime metadata `scalar_types_by_name`, and we can just return the value as-is in
+           # this case.
+           value
+         end
+       end
+
+       TypeMetadata = ::Data.define(
+         # The name of the type this metadata object is for.
+         :name,
+         # Indicates if this type requires a `__typename` field.
+         :requires_typename,
+         # The per-field ElasticGraph metadata, keyed by field name.
+         :eg_meta_by_field_name
+       )
+     end
+   end
+ end
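
A small sketch of what `prepare_for_index` does end to end, using hand-built metadata rather than real schema artifacts (the "Widget" type and its fields are invented; normally the metadata comes from the `Factory` above): unknown fields are dropped and field names are translated via `nameInIndex`.

# Hypothetical usage of RecordPreparer with made-up type metadata.
require "set"
require "elastic_graph/indexer/record_preparer"

meta = ElasticGraph::Indexer::RecordPreparer::TypeMetadata.new(
  name: "Widget", # invented type
  requires_typename: false,
  eg_meta_by_field_name: {
    "id" => {"type" => "ID!", "nameInIndex" => "id"},
    "cost" => {"type" => "Int", "nameInIndex" => "cost_amount"}
  }
)

# An empty preparer lookup stands in for the runtime-metadata-driven hash the Factory builds.
preparer = ElasticGraph::Indexer::RecordPreparer.new(::Hash.new(nil), [meta])
preparer.prepare_for_index("Widget", {"id" => "w1", "cost" => 5, "not_in_schema" => true})
# => {"id" => "w1", "cost_amount" => 5} (unknown field dropped, "cost" renamed to "cost_amount")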
@@ -0,0 +1,44 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "json"
+
+ # Defines an RSpec matcher that can be used to validate ElasticGraph events.
+ ::RSpec::Matchers.define :be_a_valid_elastic_graph_event do |for_indexer:|
+   match do |event|
+     result = for_indexer
+       .operation_factory
+       .with(configure_record_validator: block_arg)
+       .build(event)
+
+     @validation_failure = result.failed_event_error
+     !@validation_failure
+   end
+
+   description do
+     "be a valid ElasticGraph event"
+   end
+
+   failure_message do |event|
+     <<~EOS
+       expected the event[1] to #{description}, but it was invalid[2].
+
+       [1] #{::JSON.pretty_generate(event)}
+
+       [2] #{@validation_failure.message}
+     EOS
+   end
+
+   failure_message_when_negated do |event|
+     <<~EOS
+       expected the event[1] not to #{description}, but it was valid.
+
+       [1] #{::JSON.pretty_generate(event)}
+     EOS
+   end
+ end
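
Hypothetical usage from an RSpec suite, assuming an `indexer` helper returning an `ElasticGraph::Indexer` instance is in scope (the helper name is an assumption, not from the gem):

# The matcher builds operations for the event and passes when no validation failure results.
expect(event).to be_a_valid_elastic_graph_event(for_indexer: indexer)
expect(malformed_event).not_to be_a_valid_elastic_graph_event(for_indexer: indexer)

A block given to the matcher is forwarded as `configure_record_validator:` (via RSpec's `block_arg`), allowing a spec to customize record validation.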
@@ -0,0 +1,36 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/constants"
+ require "elastic_graph/support/hash_util"
+ require "json"
+
+ module ElasticGraph
+   class Indexer
+     module TestSupport
+       module Converters
+         # Helper method for testing and generating fake data to convert a factory record into an event
+         def self.upsert_event_for(record)
+           {
+             "op" => "upsert",
+             "id" => record.fetch("id"),
+             "type" => record.fetch("__typename"),
+             "version" => record.fetch("__version"),
+             "record" => record.except("__typename", "__version", "__json_schema_version"),
+             JSON_SCHEMA_VERSION_KEY => record.fetch("__json_schema_version")
+           }
+         end
+
+         # Helper method to create an array of events given an array of records
+         def self.upsert_events_for_records(records)
+           records.map { |record| upsert_event_for(Support::HashUtil.stringify_keys(record)) }
+         end
+       end
+     end
+   end
+ end
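
A quick illustration of the conversion, with an invented factory record (`JSON_SCHEMA_VERSION_KEY` comes from `elastic_graph/constants`; its exact string value is not shown here):

# Hypothetical input/output for Converters.upsert_event_for.
record = {
  "id" => "widget-1",
  "__typename" => "Widget",
  "__version" => 1,
  "__json_schema_version" => 1,
  "name" => "Thingamajig"
}

ElasticGraph::Indexer::TestSupport::Converters.upsert_event_for(record)
# => {"op" => "upsert", "id" => "widget-1", "type" => "Widget", "version" => 1,
#     "record" => {"id" => "widget-1", "name" => "Thingamajig"},
#     JSON_SCHEMA_VERSION_KEY => 1}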