elasticgraph-indexer 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,160 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/constants"
+ require "elastic_graph/error"
+ require "elastic_graph/indexer/event_id"
+ require "elastic_graph/indexer/operation/count_accumulator"
+ require "elastic_graph/indexer/operation/result"
+ require "elastic_graph/support/hash_util"
+ require "elastic_graph/support/memoizable_data"
+
+ module ElasticGraph
+   class Indexer
+     module Operation
+       class Update < Support::MemoizableData.define(:event, :prepared_record, :destination_index_def, :update_target, :doc_id, :destination_index_mapping)
+         # @dynamic event, destination_index_def, doc_id
+
+         def self.operations_for(
+           event:,
+           destination_index_def:,
+           record_preparer:,
+           update_target:,
+           destination_index_mapping:
+         )
+           return [] if update_target.for_normal_indexing? && !destination_index_def.use_updates_for_indexing?
+
+           prepared_record = record_preparer.prepare_for_index(event["type"], event["record"] || {"id" => event["id"]})
+
+           Support::HashUtil
+             .fetch_leaf_values_at_path(prepared_record, update_target.id_source)
+             .reject { |id| id.to_s.strip.empty? }
+             .uniq
+             .map { |doc_id| new(event, prepared_record, destination_index_def, update_target, doc_id, destination_index_mapping) }
+         end
+
+         def to_datastore_bulk
+           @to_datastore_bulk ||= [{update: metadata}, update_request]
+         end
+
+         def categorize(response)
+           update = response.fetch("update")
+           status = update.fetch("status")
+
+           if noop_result?(response)
+             noop_error_message = message_from_thrown_painless_exception(update)
+               &.delete_prefix(UPDATE_WAS_NOOP_MESSAGE_PREAMBLE)
+
+             Result.noop_of(self, noop_error_message)
+           elsif (200..299).cover?(status)
+             Result.success_of(self)
+           else
+             error = update.fetch("error")
+
+             further_detail =
+               if (more_detail = error["caused_by"])
+                 # Usually the type/reason details are nested an extra level (`caused_by.caused_by`), but sometimes
+                 # it's not. I think it's nested when the script itself throws an exception, whereas it's unnested
+                 # when the datastore is unable to run the script.
+                 more_detail = more_detail["caused_by"] if more_detail.key?("caused_by")
+                 " (#{more_detail["type"]}: #{more_detail["reason"]})"
+               else
+                 "; full response: #{::JSON.pretty_generate(response)}"
+               end
+
+             Result.failure_of(self, "#{update_target.script_id}(applied to `#{doc_id}`): #{error.fetch("reason")}#{further_detail}")
+           end
+         end
+
+         def type
+           :update
+         end
+
+         def description
+           if update_target.type == event.fetch("type")
+             "#{update_target.type} update"
+           else
+             "#{update_target.type} update (from #{event.fetch("type")})"
+           end
+         end
+
+         def inspect
+           "#<#{self.class.name} event=#{EventID.from_event(event)} target=#{update_target.type}>"
+         end
+         alias_method :to_s, :inspect
+
+         def versioned?
+           # We do not track source event versions when applying derived indexing updates, but we do for
+           # normal indexing updates, so if the update target is for normal indexing it's a versioned operation.
+           update_target.for_normal_indexing?
+         end
+
+         private
+
+         # The number of retries of the update script we'll have the datastore attempt on concurrent modification conflicts.
+         CONFLICT_RETRIES = 5
+
+         def metadata
+           {
+             _index: destination_index_def.index_name_for_writes(prepared_record, timestamp_field_path: update_target.rollover_timestamp_value_source),
+             _id: doc_id,
+             routing: destination_index_def.routing_value_for_prepared_record(
+               prepared_record,
+               route_with_path: update_target.routing_value_source,
+               id_path: update_target.id_source
+             ),
+             retry_on_conflict: CONFLICT_RETRIES
+           }.compact
+         end
+
+         def update_request
+           {
+             script: {id: update_target.script_id, params: script_params},
+             # We use a scripted upsert instead of formatting an upsert document because it makes
+             # for simpler code. To create the upsert document, we'd have to convert the param
+             # values to their "upsert form"--for example, for an `append_only_set` field, the param
+             # value is generally a single scalar value while in an upsert document it would need to
+             # be a list. By using `scripted_upsert`, we can always just pass the params in a consistent
+             # way, and rely on the script to handle the case where it is creating a brand new document.
+             scripted_upsert: true,
+             upsert: {}
+           }
+         end
+
+         def noop_result?(response)
+           update = response.fetch("update")
+           error_message = message_from_thrown_painless_exception(update).to_s
+           error_message.start_with?(UPDATE_WAS_NOOP_MESSAGE_PREAMBLE) || update["result"] == "noop"
+         end
+
+         def message_from_thrown_painless_exception(update)
+           update.dig("error", "caused_by", "caused_by", "reason")
+         end
+
+         def script_params
+           initial_params = update_target.params_for(
+             doc_id: doc_id,
+             event: event,
+             prepared_record: prepared_record
+           )
+
+           # The normal indexing script uses `__counts`. Other indexing scripts (e.g. the ones generated
+           # for derived indexing) do not use `__counts` so there's no point in spending effort on computing
+           # it. Plus, the logic below raises an exception in that case, so it's important we avoid it.
+           return initial_params unless update_target.for_normal_indexing?
+
+           CountAccumulator.merge_list_counts_into(
+             initial_params,
+             mapping: destination_index_mapping,
+             list_counts_field_paths_for_source: destination_index_def.list_counts_field_paths_for_source(update_target.relationship.to_s)
+           )
+         end
+       end
+     end
+   end
+ end
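
For context on the shape this operation produces: `to_datastore_bulk` returns the metadata/request pair that the datastore bulk API expects for a scripted update. A minimal sketch of the resulting payload follows; the index name, script id, document id, and params are invented for illustration and do not come from the gem.

# Hypothetical shape of Update#to_datastore_bulk output (all literal values invented).
[
  {update: {_index: "widgets", _id: "widget-1", routing: "widget-1", retry_on_conflict: 5}},
  {
    script: {id: "update_widget_script", params: {"id" => "widget-1"}},
    scripted_upsert: true, # lets the same script path handle brand-new documents too
    upsert: {}
  }
]

Note that `retry_on_conflict: 5` mirrors `CONFLICT_RETRIES` above, and `scripted_upsert: true` is what the comment in `update_request` is describing.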
@@ -0,0 +1,71 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/indexer/operation/result"
+ require "elastic_graph/support/hash_util"
+ require "elastic_graph/support/memoizable_data"
+
+ module ElasticGraph
+   class Indexer
+     module Operation
+       Upsert = Support::MemoizableData.define(:event, :destination_index_def, :record_preparer) do
+         # @implements Upsert
+
+         def to_datastore_bulk
+           @to_datastore_bulk ||= [{index: metadata}, prepared_record]
+         end
+
+         def categorize(response)
+           index = response.fetch("index")
+           status = index.fetch("status")
+
+           case status
+           when 200..299
+             Result.success_of(self)
+           when 409
+             Result.noop_of(self, index.fetch("error").fetch("reason"))
+           else
+             Result.failure_of(self, index.fetch("error").fetch("reason"))
+           end
+         end
+
+         def doc_id
+           @doc_id ||= event.fetch("id")
+         end
+
+         def type
+           :upsert
+         end
+
+         def description
+           "#{event.fetch("type")} upsert"
+         end
+
+         def versioned?
+           true
+         end
+
+         private
+
+         def metadata
+           @metadata ||= {
+             _index: destination_index_def.index_name_for_writes(prepared_record),
+             _id: doc_id,
+             version: event.fetch("version"),
+             version_type: "external",
+             routing: destination_index_def.routing_value_for_prepared_record(prepared_record)
+           }.compact
+         end
+
+         def prepared_record
+           @prepared_record ||= record_preparer.prepare_for_index(event.fetch("type"), event.fetch("record"))
+         end
+       end
+     end
+   end
+ end
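
The `version`/`version_type: "external"` metadata is what gives upserts their last-write-wins semantics: the datastore applies the write only when the supplied version exceeds the stored one, and reports a conflict (HTTP 409, categorized above as a noop) otherwise. A minimal sketch of the bulk pair, with invented values:

# Hypothetical shape of Upsert#to_datastore_bulk output (all literal values invented).
[
  {index: {_index: "widgets", _id: "widget-1", version: 3, version_type: "external", routing: "widget-1"}},
  {"id" => "widget-1", "name" => "Widget"} # the prepared record becomes the document body
]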
@@ -0,0 +1,137 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/error"
+ require "elastic_graph/indexer/event_id"
+ require "elastic_graph/indexer/indexing_failures_error"
+ require "time"
+
+ module ElasticGraph
+   class Indexer
+     class Processor
+       def initialize(
+         datastore_router:,
+         operation_factory:,
+         logger:,
+         indexing_latency_slo_thresholds_by_timestamp_in_ms:,
+         clock: ::Time
+       )
+         @datastore_router = datastore_router
+         @operation_factory = operation_factory
+         @clock = clock
+         @logger = logger
+         @indexing_latency_slo_thresholds_by_timestamp_in_ms = indexing_latency_slo_thresholds_by_timestamp_in_ms
+       end
+
+       # Processes the given events, writing them to the datastore. If any events are invalid, an
+       # exception will be raised indicating why the events were invalid, but the valid events will
+       # still be written to the datastore. No attempt is made to provide atomic "all or nothing"
+       # behavior.
+       def process(events, refresh_indices: false)
+         failures = process_returning_failures(events, refresh_indices: refresh_indices)
+         return if failures.empty?
+         raise IndexingFailuresError.for(failures: failures, events: events)
+       end
+
+       # Like `process`, but returns failures instead of raising an exception.
+       # The caller is responsible for handling the failures.
+       def process_returning_failures(events, refresh_indices: false)
+         factory_results_by_event = events.to_h { |event| [event, @operation_factory.build(event)] }
+
+         factory_results = factory_results_by_event.values
+
+         bulk_result = @datastore_router.bulk(factory_results.flat_map(&:operations), refresh: refresh_indices)
+         successful_operations = bulk_result.successful_operations(check_failures: false)
+
+         calculate_latency_metrics(successful_operations, bulk_result.noop_results)
+
+         all_failures =
+           factory_results.map(&:failed_event_error).compact +
+           bulk_result.failure_results.map do |result|
+             all_operations_for_event = factory_results_by_event.fetch(result.event).operations
+             FailedEventError.from_failed_operation_result(result, all_operations_for_event.to_set)
+           end
+
+         categorize_failures(all_failures, events)
+       end
+
+       private
+
+       def categorize_failures(failures, events)
+         source_event_versions_by_cluster_by_op = @datastore_router.source_event_versions_in_index(
+           failures.flat_map { |f| f.versioned_operations.to_a }
+         )
+
+         superseded_failures, outstanding_failures = failures.partition do |failure|
+           failure.versioned_operations.size > 0 && failure.versioned_operations.all? do |op|
+             # Under normal conditions, we expect to get back only one version per operation per cluster.
+             # However, when a field used for routing or index rollover has mutated, we can wind up with
+             # multiple copies of the document in different indexes or shards. `source_event_versions_in_index`
+             # returns a list of found versions.
+             #
+             # We only need to consider the largest version when deciding if a failure has been superseded or not.
+             # An event with a larger version is considered to be a full replacement for an earlier event for the
+             # same entity, so if we've processed an event for the same entity with a larger version, we can consider
+             # the failure superseded.
+             max_version_per_cluster = source_event_versions_by_cluster_by_op.fetch(op).values.map(&:max)
+
+             # We only consider an event to be superseded if the document version in the datastore
+             # for all its versioned operations is greater than the version of the failing event.
+             max_version_per_cluster.all? { |v| v && v > failure.version }
+           end
+         end
+
+         if superseded_failures.any?
+           superseded_ids = superseded_failures.map { |f| EventID.from_event(f.event).to_s }
+           @logger.warn(
+             "Ignoring #{superseded_ids.size} malformed event(s) because they have been superseded " \
+             "by corrected events targeting the same id: #{superseded_ids.join(", ")}."
+           )
+         end
+
+         outstanding_failures
+       end
+
+       def calculate_latency_metrics(successful_operations, noop_results)
+         current_time = @clock.now
+         successful_events = successful_operations.map(&:event).to_set
+         noop_events = noop_results.map(&:event).to_set
+         all_operations_events = successful_events + noop_events
+
+         all_operations_events.each do |event|
+           latencies_in_ms_from = {} # : Hash[String, Integer]
+           slo_results = {} # : Hash[String, String]
+
+           latency_timestamps = event.fetch("latency_timestamps", _ = {})
+           latency_timestamps.each do |ts_name, ts_value|
+             metric_value = ((current_time - Time.iso8601(ts_value)) * 1000).round
+
+             latencies_in_ms_from[ts_name] = metric_value
+
+             if (threshold = @indexing_latency_slo_thresholds_by_timestamp_in_ms[ts_name])
+               slo_results[ts_name] = (metric_value >= threshold) ? "bad" : "good"
+             end
+           end
+
+           result = successful_events.include?(event) ? "success" : "noop"
+
+           @logger.info({
+             "message_type" => "ElasticGraphIndexingLatencies",
+             "message_id" => event["message_id"],
+             "event_type" => event.fetch("type"),
+             "event_id" => EventID.from_event(event).to_s,
+             JSON_SCHEMA_VERSION_KEY => event.fetch(JSON_SCHEMA_VERSION_KEY),
+             "latencies_in_ms_from" => latencies_in_ms_from,
+             "slo_results" => slo_results,
+             "result" => result
+           })
+         end
+       end
+     end
+   end
+ end
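
To make the latency math concrete, here is a worked example of the per-timestamp calculation in `calculate_latency_metrics`, assuming a single hypothetical `latency_timestamps` entry and a 1000 ms SLO threshold (both inputs invented for illustration):

# Worked example of the latency/SLO calculation (hypothetical inputs).
require "time"

current_time = ::Time.iso8601("2024-01-01T00:00:01.500Z") # what `@clock.now` might return
ts_value = "2024-01-01T00:00:00Z"                         # one value from `latency_timestamps`
threshold = 1000                                          # configured SLO threshold, in ms

metric_value = ((current_time - ::Time.iso8601(ts_value)) * 1000).round # => 1500
slo_result = (metric_value >= threshold) ? "bad" : "good"               # => "bad"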
@@ -0,0 +1,163 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/error"
+
+ module ElasticGraph
+   class Indexer
+     class RecordPreparer
+       # Provides the ability to get a `RecordPreparer` for a specific JSON schema version.
+       class Factory
+         def initialize(schema_artifacts)
+           @schema_artifacts = schema_artifacts
+
+           scalar_types_by_name = schema_artifacts.runtime_metadata.scalar_types_by_name
+           indexing_preparer_by_scalar_type_name = ::Hash.new do |hash, type_name|
+             hash[type_name] = scalar_types_by_name[type_name]&.load_indexing_preparer&.extension_class
+           end
+
+           @preparers_by_json_schema_version = ::Hash.new do |hash, version|
+             hash[version] = RecordPreparer.new(
+               indexing_preparer_by_scalar_type_name,
+               build_type_metas_from(@schema_artifacts.json_schemas_for(version))
+             )
+           end
+         end
+
+         # Gets the `RecordPreparer` for the given JSON schema version.
+         def for_json_schema_version(json_schema_version)
+           @preparers_by_json_schema_version[json_schema_version]
+         end
+
+         # Gets the `RecordPreparer` for the latest JSON schema version. Intended primarily
+         # for use in tests for convenience.
+         def for_latest_json_schema_version
+           for_json_schema_version(@schema_artifacts.latest_json_schema_version)
+         end
+
+         private
+
+         def build_type_metas_from(json_schemas)
+           json_schemas.fetch("$defs").filter_map do |type, type_def|
+             next if type == EVENT_ENVELOPE_JSON_SCHEMA_NAME
+
+             properties = type_def.fetch("properties") do
+               {} # : ::Hash[::String, untyped]
+             end # : ::Hash[::String, untyped]
+
+             required_fields = type_def.fetch("required") do
+               [] # : ::Array[::String]
+             end # : ::Array[::String]
+
+             eg_meta_by_field_name = properties.filter_map do |prop_name, prop|
+               eg_meta = prop["ElasticGraph"]
+               [prop_name, eg_meta] if eg_meta
+             end.to_h
+
+             TypeMetadata.new(
+               name: type,
+               requires_typename: required_fields.include?("__typename"),
+               eg_meta_by_field_name: eg_meta_by_field_name
+             )
+           end
+         end
+       end
+
+       # An alternate `RecordPreparer` implementation that implements the identity function:
+       # it just echoes back the record it is given.
+       #
+       # This is intended only for use where a `RecordPreparer` is required but the data is not
+       # ultimately going to be sent to the datastore. For example, when an event is invalid, we
+       # still build operations for it, and the operations require a `RecordPreparer`, but we do
+       # not send them to the datastore.
+       module Identity
+         def self.prepare_for_index(type_name, record)
+           record
+         end
+       end
+
+       def initialize(indexing_preparer_by_scalar_type_name, type_metas)
+         @indexing_preparer_by_scalar_type_name = indexing_preparer_by_scalar_type_name
+         @eg_meta_by_field_name_by_concrete_type = type_metas.to_h do |meta|
+           [meta.name, meta.eg_meta_by_field_name]
+         end
+
+         @types_requiring_typename = type_metas.filter_map do |meta|
+           meta.name if meta.requires_typename
+         end.to_set
+       end
+
+       # Prepares the given payload for being indexed into the named index.
+       # This allows any value or field name conversion to happen before we index
+       # the data, to support the few cases where we expect differences between
+       # the payload received by the ElasticGraph indexer, and the payload we
+       # send to the datastore.
+       #
+       # As part of preparing the data, we also drop any `record` fields that
+       # are not defined in our schema. This allows us to handle events that target
+       # multiple indices (e.g. v1 and v2) for the same type. The event can contain
+       # the set union of fields and this will take care of dropping any unsupported
+       # fields before we attempt to index the record.
+       #
+       # Note: this method does not mutate the given `record`. Instead it returns a
+       # copy with any updates applied to it.
+       def prepare_for_index(type_name, record)
+         prepare_value_for_indexing(record, type_name)
+       end
+
+       private
+
+       def prepare_value_for_indexing(value, type_name)
+         type_name = type_name.delete_suffix("!")
+
+         return nil if value.nil?
+
+         if (preparer = @indexing_preparer_by_scalar_type_name[type_name])
+           return (_ = preparer).prepare_for_indexing(value)
+         end
+
+         case value
+         when ::Array
+           element_type_name = type_name.delete_prefix("[").delete_suffix("]")
+           value.map { |v| prepare_value_for_indexing(v, element_type_name) }
+         when ::Hash
+           # `@eg_meta_by_field_name_by_concrete_type` does not have abstract types in it (e.g. type unions).
+           # Instead, it'll have each concrete subtype in it.
+           #
+           # If `type_name` is an abstract type, we need to look at the `__typename` field to see
+           # what the concrete subtype is. `__typename` is required on abstract types and indicates that.
+           eg_meta_by_field_name = @eg_meta_by_field_name_by_concrete_type.fetch(value["__typename"] || type_name)
+
+           value.filter_map do |field_name, field_value|
+             if field_name == "__typename"
+               # We only want to include __typename if we're dealing with a type that requires it.
+               # (This is the case for an abstract type, so it can differentiate between which subtype we have.)
+               [field_name, field_value] if @types_requiring_typename.include?(type_name)
+             elsif (eg_meta = eg_meta_by_field_name[field_name])
+               [eg_meta.fetch("nameInIndex"), prepare_value_for_indexing(field_value, eg_meta.fetch("type"))]
+             end
+           end.to_h
+         else
+           # We won't have a registered preparer for enum types, since those aren't dumped in
+           # runtime metadata `scalar_types_by_name`, and we can just return the value as-is in
+           # this case.
+           value
+         end
+       end
+
+       TypeMetadata = ::Data.define(
+         # The name of the type this metadata object is for.
+         :name,
+         # Indicates if this type requires a `__typename` field.
+         :requires_typename,
+         # The per-field ElasticGraph metadata, keyed by field name.
+         :eg_meta_by_field_name
+       )
+     end
+   end
+ end
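
A small sketch of what `prepare_for_index` does end to end, using hand-built metadata rather than real schema artifacts (the "Widget" type and its fields are invented; normally the metadata comes from the `Factory` above): unknown fields are dropped and field names are translated via `nameInIndex`.

# Hypothetical usage of RecordPreparer with made-up type metadata.
require "set"
require "elastic_graph/indexer/record_preparer"

meta = ElasticGraph::Indexer::RecordPreparer::TypeMetadata.new(
  name: "Widget", # invented type
  requires_typename: false,
  eg_meta_by_field_name: {
    "id" => {"type" => "ID!", "nameInIndex" => "id"},
    "cost" => {"type" => "Int", "nameInIndex" => "cost_amount"}
  }
)

# An empty preparer lookup stands in for the runtime-metadata-driven hash the Factory builds.
preparer = ElasticGraph::Indexer::RecordPreparer.new(::Hash.new(nil), [meta])
preparer.prepare_for_index("Widget", {"id" => "w1", "cost" => 5, "not_in_schema" => true})
# => {"id" => "w1", "cost_amount" => 5} (unknown field dropped, "cost" renamed to "cost_amount")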
@@ -0,0 +1,44 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "json"
+
+ # Defines an RSpec matcher that can be used to validate ElasticGraph events.
+ ::RSpec::Matchers.define :be_a_valid_elastic_graph_event do |for_indexer:|
+   match do |event|
+     result = for_indexer
+       .operation_factory
+       .with(configure_record_validator: block_arg)
+       .build(event)
+
+     @validation_failure = result.failed_event_error
+     !@validation_failure
+   end
+
+   description do
+     "be a valid ElasticGraph event"
+   end
+
+   failure_message do |event|
+     <<~EOS
+       expected the event[1] to #{description}, but it was invalid[2].
+
+       [1] #{::JSON.pretty_generate(event)}
+
+       [2] #{@validation_failure.message}
+     EOS
+   end
+
+   failure_message_when_negated do |event|
+     <<~EOS
+       expected the event[1] not to #{description}, but it was valid.
+
+       [1] #{::JSON.pretty_generate(event)}
+     EOS
+   end
+ end
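
Hypothetical usage from an RSpec suite, assuming an `indexer` helper returning an `ElasticGraph::Indexer` instance is in scope (the helper name is an assumption, not from the gem):

# The matcher builds operations for the event and passes when no validation failure results.
expect(event).to be_a_valid_elastic_graph_event(for_indexer: indexer)
expect(malformed_event).not_to be_a_valid_elastic_graph_event(for_indexer: indexer)

A block given to the matcher is forwarded as `configure_record_validator:` (via RSpec's `block_arg`), allowing a spec to customize record validation.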
@@ -0,0 +1,36 @@
+ # Copyright 2024 Block, Inc.
+ #
+ # Use of this source code is governed by an MIT-style
+ # license that can be found in the LICENSE file or at
+ # https://opensource.org/licenses/MIT.
+ #
+ # frozen_string_literal: true
+
+ require "elastic_graph/constants"
+ require "elastic_graph/support/hash_util"
+ require "json"
+
+ module ElasticGraph
+   class Indexer
+     module TestSupport
+       module Converters
+         # Helper method for testing and generating fake data to convert a factory record into an event
+         def self.upsert_event_for(record)
+           {
+             "op" => "upsert",
+             "id" => record.fetch("id"),
+             "type" => record.fetch("__typename"),
+             "version" => record.fetch("__version"),
+             "record" => record.except("__typename", "__version", "__json_schema_version"),
+             JSON_SCHEMA_VERSION_KEY => record.fetch("__json_schema_version")
+           }
+         end
+
+         # Helper method to create an array of events given an array of records
+         def self.upsert_events_for_records(records)
+           records.map { |record| upsert_event_for(Support::HashUtil.stringify_keys(record)) }
+         end
+       end
+     end
+   end
+ end
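
A quick illustration of the conversion, with an invented factory record (`JSON_SCHEMA_VERSION_KEY` comes from `elastic_graph/constants`; its exact string value is not shown here):

# Hypothetical input/output for Converters.upsert_event_for.
record = {
  "id" => "widget-1",
  "__typename" => "Widget",
  "__version" => 1,
  "__json_schema_version" => 1,
  "name" => "Thingamajig"
}

ElasticGraph::Indexer::TestSupport::Converters.upsert_event_for(record)
# => {"op" => "upsert", "id" => "widget-1", "type" => "Widget", "version" => 1,
#     "record" => {"id" => "widget-1", "name" => "Thingamajig"},
#     JSON_SCHEMA_VERSION_KEY => 1}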