elasticgraph-graphql 0.18.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +3 -0
- data/elasticgraph-graphql.gemspec +23 -0
- data/lib/elastic_graph/graphql/aggregation/composite_grouping_adapter.rb +79 -0
- data/lib/elastic_graph/graphql/aggregation/computation.rb +39 -0
- data/lib/elastic_graph/graphql/aggregation/date_histogram_grouping.rb +83 -0
- data/lib/elastic_graph/graphql/aggregation/field_path_encoder.rb +47 -0
- data/lib/elastic_graph/graphql/aggregation/field_term_grouping.rb +26 -0
- data/lib/elastic_graph/graphql/aggregation/key.rb +87 -0
- data/lib/elastic_graph/graphql/aggregation/nested_sub_aggregation.rb +37 -0
- data/lib/elastic_graph/graphql/aggregation/non_composite_grouping_adapter.rb +129 -0
- data/lib/elastic_graph/graphql/aggregation/path_segment.rb +31 -0
- data/lib/elastic_graph/graphql/aggregation/query.rb +172 -0
- data/lib/elastic_graph/graphql/aggregation/query_adapter.rb +345 -0
- data/lib/elastic_graph/graphql/aggregation/query_optimizer.rb +187 -0
- data/lib/elastic_graph/graphql/aggregation/resolvers/aggregated_values.rb +41 -0
- data/lib/elastic_graph/graphql/aggregation/resolvers/count_detail.rb +44 -0
- data/lib/elastic_graph/graphql/aggregation/resolvers/grouped_by.rb +30 -0
- data/lib/elastic_graph/graphql/aggregation/resolvers/node.rb +64 -0
- data/lib/elastic_graph/graphql/aggregation/resolvers/relay_connection_builder.rb +83 -0
- data/lib/elastic_graph/graphql/aggregation/resolvers/sub_aggregations.rb +82 -0
- data/lib/elastic_graph/graphql/aggregation/script_term_grouping.rb +32 -0
- data/lib/elastic_graph/graphql/aggregation/term_grouping.rb +118 -0
- data/lib/elastic_graph/graphql/client.rb +43 -0
- data/lib/elastic_graph/graphql/config.rb +81 -0
- data/lib/elastic_graph/graphql/datastore_query/document_paginator.rb +100 -0
- data/lib/elastic_graph/graphql/datastore_query/index_expression_builder.rb +142 -0
- data/lib/elastic_graph/graphql/datastore_query/paginator.rb +199 -0
- data/lib/elastic_graph/graphql/datastore_query/routing_picker.rb +239 -0
- data/lib/elastic_graph/graphql/datastore_query.rb +372 -0
- data/lib/elastic_graph/graphql/datastore_response/document.rb +78 -0
- data/lib/elastic_graph/graphql/datastore_response/search_response.rb +79 -0
- data/lib/elastic_graph/graphql/datastore_search_router.rb +151 -0
- data/lib/elastic_graph/graphql/decoded_cursor.rb +120 -0
- data/lib/elastic_graph/graphql/filtering/boolean_query.rb +45 -0
- data/lib/elastic_graph/graphql/filtering/field_path.rb +81 -0
- data/lib/elastic_graph/graphql/filtering/filter_args_translator.rb +58 -0
- data/lib/elastic_graph/graphql/filtering/filter_interpreter.rb +526 -0
- data/lib/elastic_graph/graphql/filtering/filter_value_set_extractor.rb +148 -0
- data/lib/elastic_graph/graphql/filtering/range_query.rb +56 -0
- data/lib/elastic_graph/graphql/http_endpoint.rb +229 -0
- data/lib/elastic_graph/graphql/monkey_patches/schema_field.rb +56 -0
- data/lib/elastic_graph/graphql/monkey_patches/schema_object.rb +48 -0
- data/lib/elastic_graph/graphql/query_adapter/filters.rb +161 -0
- data/lib/elastic_graph/graphql/query_adapter/pagination.rb +27 -0
- data/lib/elastic_graph/graphql/query_adapter/requested_fields.rb +124 -0
- data/lib/elastic_graph/graphql/query_adapter/sort.rb +32 -0
- data/lib/elastic_graph/graphql/query_details_tracker.rb +60 -0
- data/lib/elastic_graph/graphql/query_executor.rb +200 -0
- data/lib/elastic_graph/graphql/resolvers/get_record_field_value.rb +49 -0
- data/lib/elastic_graph/graphql/resolvers/graphql_adapter.rb +114 -0
- data/lib/elastic_graph/graphql/resolvers/list_records.rb +29 -0
- data/lib/elastic_graph/graphql/resolvers/nested_relationships.rb +74 -0
- data/lib/elastic_graph/graphql/resolvers/query_adapter.rb +85 -0
- data/lib/elastic_graph/graphql/resolvers/query_source.rb +46 -0
- data/lib/elastic_graph/graphql/resolvers/relay_connection/array_adapter.rb +71 -0
- data/lib/elastic_graph/graphql/resolvers/relay_connection/generic_adapter.rb +65 -0
- data/lib/elastic_graph/graphql/resolvers/relay_connection/page_info.rb +82 -0
- data/lib/elastic_graph/graphql/resolvers/relay_connection/search_response_adapter_builder.rb +40 -0
- data/lib/elastic_graph/graphql/resolvers/relay_connection.rb +42 -0
- data/lib/elastic_graph/graphql/resolvers/resolvable_value.rb +56 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/cursor.rb +35 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/date.rb +64 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/date_time.rb +60 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/local_time.rb +30 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/longs.rb +47 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/no_op.rb +24 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/time_zone.rb +44 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/untyped.rb +32 -0
- data/lib/elastic_graph/graphql/scalar_coercion_adapters/valid_time_zones.rb +634 -0
- data/lib/elastic_graph/graphql/schema/arguments.rb +78 -0
- data/lib/elastic_graph/graphql/schema/enum_value.rb +30 -0
- data/lib/elastic_graph/graphql/schema/field.rb +147 -0
- data/lib/elastic_graph/graphql/schema/relation_join.rb +103 -0
- data/lib/elastic_graph/graphql/schema/type.rb +263 -0
- data/lib/elastic_graph/graphql/schema.rb +164 -0
- data/lib/elastic_graph/graphql.rb +253 -0
- data/script/dump_time_zones +81 -0
- data/script/dump_time_zones.java +17 -0
- metadata +503 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/key"
|
|
10
|
+
|
|
11
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      # Optimizes datastore searches on behalf of `DatastoreQuery.perform`. A GraphQL query like:
      #
      # ```
      # query WidgetsBySizeAndColor($filter: WidgetFilterInput) {
      #   by_size: widgetAggregations(filter: $filter) {
      #     edges { node { size count } }
      #   }
      #
      #   by_color: widgetAggregations(filter: $filter) {
      #     edges { node { color count } }
      #   }
      # }
      # ```
      #
      # ...builds two separate datastore queries (one for `by_size`, one for `by_color`), even
      # though the datastore allows a single search to carry multiple aggregations. Benchmarking
      # (benchmarks/aggregations_old_vs_new_api.rb) showed two searches are significantly slower
      # than one combined search, so this class merges queries back together when it is safe to
      # do so, and splits the combined responses back apart afterwards. While designed for this
      # aggregations case, the merging logic also applies to non-aggregations cases.
      #
      # To err on the side of safety, queries are merged only when their datastore payloads are
      # byte-for-byte identical once aggregations are excluded. Cleverer merging of slightly
      # differing queries (e.g. `track_total_hits: false` vs `true` could merge to `true`) is
      # possible but significantly more complex and error prone, so we do not do it. We can
      # always improve this further in the future to cover more cases.
      #
      # NOTE: this class assumes `Aggregation::Query` produces aggregation keys from
      # `Aggregation::Query#name` such that `Aggregation::Key.extract_aggregation_name_from` can
      # recover the original name from response keys. If that is violated, subtle bugs result.
      # A test helper (`verify_aggregations_satisfy_optimizer_requirements`) hooked into our unit
      # and integration tests for `DatastoreQuery` verifies the requirement is satisfied.
      class QueryOptimizer
        # Entry point: merges `queries` where possible, yields the merged list for the caller to
        # execute, then unmerges the yielded responses into a hash keyed by the original queries.
        def self.optimize_queries(queries)
          return {} if queries.empty?

          optimizer = new(queries, logger: (_ = queries.first).logger)
          responses_by_merged_query = yield optimizer.merged_queries
          optimizer.unmerge_responses(responses_by_merged_query)
        end

        def initialize(original_queries, logger:)
          @original_queries = original_queries
          @logger = logger

          # Lazily assigns each query a unique name prefix ("1_", "2_", ...) the first time it
          # is looked up, so same-named aggregations from different queries cannot collide.
          counter = 0
          @unique_prefix_by_query = ::Hash.new { |hash, query| hash[query] = "#{counter += 1}_" }
        end

        # The (possibly smaller) list of queries to actually send to the datastore.
        def merged_queries
          original_queries_by_merged_query.keys
        end

        # Converts responses keyed by merged query back to responses keyed by original query.
        def unmerge_responses(responses_by_merged_query)
          original_queries_by_merged_query.each_with_object({}) do |(merged_query, original_queries), responses|
            merged_response = responses_by_merged_query[merged_query]
            next unless merged_response

            # When we only had a single query to start with, we didn't change the query at all,
            # and don't need to unmerge the response.
            if original_queries.size == 1
              responses[original_queries.first] = merged_response
            else
              original_queries.each do |original_query|
                responses[original_query] = unmerge_response(merged_response, original_query)
              end
            end
          end
        end

        private

        def original_queries_by_merged_query
          @original_queries_by_merged_query ||= queries_by_merge_key.values.to_h do |original_queries|
            [merge_queries(original_queries), original_queries]
          end
        end

        NO_AGGREGATIONS = {}

        # Groups queries in the simplest, safest way possible: queries are safe to merge only if
        # their datastore payloads are byte-for-byte identical, excluding aggregations.
        def queries_by_merge_key
          @original_queries.group_by { |query| query.with(aggregations: NO_AGGREGATIONS) }
        end

        def merge_queries(queries)
          # A group of one needs no merging!
          return (_ = queries.first) if queries.one?

          # Two queries may have same-named aggregations with different parameters, but each
          # aggregation on the merged query must have a distinct name. We guarantee that by
          # prepending each query's unique numeric prefix: a `by_size` aggregation on both
          # `query1` and `query2` becomes `1_by_size` and `2_by_size` on the merged query.
          all_aggs_by_name = {}
          queries.each do |query|
            prefix = @unique_prefix_by_query[query]
            query.aggregations.each_value do |agg|
              renamed_agg = agg.with(name: "#{prefix}#{agg.name}")
              all_aggs_by_name[renamed_agg.name] = renamed_agg
            end
          end

          @logger.info({
            "message_type" => "AggregationQueryOptimizerMergedQueries",
            "query_count" => queries.size,
            "aggregation_count" => all_aggs_by_name.size,
            "aggregation_names" => all_aggs_by_name.keys.sort
          })

          (_ = queries.first).with(aggregations: all_aggs_by_name)
        end

        # "Unmerges" a response to convert it to what it would have been if we hadn't merged
        # queries. To do that we (1) filter the aggregations down to just the ones belonging to
        # `original_query`, and (2) strip the query-specific prefix (e.g. `1_`) from the parts
        # of the response that contain the aggregation name.
        def unmerge_response(response_from_merged_query, original_query)
          aggs = response_from_merged_query["aggregations"]
          # If there are no aggregations, there's nothing to unmerge--just return it as is.
          return response_from_merged_query unless aggs

          prefix = @unique_prefix_by_query[original_query]
          agg_names = original_query.aggregations.keys.map { |name| "#{prefix}#{name}" }.to_set

          filtered_aggs = aggs.each_with_object({}) do |(key, agg_data), filtered|
            next unless agg_names.include?(Key.extract_aggregation_name_from(key))
            filtered[key.delete_prefix(prefix)] = strip_prefix_from_agg_data(agg_data, prefix, key)
          end

          response_from_merged_query.merge("aggregations" => filtered_aggs)
        end

        # Recursively removes `prefix` from nested hash keys that embed the prefixed
        # aggregation name (`key`), leaving all other response data untouched.
        def strip_prefix_from_agg_data(agg_data, prefix, key)
          if agg_data.is_a?(::Hash)
            agg_data.to_h do |sub_key, sub_data|
              sub_key = sub_key.delete_prefix(prefix) if sub_key.start_with?(key)
              [sub_key, strip_prefix_from_agg_data(sub_data, prefix, key)]
            end
          elsif agg_data.is_a?(::Array)
            agg_data.map { |element| strip_prefix_from_agg_data(element, prefix, key) }
          else
            agg_data
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/key"
|
|
10
|
+
require "elastic_graph/graphql/aggregation/path_segment"
|
|
11
|
+
|
|
12
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      module Resolvers
        # Resolves the fields under an aggregation node's `aggregated_values`, walking down
        # object fields until it reaches a leaf, then looking the computed value up in the
        # datastore response bucket.
        class AggregatedValues < ::Data.define(:aggregation_name, :bucket, :field_path)
          def can_resolve?(field:, object:)
            true
          end

          def resolve(field:, object:, args:, context:, lookahead:)
            if field.type.object?
              # Still traversing an object field: descend by extending the field path.
              extended_path = field_path + [PathSegment.for(field: field, lookahead: lookahead)]
              return with(field_path: extended_path)
            end

            value_key = Key::AggregatedValue.new(
              aggregation_name: aggregation_name,
              field_path: field_path.map(&:name_in_graphql_query),
              function_name: field.name_in_index.to_s
            )

            agg_value = bucket.fetch(value_key.encode)

            # Aggregated value results always have a `value` key; in addition, for `date` fields
            # they also have a `value_as_string`. In that case, `value` is a number (e.g. ms since
            # epoch) whereas `value_as_string` is a formatted value. ElasticGraph works with date
            # types as formatted strings, so we prefer `value_as_string` when it is present.
            agg_value.fetch("value_as_string") { agg_value.fetch("value") }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/resolvers/resolvable_value"
|
|
10
|
+
|
|
11
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      module Resolvers
        # Resolves the detailed `count` sub-fields of a sub-aggregation. It's an object because
        # the count we get from the datastore may not be accurate and we have multiple
        # fields we expose to give the client control over how much detail they want.
        #
        # Note: for now our resolver logic only uses the bucket fields returned to us by the datastore,
        # but I believe we may have some opportunities to provide more accurate responses to these when custom shard
        # routing and/or index rollover are in use. For example, when grouping on the custom shard routing field,
        # we know that no term bucket will have data from more than one shard. The datastore isn't aware of our
        # custom shard routing logic, though, and can't account for that in what it returns, so it may indicate
        # a potential error upper bound where we can deduce there is none.
        class CountDetail < GraphQL::Resolvers::ResolvableValue.new(:bucket)
          # The (potentially approximate) `doc_count` returned by the datastore for a bucket.
          # Memoized since it backs the other two methods.
          def approximate_value
            @approximate_value ||= bucket.fetch("doc_count")
          end

          # The `doc_count`, if we know it was exact. (Otherwise, returns `nil`.)
          # It is known to be exact when the bucket's reported error bound is zero,
          # which makes `upper_bound` equal `approximate_value`.
          def exact_value
            approximate_value if approximate_value == upper_bound
          end

          # The upper bound on how large the doc count could be: the bucket's reported
          # `doc_count_error_upper_bound` added to the (approximate) `doc_count`.
          def upper_bound
            @upper_bound ||= bucket.fetch("doc_count_error_upper_bound") + approximate_value
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/field_path_encoder"
|
|
10
|
+
|
|
11
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      module Resolvers
        # Resolves the fields under an aggregation node's `grouped_by`, walking down object
        # fields until it reaches a leaf, then reading the grouping value from the bucket's
        # `key` hash.
        class GroupedBy < ::Data.define(:bucket, :field_path)
          def can_resolve?(field:, object:)
            true
          end

          def resolve(field:, object:, args:, context:, lookahead:)
            extended_path = field_path + [PathSegment.for(field: field, lookahead: lookahead)]

            if field.type.object?
              # Still traversing an object field: descend by extending the field path.
              with(field_path: extended_path)
            else
              encoded_path = FieldPathEncoder.encode(extended_path.map(&:name_in_graphql_query))
              bucket.fetch("key").fetch(encoded_path)
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/resolvers/aggregated_values"
|
|
10
|
+
require "elastic_graph/graphql/aggregation/resolvers/grouped_by"
|
|
11
|
+
require "elastic_graph/graphql/decoded_cursor"
|
|
12
|
+
require "elastic_graph/graphql/resolvers/resolvable_value"
|
|
13
|
+
|
|
14
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      module Resolvers
        # Represents one aggregation bucket as a relay connection node, exposing resolvers
        # for its grouping values, computed values, sub-aggregations, counts, and cursor.
        class Node < GraphQL::Resolvers::ResolvableValue.new(:query, :parent_queries, :bucket, :field_path)
          # This file defines a subclass of `Node` and can't be loaded until `Node` has been defined.
          require "elastic_graph/graphql/aggregation/resolvers/sub_aggregations"

          # Resolver for the node's `grouped_by` field, backed by this bucket.
          def grouped_by
            @grouped_by ||= GroupedBy.new(bucket, field_path)
          end

          # Resolver for the node's `aggregated_values` field, backed by this bucket.
          def aggregated_values
            @aggregated_values ||= AggregatedValues.new(query.name, bucket, field_path)
          end

          # Resolver for the node's `sub_aggregations` field; this query becomes part of
          # the parent query chain for the nested sub-aggregation resolvers.
          def sub_aggregations
            @sub_aggregations ||= SubAggregations.new(
              schema_element_names,
              query.sub_aggregations,
              parent_queries + [query],
              bucket,
              field_path
            )
          end

          # The bucket's `doc_count` as reported by the datastore.
          def count
            bucket.fetch("doc_count")
          end

          # Detailed count info (approximate/exact/upper-bound) for this bucket.
          def count_detail
            @count_detail ||= CountDetail.new(schema_element_names, bucket)
          end

          def cursor
            # If there's no `key`, then we aren't grouping by anything. We just have a single aggregation
            # bucket containing computed values over the entire set of filtered documents. In that case,
            # we still need a pagination cursor but we have no "key" to speak of that we can encode. Instead,
            # we use the special SINGLETON cursor defined for this case.
            @cursor ||=
              if (key = bucket.fetch("key")).empty?
                DecodedCursor::SINGLETON
              else
                DecodedCursor.new(key)
              end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/resolvers/node"
|
|
10
|
+
require "elastic_graph/graphql/datastore_query"
|
|
11
|
+
require "elastic_graph/graphql/resolvers/relay_connection/generic_adapter"
|
|
12
|
+
|
|
13
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      module Resolvers
        # Builds relay connection adapters over aggregation buckets, either straight from a
        # datastore search response or from an explicitly provided list of buckets.
        module RelayConnectionBuilder
          def self.build_from_search_response(query:, search_response:, schema_element_names:)
            build_from_buckets(query: query, parent_queries: [], schema_element_names: schema_element_names) do
              extract_buckets_from(search_response, for_query: query)
            end
          end

          def self.build_from_buckets(query:, parent_queries:, schema_element_names:, field_path: [], &build_buckets)
            GraphQL::Resolvers::RelayConnection::GenericAdapter.new(
              schema_element_names: schema_element_names,
              raw_nodes: raw_nodes_for(query, parent_queries, schema_element_names, field_path, &build_buckets),
              paginator: query.paginator,
              get_total_edge_count: -> {},
              to_sort_value: ->(node, decoded_cursor) do
                query.groupings.map do |grouping|
                  DatastoreQuery::Paginator::SortValue.new(
                    from_item: (_ = node).bucket.fetch("key").fetch(grouping.key),
                    from_cursor: decoded_cursor.sort_values.fetch(grouping.key),
                    sort_direction: :asc # we don't yet support any alternate sorting.
                  )
                end
              end
            )
          end

          private_class_method def self.raw_nodes_for(query, parent_queries, schema_element_names, field_path, &build_buckets)
            # The `DecodedCursor::SINGLETON` is a special case, so handle it here.
            return [] if query.paginator.paginated_from_singleton_cursor?

            build_buckets.call.map do |bucket|
              Node.new(
                schema_element_names: schema_element_names,
                query: query,
                parent_queries: parent_queries,
                bucket: bucket,
                field_path: field_path
              )
            end
          end

          private_class_method def self.extract_buckets_from(search_response, for_query:)
            buckets = search_response.raw_data.dig("aggregations", for_query.name, "buckets")
            buckets || [build_bucket(for_query, search_response.raw_data)]
          end

          # Synthesizes the single implicit bucket used when the response carries no `buckets`
          # list for this query's aggregation.
          private_class_method def self.build_bucket(query, response)
            bucket = {
              "key" => query.groupings.each_with_object({}) { |grouping, key| key[grouping.key] = nil },
              "doc_count" => response.dig("hits", "total", "value") || 0
            }

            # Seed each computation with its "empty bucket" value so resolvers find the keys they expect.
            query.computations.each do |computation|
              bucket[computation.key(aggregation_name: query.name)] = {"value" => computation.detail.empty_bucket_value}
            end

            bucket.merge(response["aggregations"] || {})
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/composite_grouping_adapter"
|
|
10
|
+
require "elastic_graph/graphql/aggregation/key"
|
|
11
|
+
require "elastic_graph/graphql/aggregation/non_composite_grouping_adapter"
|
|
12
|
+
require "elastic_graph/graphql/aggregation/resolvers/count_detail"
|
|
13
|
+
require "elastic_graph/graphql/decoded_cursor"
|
|
14
|
+
require "elastic_graph/graphql/resolvers/resolvable_value"
|
|
15
|
+
|
|
16
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      module Resolvers
        # Resolves the `sub_aggregations` of an aggregation bucket, walking down intermediate
        # object fields until it reaches a nested sub-aggregation connection, then building a
        # relay connection over the matching buckets extracted from the datastore response.
        class SubAggregations < ::Data.define(:schema_element_names, :sub_aggregations, :parent_queries, :sub_aggs_by_name, :field_path)
          def can_resolve?(field:, object:)
            true
          end

          def resolve(field:, object:, args:, context:, lookahead:)
            path_segment = PathSegment.for(field: field, lookahead: lookahead)
            new_field_path = field_path + [path_segment]
            # Keep descending through intermediate object fields until we hit the connection.
            return with(field_path: new_field_path) unless field.type.elasticgraph_category == :nested_sub_aggregation_connection

            aggregation_name = path_segment.name_in_graphql_query
            sub_agg_query = sub_aggregations.fetch(aggregation_name).query

            RelayConnectionBuilder.build_from_buckets(
              query: sub_agg_query,
              parent_queries: parent_queries,
              schema_element_names: schema_element_names,
              field_path: new_field_path
            ) { extract_buckets(aggregation_name, args) }
          end

          private

          # Locates the sub-aggregation payload for `aggregation_name` in the response and
          # normalizes it to a list of buckets.
          def extract_buckets(aggregation_name, args)
            # When the client passes `first: 0`, we omit the sub-aggregation from the query body entirely,
            # and it won't be in `sub_aggs_by_name`. Instead, we can just return an empty list of buckets.
            return [] if args[schema_element_names.first] == 0

            sub_agg = sub_aggs_by_name.fetch(Key.encode(parent_queries.map(&:name) + [aggregation_name]))
            meta = sub_agg.fetch("meta")

            # When the sub-aggregation node of the GraphQL query has a `filter` argument, the direct sub-aggregation returned by
            # the datastore will be the unfiltered sub-aggregation. To get the filtered sub-aggregation (the data our client
            # actually cares about), we have a sub-aggregation under that.
            #
            # To indicate this case, our query includes a `meta` field which tells us which sub-key has the actual data we care about in it:
            # - If grouping has been applied (leading to multiple buckets): `meta: {buckets_path: [path, to, bucket]}`
            # - If no grouping has been applied (leading to a single bucket): `meta: {bucket_path: [path, to, bucket]}`
            if (buckets_path = meta["buckets_path"])
              bucket_adapter = BUCKET_ADAPTERS.fetch(sub_agg.dig("meta", "adapter"))
              bucket_adapter.prepare_response_buckets(sub_agg, buckets_path, meta)
            else
              singleton_bucket =
                if (bucket_path = meta["bucket_path"])
                  sub_agg.dig(*bucket_path)
                else
                  sub_agg
                end

              # When we have a single ungrouped bucket, we never have any error on the `doc_count`.
              # Our resolver logic expects it to be present, though.
              [singleton_bucket.merge({"doc_count_error_upper_bound" => 0})]
            end
          end

          # Maps each grouping adapter's `meta_name` to the adapter itself, so the response
          # handling above can pick the adapter the query was built with.
          BUCKET_ADAPTERS = [CompositeGroupingAdapter, NonCompositeGroupingAdapter].to_h do |adapter|
            [adapter.meta_name, adapter]
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Use of this source code is governed by an MIT-style
|
|
4
|
+
# license that can be found in the LICENSE file or at
|
|
5
|
+
# https://opensource.org/licenses/MIT.
|
|
6
|
+
#
|
|
7
|
+
# frozen_string_literal: true
|
|
8
|
+
|
|
9
|
+
require "elastic_graph/graphql/aggregation/term_grouping"
|
|
10
|
+
|
|
11
|
+
module ElasticGraph
  class GraphQL
    module Aggregation
      # Used for term groupings that use a script instead of a field
      class ScriptTermGrouping < Support::MemoizableData.define(:field_path, :script_id, :params)
        # @dynamic field_path
        include TermGrouping

        private

        # Builds the script portion of the datastore terms clause: delegates grouping to the
        # stored script identified by `script_id`, passing the encoded index field path to the
        # script through its `field` param (merged over any caller-supplied `params`).
        def terms_subclause
          {
            "script" => {
              "id" => script_id,
              "params" => params.merge({"field" => encoded_index_field_path})
            }
          }
        end
      end
    end
  end
end
|