RubyGems - elasticgraph-graphql - Versions diffs - 0.18.0.0 - Mend

elasticgraph-graphql 0.18.0.0

Files changed (81) hide show

data/lib/elastic_graph/graphql/aggregation/term_grouping.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+require "elastic_graph/graphql/aggregation/field_path_encoder"
+require "elastic_graph/support/memoizable_data"
+module ElasticGraph
+  class GraphQL
+    module Aggregation
+      # Represents a grouping on a term.
+      # For the relevant Elasticsearch docs, see:
+      # https://www.elastic.co/guide/en/elasticsearch/reference/7.12/search-aggregations-bucket-terms-aggregation.html
+      # https://www.elastic.co/guide/en/elasticsearch/reference/7.12/search-aggregations-bucket-composite-aggregation.html#_terms
+      module TermGrouping
+        def key
+          @key ||= FieldPathEncoder.encode(field_path.map(&:name_in_graphql_query))
+        end
+        def encoded_index_field_path
+          @encoded_index_field_path ||= FieldPathEncoder.join(field_path.filter_map(&:name_in_index))
+        end
+        def composite_clause(grouping_options: {})
+          {"terms" => terms_subclause.merge(grouping_options)}
+        end
+        def non_composite_clause_for(query)
+          clause_value = work_around_elasticsearch_bug(terms_subclause)
+          {
+            "terms" => clause_value.merge({
+              "size" => query.paginator.desired_page_size,
+              "show_term_doc_count_error" => query.needs_doc_count_error
+            })
+          }
+        end
+        # Metadata hash handed out by `inner_meta`. It is a single object shared by everything that
+        # includes this module, and it is not frozen.
+        INNER_META = {"key_path" => ["key"], "merge_into_bucket" => {}}
+        # Returns the shared `INNER_META` hash (the same object on every call, not a copy).
+        def inner_meta
+          INNER_META
+        end
+        private
+        # Here we force the `collect_mode` to `depth_first`. Without doing that, we've observed that some of our acceptance
+        # specs fail on CI when running against Elasticsearch 8.11 with an error like:
+        #
+        # ```
+        # {
+        #   "root_cause": [
+        #     {
+        #       "type": "runtime_exception",
+        #       "reason": "score for different docid, nesting an aggregation under a children aggregation and terms aggregation with collect mode breadth_first isn't possible"
+        #     }
+        #   ],
+        #   "type": "search_phase_execution_exception",
+        #   "reason": "all shards failed",
+        #   "phase": "query",
+        #   "grouped": true,
+        #   "failed_shards": [
+        #     {
+        #       "shard": 0,
+        #       "index": "teams_camel",
+        #       "node": "pDXJzLTsRJCRjKe83DqipA",
+        #       "reason": {
+        #         "type": "runtime_exception",
+        #         "reason": "score for different docid, nesting an aggregation under a children aggregation and terms aggregation with collect mode breadth_first isn't possible"
+        #       }
+        #     }
+        #   ],
+        #   "caused_by": {
+        #     "type": "runtime_exception",
+        #     "reason": "score for different docid, nesting an aggregation under a children aggregation and terms aggregation with collect mode breadth_first isn't possible",
+        #     "caused_by": {
+        #       "type": "runtime_exception",
+        #       "reason": "score for different docid, nesting an aggregation under a children aggregation and terms aggregation with collect mode breadth_first isn't possible"
+        #     }
+        #   }
+        # }
+        # ```
+        #
+        # This specific exception message was introduced in https://github.com/elastic/elasticsearch/pull/89993, but that was done to provide
+        # a better error than a NullPointerException (which is what used to happen). This error also appears to be non-deterministic; I wasn't
+        # able to reproduce the CI failure locally until I forced `"collect_mode" => "breadth_first"`, at which point I did see the same error
+        # locally. The Elasticsearch docs[^1] mention that a heuristic (partially based on if a field's cardinality is known!) is used to pick
+        # whether `breadth_first` or `depth_first` is used when `collect_mode` is not specified:
+        #
+        # > The `breadth_first` is the default mode for fields with a cardinality bigger than the requested size or when the cardinality is unknown
+        # > (numeric fields or scripts for instance).
+        #
+        # In addition, the docs[^2] make it clear that `depth_first` is usually what you want:
+        #
+        # > The strategy we outlined previously—building the tree fully and then pruning—is called depth-first and it is the default.
+        # > Depth-first works well for the majority of aggregations, but can fall apart in situations like our actors and costars example.
+        # >
+        # > ...
+        # >
+        # > Breadth-first should be used only when you expect more buckets to be generated than documents landing in the buckets.
+        #
+        # So, for now we are forcing the collect mode to `depth_first`, as it avoids an issue with Elasticsearch and is a generally
+        # sane default. It may fall over in the case breadth-first is intended for, but we can cross that bridge when it comes.
+        #
+        # Long term, we're hoping to switch sub-aggregations to use a `composite` aggregation instead of `terms`, rendering this moot.
+        #
+        # [^1]: https://www.elastic.co/guide/en/elasticsearch/reference/8.11/search-aggregations-bucket-terms-aggregation.html#search-aggregations-bucket-terms-aggregation-collect
+        # [^2]: https://www.elastic.co/guide/en/elasticsearch/guide/current/_preventing_combinatorial_explosions.html#_depth_first_versus_breadth_first
+        def work_around_elasticsearch_bug(terms_clause)
+          terms_clause.merge({"collect_mode" => "depth_first"})
+        end
+      end
+    end
+  end
+end

data/lib/elastic_graph/graphql/client.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+module ElasticGraph
+  class GraphQL
+    # Represents a client of an ElasticGraph GraphQL endpoint.
+    # `name` and `source_description` can really be any string, but `name` is
+    # meant to be a friendly/human readable string (such as a service name)
+    # whereas `source_description` is meant to be an opaque string describing
+    # where `name` came from.
+    class Client < Data.define(:source_description, :name)
+      # `Data.define` provides the following methods:
+      # @dynamic initialize, name, source_description, with
+      ANONYMOUS = new("(anonymous)", "(anonymous)")
+      ELASTICGRAPH_INTERNAL = new("(ElasticGraphInternal)", "(ElasticGraphInternal)")
+      def description
+        if source_description == name
+          name
+        else
+          "#{name} (#{source_description})"
+        end
+      end
+      # Default resolver used to determine the client for a given HTTP request.
+      # Also defines the interface of a client resolver. (This is why we define `initialize`).
+      class DefaultResolver
+        def initialize(config)
+        end
+        def resolve(http_request)
+          Client::ANONYMOUS
+        end
+      end
+    end
+  end
+end

data/lib/elastic_graph/graphql/config.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+require "elastic_graph/error"
+require "elastic_graph/graphql/client"
+require "elastic_graph/schema_artifacts/runtime_metadata/extension_loader"
+module ElasticGraph
+  class GraphQL
+    class Config < Data.define(
+      # Determines the size of our datastore search requests if the query does not specify.
+      :default_page_size,
+      # Determines the maximum size of a requested page. If the client requests a page larger
+      # than this value, `max_page_size` elements will be returned instead.
+      :max_page_size,
+      # Queries that take longer than this configured threshold will have a sanitized version logged.
+      :slow_query_latency_warning_threshold_in_ms,
+      # Object used to identify the client of a GraphQL query based on the HTTP request.
+      :client_resolver,
+      # Array of modules that will be extended onto the `GraphQL` instance to support extension libraries.
+      :extension_modules,
+      # Contains any additional settings that were in the settings file beyond settings that are expected as part of ElasticGraph
+      # itself. Extensions are free to use these extra settings.
+      :extension_settings
+    )
+      # Builds a `Config` from the entire parsed settings hash.
+      #
+      # Reads the `graphql` section of `entire_parsed_yaml` (via `fetch`, so the section must be present).
+      # Raises `ConfigError` when that section contains keys outside `EXPECTED_KEYS`, or when one of the
+      # listed `extension_modules` does not load as a plain module.
+      def self.from_parsed_yaml(entire_parsed_yaml)
+        parsed_yaml = entire_parsed_yaml.fetch("graphql")
+        # Reject unrecognized settings up front instead of silently ignoring them.
+        extra_keys = parsed_yaml.keys - EXPECTED_KEYS
+        unless extra_keys.empty?
+          raise ConfigError, "Unknown `graphql` config settings: #{extra_keys.join(", ")}"
+        end
+        extension_loader = SchemaArtifacts::RuntimeMetadata::ExtensionLoader.new(::Module.new)
+        # `extension_modules` is optional; each entry must provide `extension_name` and `require_path`.
+        extension_mods = parsed_yaml.fetch("extension_modules", []).map do |mod_hash|
+          extension_loader.load(mod_hash.fetch("extension_name"), from: mod_hash.fetch("require_path"), config: {}).extension_class.tap do |mod|
+            # `instance_of?` (rather than `is_a?`) rejects classes, since a `Class` is also a `Module`.
+            unless mod.instance_of?(::Module)
+              raise ConfigError, "`#{mod_hash.fetch("extension_name")}` is not a module, but all application extension modules must be modules."
+            end
+          end
+        end
+        new(
+          # The two page size settings are required.
+          default_page_size: parsed_yaml.fetch("default_page_size"),
+          max_page_size: parsed_yaml.fetch("max_page_size"),
+          # Optional; falls back to 5000 ms when absent (or `nil`/`false`).
+          slow_query_latency_warning_threshold_in_ms: parsed_yaml["slow_query_latency_warning_threshold_in_ms"] || 5000,
+          client_resolver: load_client_resolver(parsed_yaml),
+          extension_modules: extension_mods,
+          # Every root-level key that is not a standard ElasticGraph key is kept as an extension setting.
+          extension_settings: entire_parsed_yaml.except(*ELASTICGRAPH_CONFIG_KEYS)
+        )
+      end
+      # The keys we expect under `graphql`.
+      EXPECTED_KEYS = members.map(&:to_s)
+      # The standard ElasticGraph root config setting keys; anything else is assumed to be extension settings.
+      ELASTICGRAPH_CONFIG_KEYS = %w[graphql indexer logger datastore schema_artifacts]
+      # Instantiates the client resolver named by the `client_resolver` entry of the `graphql` settings,
+      # or a `Client::DefaultResolver` when there is no such entry. Settings other than `extension_name`
+      # and `require_path` are passed to the extension loader as the resolver's config.
+      private_class_method def self.load_client_resolver(parsed_yaml)
+        config = parsed_yaml.fetch("client_resolver") do
+          # `return` inside this block exits `load_client_resolver` itself, not just the block.
+          return Client::DefaultResolver.new({})
+        end
+        client_resolver_loader = SchemaArtifacts::RuntimeMetadata::ExtensionLoader.new(Client::DefaultResolver)
+        extension = client_resolver_loader.load(
+          config.fetch("extension_name"),
+          from: config.fetch("require_path"),
+          config: config.except("extension_name", "require_path")
+        )
+        extension_class = extension.extension_class # : ::Class
+        __skip__ = extension_class.new(extension.extension_config)
+      end
+    end
+  end
+end

data/lib/elastic_graph/graphql/datastore_query/document_paginator.rb ADDED Viewed

@@ -0,0 +1,100 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+require "elastic_graph/error"
+require "graphql"
+module ElasticGraph
+  class GraphQL
+    class DatastoreQuery
+      # Contains query logic related to pagination. Mostly delegates to `Paginator`, which
+      # contains most of the logic. This merely adapts the `Paginator` to the needs of document
+      # pagination. (Paginator also supports aggregation bucket pagination.)
+      class DocumentPaginator < Support::MemoizableData.define(
+        :sort_clauses, :paginator, :decoded_cursor_factory, :schema_element_names,
+        # `individual_docs_needed`: when false, we request a `size` of 0. Set to `true` when the client is
+        # requesting any document fields, or if we need documents to compute any parts of the `PageInfo`.
+        :individual_docs_needed,
+        # `total_document_count_needed`: when false, `track_total_hits` will be 0 in our datastore query.
+        # This will prevent the datastore from doing extra work to get an accurate count
+        :total_document_count_needed
+      )
+        # Builds a hash containing the portions of a datastore search body related to pagination.
+        def to_datastore_body
+          {
+            size: effective_size,
+            sort: effective_sort,
+            search_after: search_after,
+            track_total_hits: total_document_count_needed
+          }.reject { |key, value| Array(value).empty? }
+        end
+        # Memoized sort clauses: each of `sort_clauses` with an explicit `missing` option added,
+        # chosen from the clause's `order` (`_first` for `asc`, `_last` otherwise). All other
+        # options on a clause are kept as given.
+        def sort
+          @sort ||= sort_clauses.map do |clause|
+            clause.transform_values do |options|
+              # As per the Elasticsearch docs[^1] missing/null values get sorted last by default, but we can control
+              # it here. We want to control it here to make our sorting behavior more consistent in a couple ways:
+              #
+              # 1. We want _document_ sorting and _aggregation_ sorting to behave the same. Aggregation sorting puts
+              #    missing value buckets first when sorting ascending and last when sorting descending[^2]. Note that in
+              #    Elasticsearch 7.16[^3] and above, you can control if missing buckets go first or last, but below that
+              #    version you have no control. Here we match that behavior.
+              # 2. Clients are likely to expect that descending sorting will produce a list in reverse order from what
+              #    ascending sorting produces, but with the default behavior (missing/null values get sorted last), this
+              #    is not the case. We have to use the opposite `missing` option when the `order` is the opposite.
+              #
+              # [^1]: https://www.elastic.co/guide/en/elasticsearch/reference/7.10/sort-search-results.html#_missing_values
+              # [^2]: https://www.elastic.co/guide/en/elasticsearch/reference/7.10/search-aggregations-bucket-composite-aggregation.html#_missing_bucket
+              # [^3]: https://www.elastic.co/guide/en/elasticsearch/reference/7.16/search-aggregations-bucket-composite-aggregation.html#_missing_bucket
+              missing = (options.fetch("order") == "asc") ? "_first" : "_last"
+              options.merge({"missing" => missing})
+            end
+          end
+        end
+        private
+        def effective_size
+          individual_docs_needed ? paginator.requested_page_size : 0
+        end
+        def effective_sort
+          return [] unless effective_size > 0
+          paginator.search_in_reverse? ? reverse_sort : sort
+        end
+        DIRECTION_OPPOSITES = {"asc" => "desc", "desc" => "asc"}.freeze
+        MISSING_OPPOSITES = {"_first" => "_last", "_last" => "_first"}.freeze
+        def reverse_sort
+          @reverse_sort ||= sort.map do |sort_clause|
+            sort_clause.transform_values do |options|
+              {
+                "order" => DIRECTION_OPPOSITES.fetch(options.fetch("order")),
+                "missing" => MISSING_OPPOSITES.fetch(options.fetch("missing"))
+              }
+            end
+          end
+        end
+        # Returns the `search_after` values for the datastore, or `nil` when the paginator has no
+        # cursor to search after. The values are the cursor's sort values, listed in the order of
+        # `decoded_cursor_factory.sort_fields`.
+        #
+        # Raises `::GraphQL::ExecutionError` when the cursor has no value for one of the sort fields.
+        def search_after
+          paginator.search_after&.then do |cursor|
+            decoded_cursor_factory.sort_fields.map do |field|
+              cursor.sort_values.fetch(field) do
+                raise ::GraphQL::ExecutionError, "`#{cursor.encode}` is not a valid cursor for the current `#{schema_element_names.order_by}` argument."
+              end
+            end
+          end
+        end
+      end
+      # `DatastoreQuery::DocumentPaginator` exists only for use by `DatastoreQuery` and is effectively private.
+      private_constant :DocumentPaginator
+    end
+  end
+end

data/lib/elastic_graph/graphql/datastore_query/index_expression_builder.rb ADDED Viewed

@@ -0,0 +1,142 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+require "elastic_graph/graphql/filtering/filter_value_set_extractor"
+require "elastic_graph/support/time_set"
+module ElasticGraph
+  class GraphQL
+    class DatastoreQuery
+      # Responsible for building a search index expression for a specific query based on the filters.
+      class IndexExpressionBuilder
+        # Sets up the `FilterValueSetExtractor` used to turn timestamp filters into `Support::TimeSet`s.
+        #
+        # The block given to the extractor handles the range operators and `equal_to_any_of`. The `case`
+        # has no `else`, so for any other operator the block returns `nil`.
+        def initialize(schema_names:)
+          @filter_value_set_extractor = Filtering::FilterValueSetExtractor.new(schema_names, Support::TimeSet::ALL) do |operator, filter_value|
+            case operator
+            when :gt, :gte, :lt, :lte
+              if date_string?(filter_value)
+                # Here we translate into a range of time objects. When translating dates to times,
+                # we need to use an appropriate time suffix:
+                #
+                # - `> 2024-04-01` == `> 2024-04-01T23:59:59.999Z`
+                # - `≥ 2024-04-01` == `≥ 2024-04-01T00:00:00Z`
+                # - `< 2024-04-01` == `< 2024-04-01T00:00:00Z`
+                # - `≤ 2024-04-01` == `≤ 2024-04-01T23:59:59.999Z`
+                time_suffix = (operator == :gt || operator == :lte) ? "T23:59:59.999Z" : "T00:00:00Z"
+                Support::TimeSet.of_range(operator => ::Time.iso8601(filter_value + time_suffix))
+              else
+                # Already a full timestamp, so it is parsed as-is.
+                Support::TimeSet.of_range(operator => ::Time.iso8601(filter_value))
+              end
+            when :equal_to_any_of
+              # This calls `.compact` to remove `nil` timestamp values.
+              ranges = filter_value.compact.map do |iso8601_string|
+                if date_string?(iso8601_string)
+                  # When we have a date string, build a range for the entire day.
+                  start_of_day = ::Time.iso8601("#{iso8601_string}T00:00:00Z")
+                  end_of_day = ::Time.iso8601("#{iso8601_string}T23:59:59.999Z")
+                  ::Range.new(start_of_day, end_of_day)
+                else
+                  # A full timestamp becomes a range covering just that single instant.
+                  value = ::Time.iso8601(iso8601_string)
+                  ::Range.new(value, value)
+                end
+              end
+              Support::TimeSet.of_range_objects(ranges)
+            end
+          end
+        end
+        # Returns an index_definition expression string to use for searches. This string can specify
+        # multiple indices, use wildcards, etc. For info about what is supported, see:
+        # https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-index.html
+        def determine_search_index_expression(filter_hashes, search_index_definitions, require_indices:)
+          # Here we sort the index expressions. It won't change the behavior in the datastore, but
+          # makes the return value here deterministic which makes it easier to assert on in tests.
+          search_index_definitions.sort_by(&:name).reduce(IndexExpression::EMPTY) do |index_expression, index_def|
+            index_expression + index_expression_for(filter_hashes, index_def, require_indices: require_indices)
+          end
+        end
+        private
+        # Builds the `IndexExpression` for a single index definition.
+        #
+        # A definition that is not a rollover index template is searched as-is. For a rollover template,
+        # the time set implied by the filters on its timestamp field is extracted, and the known rollover
+        # indices whose time sets do not intersect it are excluded from the search expression. When
+        # `require_indices` is true, the method avoids excluding every known index.
+        def index_expression_for(filter_hashes, maybe_rollover_index_def, require_indices:)
+          unless maybe_rollover_index_def.rollover_index_template?
+            return IndexExpression.only(maybe_rollover_index_def.index_expression_for_search)
+          end
+          # @type var index_def: DatastoreCore::IndexDefinition::RolloverIndexTemplate
+          index_def = _ = maybe_rollover_index_def
+          time_set = @filter_value_set_extractor.extract_filter_value_set(filter_hashes, [index_def.timestamp_field_path])
+          if time_set.empty?
+            return require_indices ?
+              # Indices are required. Given the time set is empty, it's impossible for any documents to match our search.
+              # Therefore, which index we use here doesn't matter. We just pick the first one, alphabetically.
+              IndexExpression.only(index_def.known_related_query_rollover_indices.map(&:index_expression_for_search).min) :
+              # No indices are required, so we can return an empty index expression.
+              IndexExpression::EMPTY
+          end
+          indices_to_exclude = index_def.known_related_query_rollover_indices.reject do |index|
+            time_set.intersect?(index.time_set)
+          end
+          if require_indices && (index_def.known_related_query_rollover_indices - indices_to_exclude).empty?
+            # Indices are required, but all known indices have been excluded. We satisfy the requirement for an index by excluding one
+            # less index. This is preferable to the alternative ways to satisfy the requirement.
+            #
+            # - We could return an `IndexExpression` with no exclusions, but that would search across all indices, which is less efficient.
+            # - We could pick the first index to search (as we do for the `time_set.empty?` case), but that could cause matching documents
+            #   to be missed, because it's possible that matching documents exist in a just-created index that is not in
+            #   `known_related_query_rollover_indices`. Therefore, it's important that we still search the rollover wildcard expression,
+            #   and we want to exclude all but one of the known indices.
+            indices_to_exclude = indices_to_exclude.drop(1)
+          end
+          IndexExpression.new(
+            names_to_include: ::Set.new([index_def.index_expression_for_search]),
+            names_to_exclude: ::Set.new(indices_to_exclude.map(&:index_expression_for_search))
+          )
+        end
+        def date_string?(string)
+          /\A\d{4}-\d{2}-\d{2}\z/.match?(string)
+        end
+      end
+      class IndexExpression < ::Data.define(:names_to_include, :names_to_exclude)
+        EMPTY = new(names_to_include: ::Set.new, names_to_exclude: ::Set.new)
+        def self.only(name)
+          IndexExpression.new(names_to_include: ::Set.new([name].compact), names_to_exclude: ::Set.new)
+        end
+        def to_s
+          # Note: exclusions must come after inclusions. I can't find anything in the Elasticsearch or OpenSearch docs
+          # that mention this, but when exclusions come first I found that we got errors.
+          parts = names_to_include.sort + names_to_exclude.sort.map { |name| "-#{name}" }
+          parts.join(",")
+        end
+        def +(other)
+          with(
+            names_to_include: names_to_include.union(other.names_to_include),
+            names_to_exclude: names_to_exclude.union(other.names_to_exclude)
+          )
+        end
+      end
+      # `DatastoreQuery::IndexExpressionBuilder` exists only for use by `DatastoreQuery` and is effectively private.
+      private_constant :IndexExpressionBuilder
+      # Steep is complaining that it can't find some `DatastoreQuery` methods, but they are not defined in this file...
+      # @dynamic aggregations, shard_routing_values, search_index_definitions, merge_with, search_index_expression
+      # @dynamic with, to_datastore_msearch_header_and_body, document_paginator
+    end
+  end
+end