RubyGems - elasticgraph-graphql - Versions diffs - 0.18.0.0 - Mend

elasticgraph-graphql 0.18.0.0

Files changed (81) hide show

data/lib/elastic_graph/graphql/filtering/filter_interpreter.rb ADDED Viewed

@@ -0,0 +1,526 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+require "elastic_graph/constants"
+require "elastic_graph/graphql/filtering/boolean_query"
+require "elastic_graph/graphql/filtering/field_path"
+require "elastic_graph/graphql/filtering/range_query"
+require "elastic_graph/graphql/schema/enum_value"
+require "elastic_graph/support/graphql_formatter"
+require "elastic_graph/support/memoizable_data"
+require "elastic_graph/support/time_util"
+require "graphql"
+module ElasticGraph
+  class GraphQL
+    module Filtering
+      # Contains all query logic related to filtering. Not tested directly; tests drive the `Query` interface instead.
+      # For more info on how this works, see:
+      # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html
+      # https://www.elastic.co/blog/lost-in-translation-boolean-operations-and-filters-in-the-bool-query
+      FilterInterpreter = Support::MemoizableData.define(:runtime_metadata, :schema_names, :logger) do
+        # @implements FilterInterpreter
+        def initialize(runtime_metadata:, logger:)
+          super(
+            runtime_metadata: runtime_metadata,
+            schema_names: runtime_metadata.schema_element_names,
+            logger: logger
+          )
+        end
+        # Builds a datastore query from the given collection of filter hashes.
+        #
+        # Returns `nil` if there are no query clauses, to make it easy for a caller to `compact` out
+        # `query: {}` in a larger search request body.
+        #
+        # https://www.elastic.co/guide/en/elasticsearch/reference/8.11/query-dsl.html
+        def build_query(filter_hashes, from_field_path: FieldPath.empty)
+          build_bool_hash do |bool_node|
+            filter_hashes.each do |filter_hash|
+              process_filter_hash(bool_node, filter_hash, from_field_path)
+            end
+          end
+        end
+        def to_s
+          # The inspect/to_s output of `runtime_metadata` and `logger` can be quite large and noisy. We generally don't care about
+          # those details but want to be able to tell at a glance if two `FilterInterpreter` instances are equal or not--and, if they
+          # aren't equal, which part is responsible for the inequality.
+          #
+          # Using the hash of the two initialize args provides us with that.
+          "#<data #{FilterInterpreter.name} runtime_metadata=(hash: #{runtime_metadata.hash}) logger=(hash: #{logger.hash})>"
+        end
+        alias_method :inspect, :to_s
+        private
+        def process_filter_hash(bool_node, filter_hash, field_path)
+          filter_hash.each do |field_or_op, expression|
+            case identify_expression_type(field_or_op, expression)
+            when :empty
+              # This is an "empty" filter predicate and we can ignore it.
+            when :not
+              process_not_expression(bool_node, expression, field_path)
+            when :list_any_filter
+              process_list_any_filter_expression(bool_node, expression, field_path)
+            when :any_of
+              process_any_of_expression(bool_node, expression, field_path)
+            when :all_of
+              process_all_of_expression(bool_node, expression, field_path)
+            when :operator
+              process_operator_expression(bool_node, field_or_op, expression, field_path)
+            when :list_count
+              process_list_count_expression(bool_node, expression, field_path)
+            when :sub_field
+              process_sub_field_expression(bool_node, expression, field_path + field_or_op)
+            else
+              logger.warn("Ignoring unknown filtering operator (#{field_or_op}: #{expression.inspect}) on field `#{field_path.from_root.join(".")}`")
+            end
+          end
+        end
+        def identify_expression_type(field_or_op, expression)
+          return :empty if expression.nil?
+          return :not if field_or_op == schema_names.not
+          return :list_any_filter if field_or_op == schema_names.any_satisfy
+          return :all_of if field_or_op == schema_names.all_of
+          return :any_of if field_or_op == schema_names.any_of
+          return :operator if filter_operators.key?(field_or_op)
+          return :list_count if field_or_op == LIST_COUNTS_FIELD
+          return :sub_field if expression.is_a?(::Hash)
+          :unknown
+        end
+        # Indicates if the given `expression` applies filtering to subfields or just applies
+        # operators at the current field path.
+        def filters_on_sub_fields?(expression)
+          expression.any? do |field_or_op, sub_expression|
+            case identify_expression_type(field_or_op, sub_expression)
+            when :sub_field
+              true
+            when :not, :list_any_filter
+              filters_on_sub_fields?(sub_expression)
+            when :any_of, :all_of
+              # These are the only two cases where the `sub_expression` is an array of filter sub expressions,
+              # so we use `.any?` on it here. (Even for `all_of`--the overall `expression` filters on sub fields so
+              # long as at least one of the sub expressions does, regardless of it being `any_of` vs `all_of`).
+              sub_expression.any? { |expr| filters_on_sub_fields?(expr) }
+            else # :empty, :operator, :unknown, :list_count
+              false
+            end
+          end
+        end
+        def process_not_expression(bool_node, expression, field_path)
+          sub_filter = build_bool_hash do |inner_node|
+            process_filter_hash(inner_node, expression, field_path)
+          end
+          return unless sub_filter
+          # Prevent any negated filters from being unnecessarily double-negated by
+          # converting them to a positive filter (i.e., !!A == A).
+          if sub_filter[:bool].key?(:must_not)
+            # Pull clauses up to current bool_node to remove negation
+            sub_filter[:bool][:must_not].each do |negated_clause|
+              negated_clause[:bool].each { |k, v| bool_node[k].concat(v) }
+            end
+          end
+          # Don't drop any other filters! Let's negate them now.
+          other_filters = sub_filter[:bool].except(:must_not)
+          bool_node[:must_not] << {bool: other_filters} unless other_filters.empty?
+        end
+        # There are two cases for `any_satisfy`, each of which is handled differently:
+        #
+        # - List-of-scalars
+        # - List-of-nested-objects
+        #
+        # We can detect which it is by checking `filter` to see if it filters on any subfields.
+        # If so, we know the filter is being applied to a `nested` list field. We can count on
+        # this because we do not generate `any_satisfy` filters on `object` list fields (instead,
+        # they get generated on their leaf fields).
+        def process_list_any_filter_expression(bool_node, filter, field_path)
+          if filters_on_sub_fields?(filter)
+            process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path)
+          else
+            process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path)
+          end
+        end
+        def process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path)
+          sub_filter = build_bool_hash do |inner_node|
+            process_filter_hash(inner_node, filter, field_path.nested)
+          end
+          if sub_filter
+            bool_node[:filter] << {nested: {path: field_path.from_root.join("."), query: sub_filter}}
+          end
+        end
+        # On a list-of-leaf-values field, `any_satisfy` doesn't _do_ anything: it just expresses
+        # the fact that documents with any list element values matching the predicates will match
+        # the overall filter.
+        def process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path)
+          # Build the sub filter in isolation first; an empty sub filter adds nothing to `bool_node`.
+          return unless (processed = build_bool_hash { |node| process_filter_hash(node, filter, field_path) })
+          processed_bool_query = processed.fetch(:bool)
+          # The semantics we want for `any_satisfy` are that it matches when a value exists in the list that
+          # satisfies all of the provided subfilters. That's the semantics the datastore provides when the bool
+          # query only requires one clause to match, but if multiple clauses are required to match there's a subtle
+          # issue. A document matches so long as each required clause matches *some* value, but it doesn't require
+          # that they all match the *same* value. The list field on a document could contain N values, where
+          # each value matches a different one of the required clauses, and the document will be a search hit.
+          #
+          # Rather than behaving in a surprising way here, we'd rather disallow a filter that has multiple required
+          # clauses, so we return an error in this case.
+          if required_matching_clause_count(processed_bool_query) > 1
+            formatted_filter = Support::GraphQLFormatter.serialize(
+              {schema_names.any_satisfy => filter},
+              wrap_hash_with_braces: false
+            )
+            raise ::GraphQL::ExecutionError, "`#{formatted_filter}` is not supported because it produces " \
+              "multiple filtering clauses under `#{schema_names.any_satisfy}`, which doesn't work as expected. " \
+              "Remove one or more of your `#{schema_names.any_satisfy}` predicates and try again."
+          else
+            # Merge the processed clauses into `bool_node`, appending to any clause list that already
+            # exists under the same key.
+            # NOTE(review): the merge block is applied to every conflicting key, so if both sides ever
+            # carried `:minimum_should_match` the two integers would be added together -- confirm
+            # whether that combination can occur.
+            bool_node.update(processed_bool_query) do |_, existing_clauses, any_satisfy_clauses|
+              existing_clauses + any_satisfy_clauses
+            end
+          end
+        end
+        def process_any_of_expression(bool_node, expressions, field_path)
+          shoulds = expressions.filter_map do |expression|
+            build_bool_hash do |inner_bool_node|
+              process_filter_hash(inner_bool_node, expression, field_path)
+            end
+          end
+          # When our `shoulds` array is empty, the filtering semantics we want is to match no documents.
+          # However, that's not the behavior the datastore will give us if we have an empty array in the
+          # query under `should`. To get the behavior we want, we need to pass the datastore some filter
+          # criteria that will evaluate to false for every document.
+          bool_query = shoulds.empty? ? BooleanQuery::ALWAYS_FALSE_FILTER : BooleanQuery.should(*shoulds)
+          bool_query.merge_into(bool_node)
+        end
+        def process_all_of_expression(bool_node, expressions, field_path)
+          # `all_of` represents an AND. AND is the default way that `process_filter_hash` combines
+          # filters so we just have to call it for each sub-expression.
+          expressions.each do |sub_expression|
+            process_filter_hash(bool_node, sub_expression, field_path)
+          end
+        end
+        def process_operator_expression(bool_node, operator, expression, field_path)
+          # `operator` is a filtering operator, and `expression` is the value the filtering
+          # operator should be applied to. The `op_applicator` lambda, when called, will
+          # return a Clause instance (defined in this module).
+          bool_query = filter_operators.fetch(operator).call(field_path.from_root.join("."), expression)
+          bool_query&.merge_into(bool_node)
+        end
+        def process_sub_field_expression(bool_node, expression, field_path)
+          # `expression` is a hash of filters to apply to the field identified by `field_path`
+          # (callers pass a `field_path` that already points at the sub field). Normally we just
+          # recursively process the hash against the `bool_node` we are already operating on.
+          #
+          # However, if the hash has `any_of` in it, then we need to process the filter hash on
+          # a nested bool node instead of on the `bool_node` we are already operating on.
+          #
+          # To understand why, first consider a filter that has no `any_of` but does use field nesting:
+          #
+          # filter: {
+          #   weight: {lt: 2000},
+          #   cost: {
+          #     currency: {equal_to_any_of: ["USD"]},
+          #     amount: {gt: 1000}
+          #   }
+          # }
+          #
+          # While `currency` and `amount` are expressed as sub-filters under `cost` in our GraphQL
+          # syntax, we do not actually need to create a nested bool node structure for the datastore
+          # query. We get a flat filter structure like this:
+          #
+          # {bool: {filter: [
+          #   {range: {"weight": {lt: 2000}}},
+          #   {terms: {"cost.currency": ["USD"]}},
+          #   {range: {"cost.amount": {gt: 1000}}}
+          # ]}}
+          #
+          # The 3 filter conditions are ANDed together as a single list under `filter`.
+          # The nested field structure gets flattened using a dot-separated path.
+          #
+          # Now consider a filter that has multiple `any_of` sub-expressions:
+          #
+          # filter: {
+          #   weight: {any_of: [
+          #     {gt: 9000},
+          #     {lt: 2000}
+          #   ]},
+          #   cost: {any_of: [
+          #     {currency: {equal_to_any_of: ["USD"]}},
+          #     {amount: {gt: 1000}}
+          #   ]}
+          # }
+          #
+          # If we did not make a nested structure, we would wind up with a single list of sub-expressions
+          # that are OR'd together:
+          #
+          # {bool: {filter: [{bool: {should: [
+          #   {range: {"weight": {gt: 9000}}},
+          #   {range: {"weight": {lt: 2000}}},
+          #   {terms: {"cost.currency": ["USD"]}},
+          #   {range: {"cost.amount": {gt: 1000}}}
+          # ]}}]}}
+          #
+          # ...but that's clearly wrong. By creating a nested bool node based on the presence of `any_of`,
+          # we can instead produce a structure like this:
+          #
+          # {bool: {filter: [
+          #   {bool: {should: [
+          #     {range: {"weight": {gt: 9000}}},
+          #     {range: {"weight": {lt: 2000}}}
+          #   ]}},
+          #   {bool: {should: [
+          #     {terms: {"cost.currency": ["USD"]}},
+          #     {range: {"cost.amount": {gt: 1000}}}
+          #   ]}}
+          # ]}}
+          #
+          # ...which will actually work correctly.
+          if expression.key?(schema_names.any_of)
+            sub_filter = build_bool_hash do |inner_node|
+              process_filter_hash(inner_node, expression, field_path)
+            end
+            # `sub_filter` is nil when the expression produced no clauses; add nothing in that case.
+            bool_node[:filter] << sub_filter if sub_filter
+          else
+            process_filter_hash(bool_node, expression, field_path)
+          end
+        end
+        def process_list_count_expression(bool_node, expression, field_path)
+          # Normally, we don't have to do anything special for list count expressions.
+          # That's the case, for example, for an expression like:
+          #
+          # filter: {tags: {count: {gt: 2}}}
+          #
+          # However, if the count expression could match count of 0 (that is, if it doesn't
+          # exclude a count of zero), such as this:
+          #
+          # filter: {tags: {count: {lt: 1}}}
+          #
+          # ...then we need some special handling here. A count of 0 is equivalent to the list field not existing.
+          # While we index an explicit count of 0, the count field will be missing from documents indexed before
+          # the list field was defined on the ElasticGraph schema. To properly match those documents, we need to
+          # convert this into an OR (using `any_of`) to also match documents that lack the field entirely.
+          unless excludes_zero?(expression)
+            expression = {schema_names.any_of => [
+              expression,
+              {schema_names.equal_to_any_of => [nil]}
+            ]}
+          end
+          process_sub_field_expression(bool_node, expression, field_path.counts_path)
+        end
+        def build_bool_hash(&block)
+          bool_node = Hash.new { |h, k| h[k] = [] }.tap(&block)
+          # To ignore "empty" filter predicates we need to return `nil` here.
+          return nil if bool_node.empty?
+          # According to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html#bool-min-should-match,
+          # if the bool query includes at least one should clause and no must or filter clauses, the default value is 1. Otherwise, the default value is 0.
+          # However, we want should clauses to work with musts and filters, so we need to set it explicitly to 1 when we have should clauses.
+          bool_node[:minimum_should_match] = 1 if bool_node.key?(:should)
+          {bool: bool_node}
+        end
+        # Determines if the given filter expression excludes the value `0`.
+        def excludes_zero?(expression)
+          # One operator that rules out `0` is enough: the entries of `expression` are ANDed
+          # together, so the expression as a whole then cannot match a count of zero.
+          # NOTE(review): a `nil` operand would raise here (`nil.include?` / `nil <= 0`); this
+          # relies on callers not passing nil-valued predicates -- confirm.
+          expression.any? do |operator, operand|
+            case operator
+            when schema_names.equal_to_any_of then !operand.include?(0)
+            when schema_names.lt then operand <= 0
+            when schema_names.lte then operand < 0
+            when schema_names.gt then operand >= 0
+            when schema_names.gte then operand > 0
+            else
+              # :nocov: -- all operators are covered above. But simplecov complains about an implicit `else` branch being uncovered, so here we've defined it to wrap it with `:nocov:`.
+              false
+              # :nocov:
+            end
+          end
+        end
+        def filter_operators
+          @filter_operators ||= build_filter_operators(runtime_metadata)
+        end
+        # Builds the frozen table of supported filtering operators. Keys are operator names taken
+        # from the schema element names. Values are lambdas called with `(field_name, value)` that
+        # return a query object to merge into a bool node (the `time_of_day` lambda may return `nil`).
+        def build_filter_operators(runtime_metadata)
+          schema_names = runtime_metadata.schema_element_names
+          # Looked up once here (and raising if missing) rather than on every `time_of_day` filter.
+          filter_by_time_of_day_script_id = runtime_metadata
+            .static_script_ids_by_scoped_name
+            .fetch("filter/by_time_of_day")
+          {
+            schema_names.equal_to_any_of => ->(field_name, value) {
+              values = to_datastore_value(value.compact.uniq) # : ::Array[untyped]
+              equality_sub_expression =
+                if field_name == "id"
+                  # Use specialized "ids" query when querying on ID field.
+                  # See: https://www.elastic.co/guide/en/elasticsearch/reference/7.15/query-dsl-ids-query.html
+                  #
+                  # We reject empty strings because we otherwise get an error from the datastore:
+                  # "failed to create query: Ids can't be empty"
+                  {ids: {values: values - [""]}}
+                else
+                  {terms: {field_name => values}}
+                end
+              exists_sub_expression = {exists: {"field" => field_name}}
+              # Three cases, depending on how `nil` appears in the requested values:
+              # only nils => the field must not exist; some nils => equality OR not-exists;
+              # no nils (including an empty list) => plain equality.
+              if !value.empty? && value.all?(&:nil?)
+                BooleanQuery.new(:must_not, [{bool: {filter: [exists_sub_expression]}}])
+              elsif value.include?(nil)
+                BooleanQuery.filter({bool: {
+                  minimum_should_match: 1,
+                  should: [
+                    {bool: {filter: [equality_sub_expression]}},
+                    {bool: {must_not: [{bool: {filter: [exists_sub_expression]}}]}}
+                  ]
+                }})
+              else
+                BooleanQuery.filter(equality_sub_expression)
+              end
+            },
+            # The four comparison operators all map directly onto a range query.
+            schema_names.gt => ->(field_name, value) { RangeQuery.new(field_name, :gt, value) },
+            schema_names.gte => ->(field_name, value) { RangeQuery.new(field_name, :gte, value) },
+            schema_names.lt => ->(field_name, value) { RangeQuery.new(field_name, :lt, value) },
+            schema_names.lte => ->(field_name, value) { RangeQuery.new(field_name, :lte, value) },
+            # The text-matching operators below use `must` rather than `filter`.
+            schema_names.matches => ->(field_name, value) { BooleanQuery.must({match: {field_name => value}}) },
+            schema_names.matches_query => ->(field_name, value) do
+              allowed_edits_per_term = value.fetch(schema_names.allowed_edits_per_term).runtime_metadata.datastore_abbreviation
+              BooleanQuery.must(
+                {
+                  match: {
+                    field_name => {
+                      query: value.fetch(schema_names.query),
+                      # This is always a string field, even though the value is often an integer
+                      fuzziness: allowed_edits_per_term.to_s,
+                      operator: value[schema_names.require_all_terms] ? "AND" : "OR"
+                    }
+                  }
+                }
+              )
+            end,
+            schema_names.matches_phrase => ->(field_name, value) {
+              BooleanQuery.must(
+                {
+                  match_phrase_prefix: {
+                    field_name => {
+                      query: value.fetch(schema_names.phrase)
+                    }
+                  }
+                }
+              )
+            },
+            # This filter operator wraps a geo distance query:
+            # https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-geo-distance-query.html
+            schema_names.near => ->(field_name, value) do
+              unit_abbreviation = value.fetch(schema_names.unit).runtime_metadata.datastore_abbreviation
+              BooleanQuery.filter({geo_distance: {
+                "distance" => "#{value.fetch(schema_names.max_distance)}#{unit_abbreviation}",
+                field_name => {
+                  "lat" => value.fetch(schema_names.latitude),
+                  "lon" => value.fetch(schema_names.longitude)
+                }
+              }})
+            end,
+            schema_names.time_of_day => ->(field_name, value) do
+              # To filter on time of day, we use the `filter/by_time_of_day` script. We accomplish
+              # this with a script because Elasticsearch/OpenSearch do not support this natively, and it's
+              # incredibly hard to implement correctly with respect to time zones without using a
+              # script. We considered indexing the `time_of_day` as a separate index field
+              # that we could directly filter on, but since we need the time of day to be relative
+              # to a specific time zone, there's no way to make that work with the reality of
+              # daylight savings time. For example, the `America/Los_Angeles` time zone has a -07:00
+              # UTC offset for part of the year and a -08:00 UTC offset for the rest of the year.
+              # In a script we can use Java time zone APIs to handle this correctly.
+              params = {
+                field: field_name,
+                equal_to_any_of: list_of_nanos_of_day_from(value, schema_names.equal_to_any_of),
+                gt: nano_of_day_from(value, schema_names.gt),
+                gte: nano_of_day_from(value, schema_names.gte),
+                lt: nano_of_day_from(value, schema_names.lt),
+                lte: nano_of_day_from(value, schema_names.lte),
+                time_zone: value[schema_names.time_zone]
+              }.compact
+              # If there are no comparison operators, return `nil` instead of a query object so that we
+              # avoid invoking the script for no reason. Note that `field` and `time_zone` will always be in
+              # `params` so we can't just check for an empty hash here.
+              if (params.keys - [:field, :time_zone]).any?
+                BooleanQuery.filter({script: {script: {id: filter_by_time_of_day_script_id, params: params}}})
+              end
+            end
+          }.freeze
+        end
+        def to_datastore_value(value)
+          case value
+          when ::Array
+            value.map { |v| to_datastore_value(v) }
+          when Schema::EnumValue
+            value.name.to_s
+          else
+            value
+          end
+        end
+        def nano_of_day_from(value, field)
+          local_time = value[field]
+          Support::TimeUtil.nano_of_day_from_local_time(local_time) if local_time
+        end
+        def list_of_nanos_of_day_from(value, field)
+          value[field]&.map { |t| Support::TimeUtil.nano_of_day_from_local_time(t) }
+        end
+        # Counts how many clauses in `bool_query` are required to match for a document to be a search hit.
+        def required_matching_clause_count(bool_query)
+          bool_query.reduce(0) do |count, (occurrence, clauses)|
+            case occurrence
+            when :should
+              # The number of required matching clauses imposed by `:should` depends on the `:minimum_should_match` value.
+              # https://www.elastic.co/guide/en/elasticsearch/reference/8.9/query-dsl-bool-query.html#bool-min-should-match
+              bool_query.fetch(:minimum_should_match)
+            when :minimum_should_match
+              0 # doesn't have any clauses on its own, just controls how many `:should` clauses are required.
+            else
+              # For all other occurrences, each cluse must match.
+              clauses.size
+            end + count
+          end
+        end
+      end
+    end
+  end
+end

data/lib/elastic_graph/graphql/filtering/filter_value_set_extractor.rb ADDED Viewed

@@ -0,0 +1,148 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+module ElasticGraph
+  class GraphQL
+    module Filtering
+      # Responsible for extracting a set of values from query filters, based on using a custom
+      # set type that is able to efficiently model the "all values" case.
+      class FilterValueSetExtractor
+        def initialize(schema_names, all_values_set, &build_set_for_filter)
+          @schema_names = schema_names
+          @all_values_set = all_values_set
+          @build_set_for_filter = build_set_for_filter
+        end
+        # Given a list of `filter_hashes` and a list of `target_field_paths`, returns a representation
+        # of a set that includes all values that could be matched by the given filters.
+        #
+        # Essentially, this method guarantees that the following pseudo code is always satisfied:
+        #
+        # ``` ruby
+        # filter_value_set = extract_filter_value_set(filter_hashes, target_field_paths)
+        # Datastore.all_documents_matching(filter_hashes).each do |document|
+        #   target_field_paths.each do |field_path|
+        #     expect(filter_value_set).to include(document.value_at(field_path))
+        #   end
+        # end
+        # ```
+        def extract_filter_value_set(filter_hashes, target_field_paths)
+          # We union the filter values together in cases where we have multiple target field paths
+          # to make sure we cover all the values we need to. We generally do not have multiple
+          # `target_field_paths` except for specialized cases, such as when searching multiple
+          # indices in one query, where those indices are configured to use differing `routing_field_paths`.
+          # In such a situation we must use the set union of values. Remember: including additional
+          # routing values causes no adverse behavior (although it may introduce an inefficiency)
+          # but if we fail to route to a shard that contains a matching document, the search results
+          # will be incorrect.
+          map_reduce_sets(target_field_paths, :union, negate: false) do |target_field_path|
+            filter_value_set_for_target_field_path(target_field_path, filter_hashes)
+          end
+        end
+        private
+        # Determines a set of filter values for one of our `target_field_paths`,
+        # based on a list of `filter_hashes`.
+        def filter_value_set_for_target_field_path(target_field_path, filter_hashes)
+          # Pre-split the `target_field_path` to make it easy to compare as an array,
+          # since we build up the `traversed_field_path_parts` as an array as we recurse. We do this here
+          # outside the `map_reduce_sets` block below so we only do it once instead of N times.
+          target_field_path_parts = target_field_path.split(".")
+          # Here we intersect the filter value setbecause when we have multiple `filter_hashes`,
+          # the filters are ANDed together. Only documents that match ALL the filters will be
+          # returned. Therefore, we want the intersection of filter value sets.
+          map_reduce_sets(filter_hashes, :intersection, negate: false) do |filter_hash|
+            filter_value_set_for_filter_hash(filter_hash, target_field_path_parts, negate: false)
+          end
+        end
+        # Determines the set of filter values for one of our `target_field_paths` values and one
+        # `filter_hash` from a list of filter hashes. Note that this method is called recursively,
+        # with `traversed_field_path_parts` as an accumulator that accumulates the path to a nested
+        # field we are filtering on.
+        def filter_value_set_for_filter_hash(filter_hash, target_field_path_parts, traversed_field_path_parts = [], negate:)
+          # Here we intersect the filter value sets because when we have multiple entries in a filter hash,
+          # the filters are ANDed together. Only documents that match ALL the filters will be
+          # returned. Therefore, we want the intersection of filter value sets.
+          map_reduce_sets(filter_hash, :intersection, negate: negate) do |key, value|
+            filter_value_set_for_filter_hash_entry(key, value, target_field_path_parts, traversed_field_path_parts, negate: negate)
+          end
+        end
+        # Determines the set of filter values for one of our `target_field_paths` and one
+        # entry from one `filter_hash`. The key/value pair from a single entry is passed as the
+        # first two arguments. Depending on where we are at in recursing through the nested structure,
+        # the key could identify either a field we are filtering on or a filtering operator to apply
+        # to a particular field.
+        def filter_value_set_for_filter_hash_entry(field_or_op, filter_value, target_field_path_parts, traversed_field_path_parts, negate:)
+          if filter_value.nil?
+            # Any filter with a `nil` value is effectively ignored by our filtering logic, so we need
+            # to return our `@all_values_set` to indicate this filter matches all documents.
+            @all_values_set
+          elsif field_or_op == @schema_names.not
+            filter_value_set_for_filter_hash(filter_value, target_field_path_parts, traversed_field_path_parts, negate: !negate)
+          elsif filter_value.is_a?(::Hash)
+            # the only time `value` is a hash is when `field_or_op` is a field name.
+            # In that case, `value` is a hash of filters that apply to that field.
+            filter_value_set_for_filter_hash(filter_value, target_field_path_parts, traversed_field_path_parts + [field_or_op], negate: negate)
+          elsif field_or_op == @schema_names.any_of
+            filter_value_set_for_any_of(filter_value, target_field_path_parts, traversed_field_path_parts, negate: negate)
+          elsif target_field_path_parts == traversed_field_path_parts
+            set = filter_value_set_for_field_filter(field_or_op, filter_value)
+            negate ? set.negate : set
+          else
+            # Otherwise, we have no information in this clause to limit our filter value set.
+            @all_values_set
+          end
+        end
+        # Determines the set of filter values for an `any_of` clause, which is used for ORing multiple filters together.
+        def filter_value_set_for_any_of(filter_hashes, target_field_path_parts, traversed_field_path_parts, negate:)
+          # Here we union the filter value sets because `any_of` represents an OR. If we can determine specific
+          # filter values for all `any_of` clauses, we will OR them together. Alternately, if we cannot
+          # determine specific filter values for any clauses, we will union `@all_values_set`,
+          # which will result in a return value of `@all_values_set`. This is correct because if there
+          # is an `any_of` clause that does not match on the `target_field_path_parts` then the filter
+          # excludes no documents on the basis of the target filter.
+          map_reduce_sets(filter_hashes, :union, negate: negate) do |filter_hash|
+            filter_value_set_for_filter_hash(filter_hash, target_field_path_parts, traversed_field_path_parts, negate: negate)
+          end
+        end
+        # Determines the set of filter values for a single filter on a single field.
+        def filter_value_set_for_field_filter(filter_op, filter_value)
+          operator_name = @schema_names.canonical_name_for(filter_op)
+          @build_set_for_filter.call(operator_name, filter_value) || @all_values_set
+        end
+        # Maps over the provided `collection` by applying the given `map_transform`
+        # (which must transform a collection entry to an instance of our set representation), then reduces
+        # the resulting collection to a single set value. `reduction` will be either `:union` or `:intersection`.
+        #
+        # If the collection is empty, we return `@all_values_set` because it's the only "safe" value
+        # we can return. We don't have any information that would allow us to limit the set of filter
+        # values in any way.
+        def map_reduce_sets(collection, reduction, negate:, &map_transform)
+          return @all_values_set if collection.empty?
+          # In the case where `negate` is true (`not` is present somewhere in the filtering expression),
+          # we negate the reduction operator. Utilizing De Morgan’s Law (¬(A ∪ B) <-> (¬A) ∩ (¬B)),
+          # the negation of the union of two sets is the intersection of the negation of each set (the negation
+          # of each set is the difference between @all_values_set and the given set)--and vice versa.
+          reduction = REDUCTION_INVERSIONS.fetch(reduction) if negate
+          collection.map(&map_transform).reduce(reduction)
+        end
+        # Maps each set reduction to the one that replaces it under negation (see `map_reduce_sets`).
+        # Frozen so this shared lookup table cannot be mutated at runtime.
+        REDUCTION_INVERSIONS = {union: :intersection, intersection: :union}.freeze
+      end
+    end
+  end
+end