elasticgraph-graphql 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +3 -0
  4. data/elasticgraph-graphql.gemspec +23 -0
  5. data/lib/elastic_graph/graphql/aggregation/composite_grouping_adapter.rb +79 -0
  6. data/lib/elastic_graph/graphql/aggregation/computation.rb +39 -0
  7. data/lib/elastic_graph/graphql/aggregation/date_histogram_grouping.rb +83 -0
  8. data/lib/elastic_graph/graphql/aggregation/field_path_encoder.rb +47 -0
  9. data/lib/elastic_graph/graphql/aggregation/field_term_grouping.rb +26 -0
  10. data/lib/elastic_graph/graphql/aggregation/key.rb +87 -0
  11. data/lib/elastic_graph/graphql/aggregation/nested_sub_aggregation.rb +37 -0
  12. data/lib/elastic_graph/graphql/aggregation/non_composite_grouping_adapter.rb +129 -0
  13. data/lib/elastic_graph/graphql/aggregation/path_segment.rb +31 -0
  14. data/lib/elastic_graph/graphql/aggregation/query.rb +172 -0
  15. data/lib/elastic_graph/graphql/aggregation/query_adapter.rb +345 -0
  16. data/lib/elastic_graph/graphql/aggregation/query_optimizer.rb +187 -0
  17. data/lib/elastic_graph/graphql/aggregation/resolvers/aggregated_values.rb +41 -0
  18. data/lib/elastic_graph/graphql/aggregation/resolvers/count_detail.rb +44 -0
  19. data/lib/elastic_graph/graphql/aggregation/resolvers/grouped_by.rb +30 -0
  20. data/lib/elastic_graph/graphql/aggregation/resolvers/node.rb +64 -0
  21. data/lib/elastic_graph/graphql/aggregation/resolvers/relay_connection_builder.rb +83 -0
  22. data/lib/elastic_graph/graphql/aggregation/resolvers/sub_aggregations.rb +82 -0
  23. data/lib/elastic_graph/graphql/aggregation/script_term_grouping.rb +32 -0
  24. data/lib/elastic_graph/graphql/aggregation/term_grouping.rb +118 -0
  25. data/lib/elastic_graph/graphql/client.rb +43 -0
  26. data/lib/elastic_graph/graphql/config.rb +81 -0
  27. data/lib/elastic_graph/graphql/datastore_query/document_paginator.rb +100 -0
  28. data/lib/elastic_graph/graphql/datastore_query/index_expression_builder.rb +142 -0
  29. data/lib/elastic_graph/graphql/datastore_query/paginator.rb +199 -0
  30. data/lib/elastic_graph/graphql/datastore_query/routing_picker.rb +239 -0
  31. data/lib/elastic_graph/graphql/datastore_query.rb +372 -0
  32. data/lib/elastic_graph/graphql/datastore_response/document.rb +78 -0
  33. data/lib/elastic_graph/graphql/datastore_response/search_response.rb +79 -0
  34. data/lib/elastic_graph/graphql/datastore_search_router.rb +151 -0
  35. data/lib/elastic_graph/graphql/decoded_cursor.rb +120 -0
  36. data/lib/elastic_graph/graphql/filtering/boolean_query.rb +45 -0
  37. data/lib/elastic_graph/graphql/filtering/field_path.rb +81 -0
  38. data/lib/elastic_graph/graphql/filtering/filter_args_translator.rb +58 -0
  39. data/lib/elastic_graph/graphql/filtering/filter_interpreter.rb +526 -0
  40. data/lib/elastic_graph/graphql/filtering/filter_value_set_extractor.rb +148 -0
  41. data/lib/elastic_graph/graphql/filtering/range_query.rb +56 -0
  42. data/lib/elastic_graph/graphql/http_endpoint.rb +229 -0
  43. data/lib/elastic_graph/graphql/monkey_patches/schema_field.rb +56 -0
  44. data/lib/elastic_graph/graphql/monkey_patches/schema_object.rb +48 -0
  45. data/lib/elastic_graph/graphql/query_adapter/filters.rb +161 -0
  46. data/lib/elastic_graph/graphql/query_adapter/pagination.rb +27 -0
  47. data/lib/elastic_graph/graphql/query_adapter/requested_fields.rb +124 -0
  48. data/lib/elastic_graph/graphql/query_adapter/sort.rb +32 -0
  49. data/lib/elastic_graph/graphql/query_details_tracker.rb +60 -0
  50. data/lib/elastic_graph/graphql/query_executor.rb +200 -0
  51. data/lib/elastic_graph/graphql/resolvers/get_record_field_value.rb +49 -0
  52. data/lib/elastic_graph/graphql/resolvers/graphql_adapter.rb +114 -0
  53. data/lib/elastic_graph/graphql/resolvers/list_records.rb +29 -0
  54. data/lib/elastic_graph/graphql/resolvers/nested_relationships.rb +74 -0
  55. data/lib/elastic_graph/graphql/resolvers/query_adapter.rb +85 -0
  56. data/lib/elastic_graph/graphql/resolvers/query_source.rb +46 -0
  57. data/lib/elastic_graph/graphql/resolvers/relay_connection/array_adapter.rb +71 -0
  58. data/lib/elastic_graph/graphql/resolvers/relay_connection/generic_adapter.rb +65 -0
  59. data/lib/elastic_graph/graphql/resolvers/relay_connection/page_info.rb +82 -0
  60. data/lib/elastic_graph/graphql/resolvers/relay_connection/search_response_adapter_builder.rb +40 -0
  61. data/lib/elastic_graph/graphql/resolvers/relay_connection.rb +42 -0
  62. data/lib/elastic_graph/graphql/resolvers/resolvable_value.rb +56 -0
  63. data/lib/elastic_graph/graphql/scalar_coercion_adapters/cursor.rb +35 -0
  64. data/lib/elastic_graph/graphql/scalar_coercion_adapters/date.rb +64 -0
  65. data/lib/elastic_graph/graphql/scalar_coercion_adapters/date_time.rb +60 -0
  66. data/lib/elastic_graph/graphql/scalar_coercion_adapters/local_time.rb +30 -0
  67. data/lib/elastic_graph/graphql/scalar_coercion_adapters/longs.rb +47 -0
  68. data/lib/elastic_graph/graphql/scalar_coercion_adapters/no_op.rb +24 -0
  69. data/lib/elastic_graph/graphql/scalar_coercion_adapters/time_zone.rb +44 -0
  70. data/lib/elastic_graph/graphql/scalar_coercion_adapters/untyped.rb +32 -0
  71. data/lib/elastic_graph/graphql/scalar_coercion_adapters/valid_time_zones.rb +634 -0
  72. data/lib/elastic_graph/graphql/schema/arguments.rb +78 -0
  73. data/lib/elastic_graph/graphql/schema/enum_value.rb +30 -0
  74. data/lib/elastic_graph/graphql/schema/field.rb +147 -0
  75. data/lib/elastic_graph/graphql/schema/relation_join.rb +103 -0
  76. data/lib/elastic_graph/graphql/schema/type.rb +263 -0
  77. data/lib/elastic_graph/graphql/schema.rb +164 -0
  78. data/lib/elastic_graph/graphql.rb +253 -0
  79. data/script/dump_time_zones +81 -0
  80. data/script/dump_time_zones.java +17 -0
  81. metadata +503 -0
@@ -0,0 +1,526 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/constants"
10
+ require "elastic_graph/graphql/filtering/boolean_query"
11
+ require "elastic_graph/graphql/filtering/field_path"
12
+ require "elastic_graph/graphql/filtering/range_query"
13
+ require "elastic_graph/graphql/schema/enum_value"
14
+ require "elastic_graph/support/graphql_formatter"
15
+ require "elastic_graph/support/memoizable_data"
16
+ require "elastic_graph/support/time_util"
17
+ require "graphql"
18
+
19
+ module ElasticGraph
20
+ class GraphQL
21
+ module Filtering
22
+ # Contains all query logic related to filtering. Not tested directly; tests drive the `Query` interface instead.
23
+ # For more info on how this works, see:
24
+ # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html
25
+ # https://www.elastic.co/blog/lost-in-translation-boolean-operations-and-filters-in-the-bool-query
26
+ FilterInterpreter = Support::MemoizableData.define(:runtime_metadata, :schema_names, :logger) do
27
+ # @implements FilterInterpreter
28
+
29
+ def initialize(runtime_metadata:, logger:)
30
+ super(
31
+ runtime_metadata: runtime_metadata,
32
+ schema_names: runtime_metadata.schema_element_names,
33
+ logger: logger
34
+ )
35
+ end
36
+
37
+ # Builds a datastore query from the given collection of filter hashes.
38
+ #
39
+ # Returns `nil` if there are no query clauses, to make it easy for a caller to `compact` out
40
+ # `query: {}` in a larger search request body.
41
+ #
42
+ # https://www.elastic.co/guide/en/elasticsearch/reference/8.11/query-dsl.html
43
+ def build_query(filter_hashes, from_field_path: FieldPath.empty)
44
+ build_bool_hash do |bool_node|
45
+ filter_hashes.each do |filter_hash|
46
+ process_filter_hash(bool_node, filter_hash, from_field_path)
47
+ end
48
+ end
49
+ end
50
+
51
+ def to_s
52
+ # The inspect/to_s output of `runtime_metadata` and `logger` can be quite large and noisy. We generally don't care about
53
+ # those details but want to be able to tell at a glance if two `FilterInterpreter` instances are equal or not--and, if they
54
+ # aren't equal, which part is responsible for the inequality.
55
+ #
56
+ # Using the hash of the two initialize args provides us with that.
57
+ "#<data #{FilterInterpreter.name} runtime_metadata=(hash: #{runtime_metadata.hash}) logger=(hash: #{logger.hash})>"
58
+ end
59
+ alias_method :inspect, :to_s
60
+
61
+ private
62
+
63
+ def process_filter_hash(bool_node, filter_hash, field_path)
64
+ filter_hash.each do |field_or_op, expression|
65
+ case identify_expression_type(field_or_op, expression)
66
+ when :empty
67
+ # This is an "empty" filter predicate and we can ignore it.
68
+ when :not
69
+ process_not_expression(bool_node, expression, field_path)
70
+ when :list_any_filter
71
+ process_list_any_filter_expression(bool_node, expression, field_path)
72
+ when :any_of
73
+ process_any_of_expression(bool_node, expression, field_path)
74
+ when :all_of
75
+ process_all_of_expression(bool_node, expression, field_path)
76
+ when :operator
77
+ process_operator_expression(bool_node, field_or_op, expression, field_path)
78
+ when :list_count
79
+ process_list_count_expression(bool_node, expression, field_path)
80
+ when :sub_field
81
+ process_sub_field_expression(bool_node, expression, field_path + field_or_op)
82
+ else
83
+ logger.warn("Ignoring unknown filtering operator (#{field_or_op}: #{expression.inspect}) on field `#{field_path.from_root.join(".")}`")
84
+ end
85
+ end
86
+ end
87
+
88
+ def identify_expression_type(field_or_op, expression)
89
+ return :empty if expression.nil?
90
+ return :not if field_or_op == schema_names.not
91
+ return :list_any_filter if field_or_op == schema_names.any_satisfy
92
+ return :all_of if field_or_op == schema_names.all_of
93
+ return :any_of if field_or_op == schema_names.any_of
94
+ return :operator if filter_operators.key?(field_or_op)
95
+ return :list_count if field_or_op == LIST_COUNTS_FIELD
96
+ return :sub_field if expression.is_a?(::Hash)
97
+ :unknown
98
+ end
99
+
100
+ # Indicates if the given `expression` applies filtering to subfields or just applies
101
+ # operators at the current field path.
102
+ def filters_on_sub_fields?(expression)
103
+ expression.any? do |field_or_op, sub_expression|
104
+ case identify_expression_type(field_or_op, sub_expression)
105
+ when :sub_field
106
+ true
107
+ when :not, :list_any_filter
108
+ filters_on_sub_fields?(sub_expression)
109
+ when :any_of, :all_of
110
+ # These are the only two cases where the `sub_expression` is an array of filter sub expressions,
111
+ # so we use `.any?` on it here. (Even for `all_of`--the overall `expression` filters on sub fields so
112
+ # long as at least one of the sub expressions does, regardless of it being `any_of` vs `all_of`).
113
+ sub_expression.any? { |expr| filters_on_sub_fields?(expr) }
114
+ else # :empty, :operator, :unknown, :list_count
115
+ false
116
+ end
117
+ end
118
+ end
119
+
120
+ def process_not_expression(bool_node, expression, field_path)
121
+ sub_filter = build_bool_hash do |inner_node|
122
+ process_filter_hash(inner_node, expression, field_path)
123
+ end
124
+
125
+ return unless sub_filter
126
+
127
+ # Prevent any negated filters from being unnecessarily double-negated by
128
+ # converting them to a positive filter (i.e., !!A == A).
129
+ if sub_filter[:bool].key?(:must_not)
130
+ # Pull clauses up to current bool_node to remove negation
131
+ sub_filter[:bool][:must_not].each do |negated_clause|
132
+ negated_clause[:bool].each { |k, v| bool_node[k].concat(v) }
133
+ end
134
+ end
135
+
136
+ # Don't drop any other filters! Let's negate them now.
137
+ other_filters = sub_filter[:bool].except(:must_not)
138
+ bool_node[:must_not] << {bool: other_filters} unless other_filters.empty?
139
+ end
140
+
141
+ # There are two cases for `any_satisfy`, each of which is handled differently:
142
+ #
143
+ # - List-of-scalars
144
+ # - List-of-nested-objects
145
+ #
146
+ # We can detect which it is by checking `filter` to see if it filters on any subfields.
147
+ # If so, we know the filter is being applied to a `nested` list field. We can count on
148
+ # this because we do not generate `any_satisfy` filters on `object` list fields (instead,
149
+ # they get generated on their leaf fields).
150
+ def process_list_any_filter_expression(bool_node, filter, field_path)
151
+ if filters_on_sub_fields?(filter)
152
+ process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path)
153
+ else
154
+ process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path)
155
+ end
156
+ end
157
+
158
+ def process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path)
159
+ sub_filter = build_bool_hash do |inner_node|
160
+ process_filter_hash(inner_node, filter, field_path.nested)
161
+ end
162
+
163
+ if sub_filter
164
+ bool_node[:filter] << {nested: {path: field_path.from_root.join("."), query: sub_filter}}
165
+ end
166
+ end
167
+
168
+ # On a list-of-leaf-values field, `any_satisfy` doesn't _do_ anything: it just expresses
169
+ # the fact that documents with any list element values matching the predicates will match
170
+ # the overall filter.
171
+ def process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path)
172
+ return unless (processed = build_bool_hash { |node| process_filter_hash(node, filter, field_path) })
173
+
174
+ processed_bool_query = processed.fetch(:bool)
175
+
176
+ # The semantics we want for `any_satisfy` are that it matches when a value exists in the list that
177
+ # satisfies all of the provided subfilters. That's the semantics the datastore provides when the bool
178
+ # query only requires one clause to match, but if multiple clauses are required to match there's a subtle
179
+ # issue. A document matches so long as each required clause matches *some* value, but it doesn't require
180
+ # that they all match the *same* value. The list field on a document could contain N values, where
181
+ # each value matches a different one of the required clauses, and the document will be a search hit.
182
+ #
183
+ # Rather than behaving in a surprising way here, we'd rather disallow a filter that has multiple required
184
+ # clauses, so we return an error in this case.
185
+ if required_matching_clause_count(processed_bool_query) > 1
186
+ formatted_filter = Support::GraphQLFormatter.serialize(
187
+ {schema_names.any_satisfy => filter},
188
+ wrap_hash_with_braces: false
189
+ )
190
+
191
+ raise ::GraphQL::ExecutionError, "`#{formatted_filter}` is not supported because it produces " \
192
+ "multiple filtering clauses under `#{schema_names.any_satisfy}`, which doesn't work as expected. " \
193
+ "Remove one or more of your `#{schema_names.any_satisfy}` predicates and try again."
194
+ else
195
+ bool_node.update(processed_bool_query) do |_, existing_clauses, any_satisfy_clauses|
196
+ existing_clauses + any_satisfy_clauses
197
+ end
198
+ end
199
+ end
200
+
201
+ def process_any_of_expression(bool_node, expressions, field_path)
202
+ shoulds = expressions.filter_map do |expression|
203
+ build_bool_hash do |inner_bool_node|
204
+ process_filter_hash(inner_bool_node, expression, field_path)
205
+ end
206
+ end
207
+
208
+ # When our `shoulds` array is empty, the filtering semantics we want is to match no documents.
209
+ # However, that's not the behavior the datastore will give us if we have an empty array in the
210
+ # query under `should`. To get the behavior we want, we need to pass the datastore some filter
211
+ # criteria that will evaluate to false for every document.
212
+ bool_query = shoulds.empty? ? BooleanQuery::ALWAYS_FALSE_FILTER : BooleanQuery.should(*shoulds)
213
+ bool_query.merge_into(bool_node)
214
+ end
215
+
216
+ def process_all_of_expression(bool_node, expressions, field_path)
217
+ # `all_of` represents an AND. AND is the default way that `process_filter_hash` combines
218
+ # filters so we just have to call it for each sub-expression.
219
+ expressions.each do |sub_expression|
220
+ process_filter_hash(bool_node, sub_expression, field_path)
221
+ end
222
+ end
223
+
224
+ def process_operator_expression(bool_node, operator, expression, field_path)
225
+ # `operator` is a filtering operator, and `expression` is the value the filtering
226
+ # operator should be applied to. The lambda registered for `operator` in `filter_operators`, when called,
227
+ # returns a query object (such as a `BooleanQuery` or `RangeQuery`) to merge into `bool_node`, or `nil`.
228
+ bool_query = filter_operators.fetch(operator).call(field_path.from_root.join("."), expression)
229
+ bool_query&.merge_into(bool_node)
230
+ end
231
+
232
+ def process_sub_field_expression(bool_node, expression, field_path)
233
+ # `expression` is a hash of filters to apply to the sub-field identified by `field_path`.
234
+ # The caller has already extended `field_path` for that field, so we just recursively process the hash.
235
+ #
236
+ # However, if the hash has `any_of` in it, then we need to process the filter hash on
237
+ # a nested bool node instead of on the `bool_node` we are already operating on.
238
+ #
239
+ # To understand why, first consider a filter that has no `any_of` but does use field nesting:
240
+ #
241
+ # filter: {
242
+ # weight: {lt: 2000},
243
+ # cost: {
244
+ # currency: {equal_to_any_of: ["USD"]}
245
+ # amount: {gt: 1000}
246
+ # }
247
+ # }
248
+ #
249
+ # While `currency` and `amount` are expressed as sub-filters under `cost` in our GraphQL
250
+ # syntax, we do not actually need to create a nested bool node structure for the datastore
251
+ # query. We get a flat filter structure like this:
252
+ #
253
+ # {bool: {filter: [
254
+ # {range: {"weight": {lt: 2000}}},
255
+ # {terms: {"cost.currency": ["USD"]}},
256
+ # {range: {"cost.amount": {gt: 1000}}}
257
+ # ]}}
258
+ #
259
+ # The 3 filter conditions are ANDed together as a single list under `filter`.
260
+ # The nested field structure gets flattened using a dot-separated path.
261
+ #
262
+ # Now consider a filter that has multiple `any_of` sub-expressions:
263
+ #
264
+ # filter: {
265
+ # weight: {any_of: [
266
+ # {gt: 9000},
267
+ # {lt: 2000}
268
+ # ]},
269
+ # cost: {any_of: [
270
+ # {currency: {equal_to_any_of: ["USD"]}},
271
+ # {amount: {gt: 1000}}
272
+ # ]}
273
+ # }
274
+ #
275
+ # If we did not make a nested structure, we would wind up with a single list of sub-expressions
276
+ # that are OR'd together:
277
+ #
278
+ # {bool: {filter: [{bool: {should: [
279
+ # {range: {"weight": {gt: 9000}}},
280
+ # {range: {"weight": {lt: 2000}}},
281
+ # {terms: {"cost.currency": ["USD"]}},
282
+ # {range: {"cost.amount": {gt: 1000}}}
283
+ # ]}}]}}
284
+ #
285
+ # ...but that's clearly wrong. By creating a nested bool node based on the presence of `any_of`,
286
+ # we can instead produce a structure like this:
287
+ #
288
+ # {bool: {filter: [
289
+ # {bool: {should: [
290
+ # {range: {"weight": {gt: 9000}}},
291
+ # {range: {"weight": {lt: 2000}}}
292
+ # ]}},
293
+ # {bool: {should: [
294
+ # {terms: {"cost.currency": ["USD"]}},
295
+ # {range: {"cost.amount": {gt: 1000}}}
296
+ # ]}}
297
+ # ]}}
298
+ #
299
+ # ...which will actually work correctly.
300
+ if expression.key?(schema_names.any_of)
301
+ sub_filter = build_bool_hash do |inner_node|
302
+ process_filter_hash(inner_node, expression, field_path)
303
+ end
304
+
305
+ bool_node[:filter] << sub_filter if sub_filter
306
+ else
307
+ process_filter_hash(bool_node, expression, field_path)
308
+ end
309
+ end
310
+
311
+ def process_list_count_expression(bool_node, expression, field_path)
312
+ # Normally, we don't have to do anything special for list count expressions.
313
+ # That's the case, for example, for an expression like:
314
+ #
315
+ # filter: {tags: {count: {gt: 2}}}
316
+ #
317
+ # However, if the count expression could match a count of 0 (that is, if it doesn't
318
+ # exclude a count of zero), such as this:
319
+ #
320
+ # filter: {tags: {count: {lt: 1}}}
321
+ #
322
+ # ...then we need some special handling here. A count of 0 is equivalent to the list field not existing.
323
+ # While we index an explicit count of 0, the count field will be missing from documents indexed before
324
+ # the list field was defined on the ElasticGraph schema. To properly match those documents, we need to
325
+ # convert this into an OR (using `any_of`) to also match documents that lack the field entirely.
326
+ unless excludes_zero?(expression)
327
+ expression = {schema_names.any_of => [
328
+ expression,
329
+ {schema_names.equal_to_any_of => [nil]}
330
+ ]}
331
+ end
332
+
333
+ process_sub_field_expression(bool_node, expression, field_path.counts_path)
334
+ end
335
+
336
+ def build_bool_hash(&block)
337
+ bool_node = Hash.new { |h, k| h[k] = [] }.tap(&block)
338
+
339
+ # To ignore "empty" filter predicates we need to return `nil` here.
340
+ return nil if bool_node.empty?
341
+
342
+ # According to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html#bool-min-should-match,
343
+ # if the bool query includes at least one should clause and no must or filter clauses, the default value is 1. Otherwise, the default value is 0.
344
+ # However, we want should clauses to work with musts and filters, so we need to set it explicitly to 1 when we have should clauses.
345
+ bool_node[:minimum_should_match] = 1 if bool_node.key?(:should)
346
+
347
+ {bool: bool_node}
348
+ end
349
+
350
+ # Determines if the given filter expression excludes the value `0`.
351
+ def excludes_zero?(expression)
352
+ expression.any? do |operator, operand|
353
+ case operator
354
+ when schema_names.equal_to_any_of then !operand.include?(0)
355
+ when schema_names.lt then operand <= 0
356
+ when schema_names.lte then operand < 0
357
+ when schema_names.gt then operand >= 0
358
+ when schema_names.gte then operand > 0
359
+ else
360
+ # :nocov: -- all operators are covered above. But simplecov complains about an implicit `else` branch being uncovered, so here we've defined it to wrap it with `:nocov:`.
361
+ false
362
+ # :nocov:
363
+ end
364
+ end
365
+ end
366
+
367
+ def filter_operators
368
+ @filter_operators ||= build_filter_operators(runtime_metadata)
369
+ end
370
+
371
+ def build_filter_operators(runtime_metadata)
372
+ schema_names = runtime_metadata.schema_element_names
373
+
374
+ filter_by_time_of_day_script_id = runtime_metadata
375
+ .static_script_ids_by_scoped_name
376
+ .fetch("filter/by_time_of_day")
377
+
378
+ {
379
+ schema_names.equal_to_any_of => ->(field_name, value) {
380
+ values = to_datastore_value(value.compact.uniq) # : ::Array[untyped]
381
+
382
+ equality_sub_expression =
383
+ if field_name == "id"
384
+ # Use specialized "ids" query when querying on ID field.
385
+ # See: https://www.elastic.co/guide/en/elasticsearch/reference/7.15/query-dsl-ids-query.html
386
+ #
387
+ # We reject empty strings because we otherwise get an error from the datastore:
388
+ # "failed to create query: Ids can't be empty"
389
+ {ids: {values: values - [""]}}
390
+ else
391
+ {terms: {field_name => values}}
392
+ end
393
+
394
+ exists_sub_expression = {exists: {"field" => field_name}}
395
+
396
+ if !value.empty? && value.all?(&:nil?)
397
+ BooleanQuery.new(:must_not, [{bool: {filter: [exists_sub_expression]}}])
398
+ elsif value.include?(nil)
399
+ BooleanQuery.filter({bool: {
400
+ minimum_should_match: 1,
401
+ should: [
402
+ {bool: {filter: [equality_sub_expression]}},
403
+ {bool: {must_not: [{bool: {filter: [exists_sub_expression]}}]}}
404
+ ]
405
+ }})
406
+ else
407
+ BooleanQuery.filter(equality_sub_expression)
408
+ end
409
+ },
410
+ schema_names.gt => ->(field_name, value) { RangeQuery.new(field_name, :gt, value) },
411
+ schema_names.gte => ->(field_name, value) { RangeQuery.new(field_name, :gte, value) },
412
+ schema_names.lt => ->(field_name, value) { RangeQuery.new(field_name, :lt, value) },
413
+ schema_names.lte => ->(field_name, value) { RangeQuery.new(field_name, :lte, value) },
414
+ schema_names.matches => ->(field_name, value) { BooleanQuery.must({match: {field_name => value}}) },
415
+ schema_names.matches_query => ->(field_name, value) do
416
+ allowed_edits_per_term = value.fetch(schema_names.allowed_edits_per_term).runtime_metadata.datastore_abbreviation
417
+
418
+ BooleanQuery.must(
419
+ {
420
+ match: {
421
+ field_name => {
422
+ query: value.fetch(schema_names.query),
423
+ # This is always a string field, even though the value is often an integer
424
+ fuzziness: allowed_edits_per_term.to_s,
425
+ operator: value[schema_names.require_all_terms] ? "AND" : "OR"
426
+ }
427
+ }
428
+ }
429
+ )
430
+ end,
431
+ schema_names.matches_phrase => ->(field_name, value) {
432
+ BooleanQuery.must(
433
+ {
434
+ match_phrase_prefix: {
435
+ field_name => {
436
+ query: value.fetch(schema_names.phrase)
437
+ }
438
+ }
439
+ }
440
+ )
441
+ },
442
+
443
+ # This filter operator wraps a geo distance query:
444
+ # https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-geo-distance-query.html
445
+ schema_names.near => ->(field_name, value) do
446
+ unit_abbreviation = value.fetch(schema_names.unit).runtime_metadata.datastore_abbreviation
447
+
448
+ BooleanQuery.filter({geo_distance: {
449
+ "distance" => "#{value.fetch(schema_names.max_distance)}#{unit_abbreviation}",
450
+ field_name => {
451
+ "lat" => value.fetch(schema_names.latitude),
452
+ "lon" => value.fetch(schema_names.longitude)
453
+ }
454
+ }})
455
+ end,
456
+
457
+ schema_names.time_of_day => ->(field_name, value) do
458
+ # To filter on time of day, we use the `filter/by_time_of_day` script. We accomplish
459
+ # this with a script because Elasticsearch/OpenSearch do not support this natively, and it's
460
+ # incredibly hard to implement correctly with respect to time zones without using a
461
+ # script. We considered indexing the `time_of_day` as a separate index field
462
+ # that we could directly filter on, but since we need the time of day to be relative
463
+ # to a specific time zone, there's no way to make that work with the reality of
464
+ # daylight savings time. For example, the `America/Los_Angeles` time zone has a -07:00
465
+ # UTC offset for part of the year and a -08:00 UTC offset for
466
+ # part of the year. In a script we can use Java time zone APIs to handle this correctly.
467
+ params = {
468
+ field: field_name,
469
+ equal_to_any_of: list_of_nanos_of_day_from(value, schema_names.equal_to_any_of),
470
+ gt: nano_of_day_from(value, schema_names.gt),
471
+ gte: nano_of_day_from(value, schema_names.gte),
472
+ lt: nano_of_day_from(value, schema_names.lt),
473
+ lte: nano_of_day_from(value, schema_names.lte),
474
+ time_zone: value[schema_names.time_zone]
475
+ }.compact
476
+
477
+ # If there are no comparison operators, return `nil` instead of a `BooleanQuery` so that we avoid
478
+ # invoking the script for no reason. Note that `field` and `time_zone` will always be in
479
+ # `params` so we can't just check for an empty hash here.
480
+ if (params.keys - [:field, :time_zone]).any?
481
+ BooleanQuery.filter({script: {script: {id: filter_by_time_of_day_script_id, params: params}}})
482
+ end
483
+ end
484
+ }.freeze
485
+ end
486
+
487
+ def to_datastore_value(value)
488
+ case value
489
+ when ::Array
490
+ value.map { |v| to_datastore_value(v) }
491
+ when Schema::EnumValue
492
+ value.name.to_s
493
+ else
494
+ value
495
+ end
496
+ end
497
+
498
+ def nano_of_day_from(value, field)
499
+ local_time = value[field]
500
+ Support::TimeUtil.nano_of_day_from_local_time(local_time) if local_time
501
+ end
502
+
503
+ def list_of_nanos_of_day_from(value, field)
504
+ value[field]&.map { |t| Support::TimeUtil.nano_of_day_from_local_time(t) }
505
+ end
506
+
507
+ # Counts how many clauses in `bool_query` are required to match for a document to be a search hit.
508
+ def required_matching_clause_count(bool_query)
509
+ bool_query.reduce(0) do |count, (occurrence, clauses)|
510
+ case occurrence
511
+ when :should
512
+ # The number of required matching clauses imposed by `:should` depends on the `:minimum_should_match` value.
513
+ # https://www.elastic.co/guide/en/elasticsearch/reference/8.9/query-dsl-bool-query.html#bool-min-should-match
514
+ bool_query.fetch(:minimum_should_match)
515
+ when :minimum_should_match
516
+ 0 # doesn't have any clauses on its own, just controls how many `:should` clauses are required.
517
+ else
518
+ # For all other occurrences, each clause must match.
519
+ clauses.size
520
+ end + count
521
+ end
522
+ end
523
+ end
524
+ end
525
+ end
526
+ end
@@ -0,0 +1,148 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ module ElasticGraph
10
+ class GraphQL
11
+ module Filtering
12
# Responsible for extracting a set of values from query filters, based on using a custom
# set type that is able to efficiently model the "all values" case.
class FilterValueSetExtractor
  # Maps each set reduction to the reduction that replaces it when the surrounding expression
  # is negated (De Morgan's laws). Frozen so this shared lookup table cannot be mutated.
  REDUCTION_INVERSIONS = {union: :intersection, intersection: :union}.freeze

  # `schema_names` is consulted for the `not` and `any_of` operator names and for
  # `canonical_name_for(operator)`.
  #
  # `all_values_set` is the set returned whenever a filter gives us no information that
  # could limit the values. Sets are combined by sending them `union` / `intersection`
  # and are negated with `negate`.
  #
  # The block is called with `(canonical_operator_name, filter_value)` and returns a set for
  # that filter, or a falsy value when no set can be built (in which case `all_values_set` is used).
  def initialize(schema_names, all_values_set, &build_set_for_filter)
    @schema_names = schema_names
    @all_values_set = all_values_set
    @build_set_for_filter = build_set_for_filter
  end

  # Given a list of `filter_hashes` and a list of `target_field_paths`, returns a representation
  # of a set that includes all values that could be matched by the given filters.
  #
  # Essentially, this method guarantees that the following pseudo code is always satisfied:
  #
  # ``` ruby
  # filter_value_set = extract_filter_value_set(filter_hashes, target_field_paths)
  # Datastore.all_documents_matching(filter_hashes).each do |document|
  #   target_field_paths.each do |field_path|
  #     expect(filter_value_set).to include(document.value_at(field_path))
  #   end
  # end
  # ```
  def extract_filter_value_set(filter_hashes, target_field_paths)
    # We union the filter values together in cases where we have multiple target field paths
    # to make sure we cover all the values we need to. We generally do not have multiple
    # `target_field_paths` except for specialized cases, such as when searching multiple
    # indices in one query, where those indices are configured to use differing `routing_field_paths`.
    # In such a situation we must use the set union of values. Remember: including additional
    # routing values causes no adverse behavior (although it may introduce an inefficiency)
    # but if we fail to route to a shard that contains a matching document, the search results
    # will be incorrect.
    map_reduce_sets(target_field_paths, :union, negate: false) do |target_field_path|
      filter_value_set_for_target_field_path(target_field_path, filter_hashes)
    end
  end

  private

  # Determines a set of filter values for one of our `target_field_paths`,
  # based on a list of `filter_hashes`.
  def filter_value_set_for_target_field_path(target_field_path, filter_hashes)
    # Pre-split the `target_field_path` to make it easy to compare as an array,
    # since we build up the `traversed_field_path_parts` as an array as we recurse. We do this here
    # outside the `map_reduce_sets` block below so we only do it once instead of N times.
    target_field_path_parts = target_field_path.split(".")

    # Here we intersect the filter value sets because when we have multiple `filter_hashes`,
    # the filters are ANDed together. Only documents that match ALL the filters will be
    # returned. Therefore, we want the intersection of filter value sets.
    map_reduce_sets(filter_hashes, :intersection, negate: false) do |filter_hash|
      filter_value_set_for_filter_hash(filter_hash, target_field_path_parts, negate: false)
    end
  end

  # Determines the set of filter values for one of our `target_field_paths` values and one
  # `filter_hash` from a list of filter hashes. Note that this method is called recursively,
  # with `traversed_field_path_parts` as an accumulator that accumulates the path to a nested
  # field we are filtering on.
  def filter_value_set_for_filter_hash(filter_hash, target_field_path_parts, traversed_field_path_parts = [], negate:)
    # Here we intersect the filter value sets because when we have multiple entries in a filter hash,
    # the filters are ANDed together. Only documents that match ALL the filters will be
    # returned. Therefore, we want the intersection of filter value sets.
    map_reduce_sets(filter_hash, :intersection, negate: negate) do |key, value|
      filter_value_set_for_filter_hash_entry(key, value, target_field_path_parts, traversed_field_path_parts, negate: negate)
    end
  end

  # Determines the set of filter values for one of our `target_field_paths` and one
  # entry from one `filter_hash`. The key/value pair from a single entry is passed as the
  # first two arguments. Depending on where we are at in recursing through the nested structure,
  # the key could identify either a field we are filtering on or a filtering operator to apply
  # to a particular field.
  def filter_value_set_for_filter_hash_entry(field_or_op, filter_value, target_field_path_parts, traversed_field_path_parts, negate:)
    if filter_value.nil?
      # Any filter with a `nil` value is effectively ignored by our filtering logic, so we need
      # to return our `@all_values_set` to indicate this filter matches all documents.
      @all_values_set
    elsif field_or_op == @schema_names.not
      # `not` flips the negation state for everything nested beneath it.
      filter_value_set_for_filter_hash(filter_value, target_field_path_parts, traversed_field_path_parts, negate: !negate)
    elsif filter_value.is_a?(::Hash)
      # At this point, the only time `filter_value` is a hash is when `field_or_op` is a field name.
      # In that case, `filter_value` is a hash of filters that apply to that field.
      filter_value_set_for_filter_hash(filter_value, target_field_path_parts, traversed_field_path_parts + [field_or_op], negate: negate)
    elsif field_or_op == @schema_names.any_of
      filter_value_set_for_any_of(filter_value, target_field_path_parts, traversed_field_path_parts, negate: negate)
    elsif target_field_path_parts == traversed_field_path_parts
      # We have reached an operator applied directly to the target field.
      set = filter_value_set_for_field_filter(field_or_op, filter_value)
      negate ? set.negate : set
    else
      # Otherwise, we have no information in this clause to limit our filter value set.
      @all_values_set
    end
  end

  # Determines the set of filter values for an `any_of` clause, which is used for ORing multiple filters together.
  def filter_value_set_for_any_of(filter_hashes, target_field_path_parts, traversed_field_path_parts, negate:)
    # Here we union the filter value sets because `any_of` represents an OR. If we can determine specific
    # filter values for all `any_of` clauses, we will OR them together. Alternately, if we cannot
    # determine specific filter values for any clauses, we will union `@all_values_set`,
    # which will result in a return value of `@all_values_set`. This is correct because if there
    # is an `any_of` clause that does not match on the `target_field_path_parts` then the filter
    # excludes no documents on the basis of the target filter.
    map_reduce_sets(filter_hashes, :union, negate: negate) do |filter_hash|
      filter_value_set_for_filter_hash(filter_hash, target_field_path_parts, traversed_field_path_parts, negate: negate)
    end
  end

  # Determines the set of filter values for a single filter on a single field.
  # Falls back to `@all_values_set` when the builder block returns a falsy value.
  def filter_value_set_for_field_filter(filter_op, filter_value)
    operator_name = @schema_names.canonical_name_for(filter_op)
    @build_set_for_filter.call(operator_name, filter_value) || @all_values_set
  end

  # Maps over the provided `collection` by applying the given `map_transform`
  # (which must transform a collection entry to an instance of our set representation), then reduces
  # the resulting collection to a single set value. `reduction` will be either `:union` or `:intersection`.
  #
  # If the collection is empty, we return `@all_values_set` because it's the only "safe" value
  # we can return. We don't have any information that would allow us to limit the set of filter
  # values in any way.
  def map_reduce_sets(collection, reduction, negate:, &map_transform)
    return @all_values_set if collection.empty?

    # In the case where `negate` is true (`not` is present somewhere in the filtering expression),
    # we negate the reduction operator. Utilizing De Morgan’s Law (¬(A ∪ B) <-> (¬A) ∩ (¬B)),
    # the negation of the union of two sets is the intersection of the negation of each set (the negation
    # of each set is the difference between @all_values_set and the given set)--and vice versa.
    reduction = REDUCTION_INVERSIONS.fetch(reduction) if negate

    collection.map(&map_transform).reduce(reduction)
  end
end
146
+ end
147
+ end
148
+ end