elasticgraph-schema_definition 0.18.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +7 -0
  4. data/elasticgraph-schema_definition.gemspec +26 -0
  5. data/lib/elastic_graph/schema_definition/api.rb +359 -0
  6. data/lib/elastic_graph/schema_definition/factory.rb +506 -0
  7. data/lib/elastic_graph/schema_definition/indexing/derived_fields/append_only_set.rb +79 -0
  8. data/lib/elastic_graph/schema_definition/indexing/derived_fields/field_initializer_support.rb +59 -0
  9. data/lib/elastic_graph/schema_definition/indexing/derived_fields/immutable_value.rb +99 -0
  10. data/lib/elastic_graph/schema_definition/indexing/derived_fields/min_or_max_value.rb +62 -0
  11. data/lib/elastic_graph/schema_definition/indexing/derived_indexed_type.rb +346 -0
  12. data/lib/elastic_graph/schema_definition/indexing/event_envelope.rb +74 -0
  13. data/lib/elastic_graph/schema_definition/indexing/field.rb +181 -0
  14. data/lib/elastic_graph/schema_definition/indexing/field_reference.rb +51 -0
  15. data/lib/elastic_graph/schema_definition/indexing/field_type/enum.rb +65 -0
  16. data/lib/elastic_graph/schema_definition/indexing/field_type/object.rb +113 -0
  17. data/lib/elastic_graph/schema_definition/indexing/field_type/scalar.rb +51 -0
  18. data/lib/elastic_graph/schema_definition/indexing/field_type/union.rb +70 -0
  19. data/lib/elastic_graph/schema_definition/indexing/index.rb +318 -0
  20. data/lib/elastic_graph/schema_definition/indexing/json_schema_field_metadata.rb +34 -0
  21. data/lib/elastic_graph/schema_definition/indexing/json_schema_with_metadata.rb +234 -0
  22. data/lib/elastic_graph/schema_definition/indexing/list_counts_mapping.rb +53 -0
  23. data/lib/elastic_graph/schema_definition/indexing/relationship_resolver.rb +96 -0
  24. data/lib/elastic_graph/schema_definition/indexing/rollover_config.rb +25 -0
  25. data/lib/elastic_graph/schema_definition/indexing/update_target_factory.rb +54 -0
  26. data/lib/elastic_graph/schema_definition/indexing/update_target_resolver.rb +195 -0
  27. data/lib/elastic_graph/schema_definition/json_schema_pruner.rb +61 -0
  28. data/lib/elastic_graph/schema_definition/mixins/can_be_graphql_only.rb +31 -0
  29. data/lib/elastic_graph/schema_definition/mixins/has_derived_graphql_type_customizations.rb +119 -0
  30. data/lib/elastic_graph/schema_definition/mixins/has_directives.rb +65 -0
  31. data/lib/elastic_graph/schema_definition/mixins/has_documentation.rb +74 -0
  32. data/lib/elastic_graph/schema_definition/mixins/has_indices.rb +281 -0
  33. data/lib/elastic_graph/schema_definition/mixins/has_readable_to_s_and_inspect.rb +46 -0
  34. data/lib/elastic_graph/schema_definition/mixins/has_subtypes.rb +116 -0
  35. data/lib/elastic_graph/schema_definition/mixins/has_type_info.rb +181 -0
  36. data/lib/elastic_graph/schema_definition/mixins/implements_interfaces.rb +122 -0
  37. data/lib/elastic_graph/schema_definition/mixins/supports_default_value.rb +47 -0
  38. data/lib/elastic_graph/schema_definition/mixins/supports_filtering_and_aggregation.rb +267 -0
  39. data/lib/elastic_graph/schema_definition/mixins/verifies_graphql_name.rb +38 -0
  40. data/lib/elastic_graph/schema_definition/rake_tasks.rb +190 -0
  41. data/lib/elastic_graph/schema_definition/results.rb +404 -0
  42. data/lib/elastic_graph/schema_definition/schema_artifact_manager.rb +482 -0
  43. data/lib/elastic_graph/schema_definition/schema_elements/argument.rb +56 -0
  44. data/lib/elastic_graph/schema_definition/schema_elements/built_in_types.rb +1541 -0
  45. data/lib/elastic_graph/schema_definition/schema_elements/deprecated_element.rb +21 -0
  46. data/lib/elastic_graph/schema_definition/schema_elements/directive.rb +40 -0
  47. data/lib/elastic_graph/schema_definition/schema_elements/enum_type.rb +189 -0
  48. data/lib/elastic_graph/schema_definition/schema_elements/enum_value.rb +73 -0
  49. data/lib/elastic_graph/schema_definition/schema_elements/enum_value_namer.rb +89 -0
  50. data/lib/elastic_graph/schema_definition/schema_elements/enums_for_indexed_types.rb +82 -0
  51. data/lib/elastic_graph/schema_definition/schema_elements/field.rb +1085 -0
  52. data/lib/elastic_graph/schema_definition/schema_elements/field_path.rb +112 -0
  53. data/lib/elastic_graph/schema_definition/schema_elements/field_source.rb +16 -0
  54. data/lib/elastic_graph/schema_definition/schema_elements/graphql_sdl_enumerator.rb +113 -0
  55. data/lib/elastic_graph/schema_definition/schema_elements/input_field.rb +31 -0
  56. data/lib/elastic_graph/schema_definition/schema_elements/input_type.rb +60 -0
  57. data/lib/elastic_graph/schema_definition/schema_elements/interface_type.rb +72 -0
  58. data/lib/elastic_graph/schema_definition/schema_elements/list_counts_state.rb +40 -0
  59. data/lib/elastic_graph/schema_definition/schema_elements/object_type.rb +53 -0
  60. data/lib/elastic_graph/schema_definition/schema_elements/relationship.rb +218 -0
  61. data/lib/elastic_graph/schema_definition/schema_elements/scalar_type.rb +310 -0
  62. data/lib/elastic_graph/schema_definition/schema_elements/sort_order_enum_value.rb +36 -0
  63. data/lib/elastic_graph/schema_definition/schema_elements/sub_aggregation_path.rb +66 -0
  64. data/lib/elastic_graph/schema_definition/schema_elements/type_namer.rb +237 -0
  65. data/lib/elastic_graph/schema_definition/schema_elements/type_reference.rb +353 -0
  66. data/lib/elastic_graph/schema_definition/schema_elements/type_with_subfields.rb +579 -0
  67. data/lib/elastic_graph/schema_definition/schema_elements/union_type.rb +157 -0
  68. data/lib/elastic_graph/schema_definition/scripting/file_system_repository.rb +77 -0
  69. data/lib/elastic_graph/schema_definition/scripting/script.rb +48 -0
  70. data/lib/elastic_graph/schema_definition/scripting/scripts/field/as_day_of_week.painless +24 -0
  71. data/lib/elastic_graph/schema_definition/scripting/scripts/field/as_time_of_day.painless +41 -0
  72. data/lib/elastic_graph/schema_definition/scripting/scripts/filter/by_time_of_day.painless +22 -0
  73. data/lib/elastic_graph/schema_definition/scripting/scripts/update/index_data.painless +93 -0
  74. data/lib/elastic_graph/schema_definition/state.rb +212 -0
  75. data/lib/elastic_graph/schema_definition/test_support.rb +113 -0
  76. metadata +513 -0
@@ -0,0 +1,318 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/schema_artifacts/runtime_metadata/index_definition"
10
+ require "elastic_graph/schema_artifacts/runtime_metadata/index_field"
11
+ require "elastic_graph/schema_definition/indexing/derived_indexed_type"
12
+ require "elastic_graph/schema_definition/indexing/list_counts_mapping"
13
+ require "elastic_graph/schema_definition/indexing/rollover_config"
14
+ require "elastic_graph/schema_definition/mixins/has_readable_to_s_and_inspect"
15
+ require "elastic_graph/schema_definition/schema_elements/field_path"
16
+ require "elastic_graph/support/hash_util"
17
+
18
+ module ElasticGraph
19
+ module SchemaDefinition
20
+ # Contains schema definition logic specific to indexing (such as JSON schema and mapping generation).
21
+ module Indexing
22
+ # Represents an index in a datastore. Defined within an indexed type. Modeled as a separate object to facilitate
23
+ # further customization of the index.
24
+ #
25
+ # @!attribute [r] name
26
+ # @return [String] name of the index
27
+ # @!attribute [r] default_sort_pairs
28
+ # @return [Array<(String, Symbol)>] (field name, direction) pairs for the default sort
29
+ # @!attribute [r] settings
30
+ # @return [Hash<(String, Object)>] datastore settings for the index
31
+ # @!attribute [r] schema_def_state
32
+ # @return [State] schema definition state
33
+ # @!attribute [r] indexed_type
34
+ # @return [SchemaElements::ObjectType, SchemaElements::InterfaceType, SchemaElements::UnionType] type backed by this index
35
+ # @!attribute [r] routing_field_path
36
+ # @return [Array<String>] path to the field used for shard routing
37
+ # @!attribute [r] rollover_config
38
+ # @return [RolloverConfig, nil] rollover configuration for the index
39
+ class Index < Struct.new(:name, :default_sort_pairs, :settings, :schema_def_state, :indexed_type, :routing_field_path, :rollover_config)
40
+ include Mixins::HasReadableToSAndInspect.new { |i| i.name }
41
+
42
+ # @param name [String] name of the index
43
+ # @param settings [Hash<(String, Object)>] datastore settings for the index
44
+ # @param schema_def_state [State] schema definition state
45
+ # @param indexed_type [SchemaElements::ObjectType, SchemaElements::InterfaceType, SchemaElements::UnionType] type backed by this index
46
+ # @yield [Index] the index, for further customization
47
+ # @api private
48
def initialize(name, settings, schema_def_state, indexed_type)
  # The rollover marker is reserved: `to_index_template_config` appends it to the index name to
  # build the rollover index pattern, so it may not appear in a user-chosen name.
  if name.include?(ROLLOVER_INDEX_INFIX_MARKER)
    raise SchemaError, "`#{name}` is an invalid index definition name since it contains " \
      "`#{ROLLOVER_INDEX_INFIX_MARKER}` which ElasticGraph treats as special."
  end

  # Caller-provided settings are passed through `flatten_and_stringify_keys` (with the `index` prefix)
  # and then override the defaults.
  flattened_settings = Support::HashUtil.flatten_and_stringify_keys(settings, prefix: "index")
  effective_settings = DEFAULT_SETTINGS.merge(flattened_settings)

  # Struct members, in order: name, default_sort_pairs, settings, schema_def_state, indexed_type,
  # routing_field_path (assigned for real just below), rollover_config.
  super(name, [], effective_settings, schema_def_state, indexed_type, [], nil)

  # `id` is the field Elasticsearch/OpenSearch use for routing by default:
  # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-routing-field.html
  # By using it here, it will cause queries to pass a `routing` parameter when
  # searching with id filtering on an index that does not use custom shard routing, giving
  # us a nice efficiency boost.
  self.routing_field_path = public_field_path("id", explanation: "indexed types must have an `id` field")

  yield self if block_given?
end
67
+
68
+ # Specifies how documents in this index should sort by default, when no `orderBy` argument is provided to the GraphQL query.
69
+ #
70
+ # @note the field name strings can be dot-separated nested field paths, but all referenced
71
+ # fields must exist when this is called.
72
+ #
73
+ # @param field_name_direction_pairs [Array<(String, Symbol)>] pairs of field names and `:asc` or `:desc`
74
+ # @return [void]
75
+ #
76
+ # @example Sort on `name` (ascending) with `createdAt` (descending) as a tie-breaker
77
+ # ElasticGraph.define_schema do |schema|
78
+ # schema.object_type "Campaign" do |t|
79
+ # t.field "id", "ID!"
80
+ # t.field "name", "String"
81
+ # t.field "createdAt", "DateTime"
82
+ #
83
+ # t.index "campaigns" do |i|
84
+ # i.default_sort "name", :asc, "createdAt", :desc
85
+ # end
86
+ # end
87
+ # end
88
def default_sort(*field_name_direction_pairs)
  # Stored as the flat argument list (`"name", :asc, "createdAt", :desc`); `runtime_metadata`
  # groups it into (field name, direction) pairs with `each_slice(2)`.
  self[:default_sort_pairs] = field_name_direction_pairs
end
91
+
92
+ # Causes this index to "rollover" at the provided `frequency` based on the value of the provided `timestamp_field_path_name`.
93
+ # This is particularly useful for time-series data. Partitioning the data into `hourly`, `daily`, `monthly` or `yearly` buckets
94
+ # allows for different index configurations, and can be necessary when a dataset is too large to fit in one index given
95
+ # Elasticsearch/OpenSearch limitations on the number of shards in one index. In addition, ElasticGraph optimizes queries which
96
+ # filter on the timestamp field to target the subset of the indices in which matching documents could reside.
97
+ #
98
+ # @note the timestamp field specified here **must be immutable**. To understand why, consider a `:yearly` rollover
99
+ # index used for data based on `createdAt`; if ElasticGraph ingests record `123` with a createdAt of `2023-12-31T23:59:59Z`, it
100
+ # will be indexed in the `2023` index. Later if it receives an update event for record `123` with a `createdAt` of
101
+ # `2024-01-01T00:00:00Z` (a mere one second later!), ElasticGraph will store the new version of the record in the `2024` index,
101
+ # and leave the old copy of the record in the `2023` index unchanged. It’ll have duplicates for that document.
103
+ # @note changing the `rollover` configuration on an existing index that already has data will result in duplicate documents
104
+ #
105
+ # @param frequency [:yearly, :monthly, :daily, :hourly] how often to rollover the index
106
+ # @param timestamp_field_path_name [String] dot-separated path to the timestamp field used for rollover. Note: all referenced
107
+ # fields must exist when this is called.
108
+ # @return [void]
109
+ #
110
+ # @example Define a `campaigns` index to rollover yearly based on `createdAt`
111
+ # ElasticGraph.define_schema do |schema|
112
+ # schema.object_type "Campaign" do |t|
113
+ # t.field "id", "ID!"
114
+ # t.field "name", "String"
115
+ # t.field "createdAt", "DateTime"
116
+ #
117
+ # t.index "campaigns" do |i|
118
+ # i.rollover :yearly, "createdAt"
119
+ # end
120
+ # end
121
+ # end
122
def rollover(frequency, timestamp_field_path_name)
  field_path = public_field_path(timestamp_field_path_name, explanation: "it is referenced as an index `rollover` field")

  # Only `Date`/`DateTime` fields (under whatever names the type namer assigns them) may drive rollover.
  allowed_type_names = date_and_datetime_types
  unless allowed_type_names.include?(field_path.type.fully_unwrapped.name)
    allowed_description = allowed_type_names.map { |type_name| "`#{type_name}`" }.join(" or ")
    raise SchemaError, "rollover field `#{field_path.full_description}` cannot be used for rollover since it is not a #{allowed_description} field."
  end

  # The unwrapped type check above would also accept a list of timestamps, so reject lists explicitly.
  if field_path.type.list?
    raise SchemaError, "rollover field `#{field_path.full_description}` cannot be used for rollover since it is a list field."
  end

  # Every field along the path is marked non-nullable in the JSON schema.
  field_path.path_parts.each do |path_part|
    path_part.json_schema(nullable: false)
  end

  self.rollover_config = RolloverConfig.new(frequency: frequency, timestamp_field_path: field_path)
end
141
+
142
+ # Configures the index to [route documents to shards](https://www.elastic.co/guide/en/elasticsearch/reference/8.15/mapping-routing-field.html)
143
+ # based on the specified field. ElasticGraph optimizes queries that filter on the shard routing field so that they only run on a
144
+ # subset of nodes instead of all nodes. This can make a big difference in query performance if queries usually filter on a certain
145
+ # field. Using an appropriate field for shard routing is often essential for horizontal scaling, as it avoids having every query
146
+ # hit every node, allowing additional nodes to increase query throughput.
147
+ #
148
+ # @note it is essential that the shards are well-balanced. If the data’s distribution is lopsided, using this feature can make
149
+ # performance worse.
150
+ # @note the routing field specified here **must be immutable**. If ElasticGraph receives an updated version of a document with a
151
+ # different routing value, it’ll write the new version of the document to a different shard and leave the copy on the old shard
152
+ # unchanged, leading to duplicates.
153
+ # @note changing the shard routing configuration on an existing index that already has data will result in duplicate documents
154
+ #
155
+ # @param routing_field_path_name [String] dot-separated path to the field used for shard routing. Note: all referenced
156
+ # fields must exist when this is called.
157
+ # @return [void]
158
+ #
159
+ # @example Define a `campaigns` index to shard on `organizationId`
160
+ # ElasticGraph.define_schema do |schema|
161
+ # schema.object_type "Campaign" do |t|
162
+ # t.field "id", "ID!"
163
+ # t.field "name", "String"
164
+ # t.field "organizationId", "ID"
165
+ #
166
+ # t.index "campaigns" do |i|
167
+ # i.route_with "organizationId"
168
+ # end
169
+ # end
170
+ # end
171
def route_with(routing_field_path_name)
  resolved_path = public_field_path(routing_field_path_name, explanation: "it is referenced as an index `route_with` field")

  if !resolved_path.type.leaf?
    raise SchemaError, "shard routing field `#{resolved_path.full_description}` cannot be used for routing since it is not a leaf field."
  end

  self.routing_field_path = resolved_path

  # Parent fields along the path are marked non-nullable; the routing field itself is additionally
  # constrained by a pattern requiring at least one non-whitespace character.
  resolved_path.path_parts[0..-2].each do |parent_field|
    parent_field.json_schema(nullable: false)
  end
  resolved_path.last_part.json_schema(nullable: false, pattern: HAS_NON_WHITE_SPACE_REGEX)

  # Surface the routing field in the type's documentation so API consumers know to filter on it.
  indexed_type.append_to_documentation "For more performant queries on this type, please filter on `#{routing_field_path_name}` if possible."
end
184
+
185
+ # @see #route_with
186
+ # @return [Boolean] whether or not this index uses custom shard routing
187
def uses_custom_routing?
  # `initialize` routes on `id` by default; any other path means `route_with` was called.
  path_in_index = routing_field_path.path_in_index
  path_in_index != "id"
end
190
+
191
+ # @return [Hash<String, Object>] datastore configuration for this index for when it does not use rollover
192
def to_index_config
  config = {"aliases" => {}, "mappings" => mappings, "settings" => settings}

  # Drop any entry whose value is `nil`.
  config.compact
end
199
+
200
+ # @return [Hash<String, Object>] datastore configuration for the index template that will be defined if rollover is used
201
def to_index_template_config
  # The template applies to every index named `<name><rollover marker><anything>`.
  index_pattern = "#{name}#{ROLLOVER_INDEX_INFIX_MARKER}*"
  template = {"aliases" => {}, "mappings" => mappings, "settings" => settings}

  {"index_patterns" => [index_pattern], "template" => template}
end
211
+
212
+ # @return [SchemaArtifacts::RuntimeMetadata::IndexDefinition] runtime metadata for this index
213
def runtime_metadata
  routing_path_in_index = routing_field_path.path_in_index
  rollover_metadata = rollover_config&.runtime_metadata
  current_sources = indexed_type.current_sources
  fields_by_path = indexed_type.index_field_runtime_metadata_tuples.to_h

  # `default_sort_pairs` holds the flat argument list given to `default_sort`, so consecutive
  # entries are grouped into (field path name, direction) pairs here.
  default_sort_fields = default_sort_pairs.each_slice(2).map do |(graphql_field_path_name, direction)|
    sort_field_path = public_field_path(graphql_field_path_name, explanation: "it is referenced as an index `default_sort` field")

    SchemaArtifacts::RuntimeMetadata::SortField.new(
      field_path: sort_field_path.path_in_index,
      direction: direction
    )
  end

  SchemaArtifacts::RuntimeMetadata::IndexDefinition.new(
    route_with: routing_path_in_index,
    rollover: rollover_metadata,
    current_sources: current_sources,
    fields_by_path: fields_by_path,
    default_sort_fields: default_sort_fields
  )
end
227
+
228
+ private
229
+
230
# A regex that requires at least one non-whitespace character.
# Note: this does not use the `\S` character class because it's recommended to use a small subset
# of Regex syntax:
#
# > The regular expression syntax used is from JavaScript (ECMA 262, specifically). However, that
# > complete syntax is not widely supported, therefore it is recommended that you stick to the subset
# > of that syntax described below.
#
# (From https://json-schema.org/understanding-json-schema/reference/regular_expressions.html)
HAS_NON_WHITE_SPACE_REGEX = "[^ \t\n]+"

# Settings applied to every index; `initialize` merges the caller-provided settings over these.
# Frozen so that the shared defaults cannot be mutated by accident (`Hash#merge` returns a new hash,
# so merging is unaffected).
DEFAULT_SETTINGS = {
  "index.mapping.ignore_malformed" => false,
  "index.mapping.coerce" => false,
  "index.number_of_replicas" => 1,
  "index.number_of_shards" => 1
}.freeze
247
+
248
# Builds the datastore mapping for this index: the indexed type's own field mappings, plus the
# list-counts mapping and the internal `__sources`/`__versions` fields, with strict dynamic mapping
# at the root and the optional `_routing`/`_size` root settings.
def mappings
  internal_properties = {
    "__sources" => {"type" => "keyword"},
    "__versions" => {
      "type" => "object",
      # __versions is a map keyed by relationship name, with values that are maps keyed by id. Since it's not
      # a static object with known fields, we need to use dynamic here. Passing `false` allows some level
      # of dynamicness. As per https://www.elastic.co/guide/en/elasticsearch/reference/8.7/dynamic.html#dynamic-parameters:
      #
      # > New fields are ignored. These fields will not be indexed or searchable, but will still appear in the _source
      # > field of returned hits. These fields will not be added to the mapping, and new fields must be added explicitly.
      #
      # We need `__versions` to be in `_source` (so that our update scripts can operate on it), but
      # have no need for it to be searchable (as it's just an internal data structure used for indexing).
      #
      # Note: we intentionally set false as a string here, because that's how the datastore echoes it back
      # to us when you query the mapping (even if you set it as a boolean). Our checks for index mapping
      # consistency fail validation if we set it as a boolean since the datastore doesn't echo it back as
      # a boolean.
      "dynamic" => "false"
    }
  }

  # `type` is invalid at the mapping root because it always has to be an object.
  type_mapping = indexed_type.to_indexing_field_type.to_mapping.except("type")
  mapping_with_counts = ListCountsMapping.merged_into(type_mapping, for_type: indexed_type)
  field_mappings = Support::HashUtil.deep_merge(mapping_with_counts, {"properties" => internal_properties})

  root_mapping = {"dynamic" => "strict"}.merge(field_mappings)

  # If we are using custom shard routing, we want to require a `routing` value to be provided
  # in every single index, get, delete or update request; otherwise the request might be
  # made against the wrong shard.
  root_mapping["_routing"] = {"required" => true} if uses_custom_routing?
  root_mapping["_size"] = {"enabled" => true} if schema_def_state.index_document_sizes?

  root_mapping
end
286
+
287
# Resolves `public_path_string` (a dot-separated public field path) against the indexed type,
# refusing to traverse through list fields. Returns the resolved path, or raises a `SchemaError`
# whose message incorporates `explanation` and hints at the likely cause.
def public_field_path(public_path_string, explanation:)
  not_a_list = ->(parent_field) { !parent_field.type.list? }
  resolver = SchemaElements::FieldPath::Resolver.new

  resolved_path = resolver.resolve_public_path(indexed_type, public_path_string, &not_a_list)
  return resolved_path if resolved_path

  first_part, *nested_parts = public_path_string.split(".")
  message_fragments = ["Field `#{indexed_type.name}.#{public_path_string}` cannot be resolved, but #{explanation}."]

  # If it is a nested field path, the problem could be that a type has been referenced which does not exist, so mention that.
  unless nested_parts.empty?
    message_fragments << " Verify that all fields and types referenced by `#{public_path_string}` are defined."
  end

  # If the first part of the path doesn't resolve, the problem could be that the field is defined after the `index` call
  # but it needs to be defined before it, so mention that.
  if resolver.resolve_public_path(indexed_type, first_part, &not_a_list).nil?
    message_fragments << " Note: the `#{indexed_type.name}.#{first_part}` definition must come before the `index` call."
  end

  raise SchemaError, message_fragments.join
end
309
+
310
# Names of the `Date` and `DateTime` types as assigned by the schema's type namer. Memoized.
def date_and_datetime_types
  return @date_and_datetime_types if @date_and_datetime_types

  @date_and_datetime_types = ["Date", "DateTime"].map do |standard_name|
    schema_def_state.type_namer.name_for(standard_name)
  end
end
315
+ end
316
+ end
317
+ end
318
+ end
@@ -0,0 +1,34 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ module ElasticGraph
10
+ module SchemaDefinition
11
+ module Indexing
12
# @!parse class JSONSchemaFieldMetadata; end
JSONSchemaFieldMetadata = ::Data.define(:type, :name_in_index)

# Metadata about an ElasticGraph field that needs to be stored in our versioned JSON schemas
# alongside the JSON schema fields.
#
# @!attribute [r] type
#   @return [String] name of the ElasticGraph type for this field
# @!attribute [r] name_in_index
#   @return [String] name of the field in the index
#
# @api private
class JSONSchemaFieldMetadata < ::Data
  # @return [Hash<String, String>] hash form of the metadata that can be dumped in JSON schema
  def to_dumpable_hash
    dumpable = {}
    dumpable["type"] = type
    # Note the camelCase key: the dumped form uses `nameInIndex` rather than `name_in_index`.
    dumpable["nameInIndex"] = name_in_index
    dumpable
  end

  # @dynamic initialize, type, name_in_index
end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,234 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/constants"
10
+
11
+ module ElasticGraph
12
+ module SchemaDefinition
13
+ module Indexing
14
+ # Represents the result of merging a JSON schema with metadata. The result includes both
15
+ # the merged JSON schema and the problems found while merging: `missing_fields` and `missing_types`
16
+ # (for which metadata could not be determined), `definition_conflicts`, and `missing_necessary_fields`.
17
+ #
18
+ # @private
19
+ class JSONSchemaWithMetadata < ::Data.define(
20
+ # The JSON schema.
21
+ :json_schema,
22
+ # A set of fields (in the form `Type.field`) that were needed but not found.
23
+ :missing_fields,
24
+ # A set of type names that were needed but not found.
25
+ :missing_types,
26
+ # A set of `DeprecatedElement` objects that create conflicting definitions.
27
+ :definition_conflicts,
28
+ # A set of fields that have been deleted but that must be retained (e.g. for custom shard routing or rollover)
29
+ :missing_necessary_fields
30
+ )
31
# Version recorded in the JSON schema itself. Raises `KeyError` if the schema has no version entry.
def json_schema_version
  version_key = JSON_SCHEMA_VERSION_KEY
  json_schema.fetch(version_key)
end
34
+
35
+ # Responsible for building `JSONSchemaWithMetadata` instances.
36
+ #
37
+ # @private
38
+ class Merger
39
+ # @dynamic unused_deprecated_elements
40
+ attr_reader :unused_deprecated_elements
41
+
42
# Captures the lookup tables needed for merging from `schema_def_results`, and seeds
# `unused_deprecated_elements` with every deprecated element; elements are removed from that
# set as they are matched against JSON schema types/fields during merging.
def initialize(schema_def_results)
  @field_metadata_by_type_and_field_name = schema_def_results.json_schema_field_metadata_by_type_and_field_name

  state = schema_def_results.state
  @renamed_types_by_old_name = state.renamed_types_by_old_name
  @deleted_types_by_old_name = state.deleted_types_by_old_name
  @renamed_fields_by_type_name_and_old_field_name = state.renamed_fields_by_type_name_and_old_field_name
  @deleted_fields_by_type_name_and_old_field_name = state.deleted_fields_by_type_name_and_old_field_name
  @state = state
  @derived_indexing_type_names = schema_def_results.derived_indexing_type_names

  deprecated_types = @renamed_types_by_old_name.values + @deleted_types_by_old_name.values

  # The field tables are nested two levels deep (type name -> old field name -> element).
  deprecated_fields = [
    @renamed_fields_by_type_name_and_old_field_name,
    @deleted_fields_by_type_name_and_old_field_name
  ].flat_map { |fields_by_type_name| fields_by_type_name.values.flat_map(&:values) }

  @unused_deprecated_elements = (deprecated_types + deprecated_fields).to_set
end
58
+
59
# Returns a `JSONSchemaWithMetadata` whose JSON schema is `json_schema` with an `ElasticGraph`
# metadata entry merged into each property of each type definition under `$defs`, along with the
# problems (missing fields/types, definition conflicts, missing necessary fields) found on the way.
# The event envelope definition, definitions without `properties`, and `__typename` properties are
# passed through unchanged.
def merge_metadata_into(json_schema)
  missing_fields = ::Set.new
  missing_types = ::Set.new
  definition_conflicts = ::Set.new
  old_type_name_by_current_name = {} # : ::Hash[String, String]

  merged_defs = json_schema.fetch("$defs").each_with_object({}) do |(type_name, type_def), defs|
    properties = (type_name == EVENT_ENVELOPE_JSON_SCHEMA_NAME) ? nil : type_def["properties"]

    unless properties
      defs[type_name] = type_def
      next
    end

    current_type_name = determine_current_type_name(
      type_name,
      missing_types: missing_types,
      definition_conflicts: definition_conflicts
    )

    # Remember which historical name each current type had in this JSON schema version.
    old_type_name_by_current_name[current_type_name] = type_name if current_type_name

    properties_with_metadata = properties.each_with_object({}) do |(field_name, prop), props|
      if field_name == "__typename"
        props[field_name] = prop
        next
      end

      # When the type itself could not be resolved there is nothing to look the field up against,
      # so the property gets a `nil` metadata entry.
      field_metadata =
        if current_type_name
          field_metadata_for(
            current_type_name,
            field_name,
            missing_fields: missing_fields,
            definition_conflicts: definition_conflicts
          )
        end

      props[field_name] = prop.merge({"ElasticGraph" => field_metadata&.to_dumpable_hash})
    end

    defs[type_name] = type_def.merge({"properties" => properties_with_metadata})
  end

  merged_schema = json_schema.merge("$defs" => merged_defs)

  JSONSchemaWithMetadata.new(
    json_schema: merged_schema,
    missing_fields: missing_fields,
    missing_types: missing_types,
    definition_conflicts: definition_conflicts,
    missing_necessary_fields: identify_missing_necessary_fields(merged_schema, old_type_name_by_current_name)
  )
end
110
+
111
+ private
112
+
113
# Given a historical `type_name`, determines (and returns) the current name for that type.
#
# Returns `nil` when the type has been deleted, or when it cannot be accounted for at all (in which
# case it is also recorded in `missing_types`). Deprecated elements matched here are removed from
# `@unused_deprecated_elements`; when more than one of "exists currently", "deleted" and "renamed"
# applies, the deprecated elements involved are added to `definition_conflicts`.
def determine_current_type_name(type_name, missing_types:, definition_conflicts:)
  exists_currently = @field_metadata_by_type_and_field_name.key?(type_name)
  deleted = @deleted_types_by_old_name[type_name]
  renamed = @renamed_types_by_old_name[type_name]

  matched_deprecations = [deleted, renamed].compact
  matched_deprecations.each { |element| @unused_deprecated_elements.delete(element) }

  applicable = [exists_currently, deleted, renamed].select { |candidate| candidate }
  definition_conflicts.merge(matched_deprecations) if applicable.size > 1

  if exists_currently
    type_name
  elsif deleted
    nil
  elsif renamed
    renamed.name
  else
    missing_types << type_name
    nil
  end
end
130
+
131
# Given a historical `type_name` and `field_name`, determines (and returns) the field metadata for it.
#
# Returns `nil` when the field has been deleted, or when it cannot be accounted for at all (in which
# case `Type.field` is also recorded in `missing_fields`). For a renamed field, the metadata of the
# field's new name is returned. Deprecated elements matched here are removed from
# `@unused_deprecated_elements`; when more than one of "exists currently", "deleted" and "renamed"
# applies, copies of the deprecated elements (renamed to `Type.field`) are added to `definition_conflicts`.
def field_metadata_for(type_name, field_name, missing_fields:, definition_conflicts:)
  full_name = "#{type_name}.#{field_name}"

  current_meta = @field_metadata_by_type_and_field_name.dig(type_name, field_name)
  deleted = @deleted_fields_by_type_name_and_old_field_name.dig(type_name, field_name)
  renamed = @renamed_fields_by_type_name_and_old_field_name.dig(type_name, field_name)

  matched_deprecations = [deleted, renamed].compact
  matched_deprecations.each { |element| @unused_deprecated_elements.delete(element) }

  applicable = [current_meta, deleted, renamed].select { |candidate| candidate }
  if applicable.size > 1
    definition_conflicts.merge(matched_deprecations.map { |element| element.with(name: full_name) })
  end

  if current_meta
    current_meta
  elsif deleted
    nil
  elsif renamed
    @field_metadata_by_type_and_field_name.dig(type_name, renamed.name)
  else
    missing_fields << full_name
    nil
  end
end
154
+
155
# Collects a `MissingNecessaryField` for every routing/rollover field path of every index on an
# indexed, non-derived object type that `json_schema` can no longer account for.
def identify_missing_necessary_fields(json_schema, old_type_name_by_current_name)
  json_schema_resolver = JSONSchemaResolver.new(@state, json_schema, old_type_name_by_current_name)
  version = json_schema.fetch(JSON_SCHEMA_VERSION_KEY)

  types_to_check = @state.object_types_by_name.values.select do |type|
    type.indexed? && !@derived_indexing_type_names.include?(type.name)
  end

  types_to_check.each_with_object([]) do |object_type, missing_necessary_fields|
    object_type.indices.each do |index_def|
      missing_necessary_fields.concat(
        identify_missing_necessary_fields_for_index_def(object_type, index_def, json_schema_resolver, version)
      )
    end
  end
end
169
+
170
# Checks the routing path and (when rollover is configured) the rollover timestamp path of
# `index_def`, returning a `MissingNecessaryField` for each one the resolver reports as missing.
# `object_type` and `json_schema_version` are accepted but not used by the body.
def identify_missing_necessary_fields_for_index_def(object_type, index_def, json_schema_resolver, json_schema_version)
  necessary_paths = [
    ["routing", index_def.routing_field_path],
    ["rollover", index_def.rollover_config&.timestamp_field_path]
  ]

  necessary_paths.filter_map do |field_type, field_path|
    # No path of this kind is configured for the index (e.g. rollover is not used).
    next if field_path.nil?
    next unless json_schema_resolver.necessary_path_missing?(field_path)

    # The JSON schema artifact for this version has no field that maps to the `field_type` path
    # identified by `field_path.fully_qualified_path_in_index`.
    MissingNecessaryField.new(
      field_type: field_type,
      fully_qualified_path: field_path.fully_qualified_path_in_index
    )
  end
end
185
+
186
# Answers whether a field path can still be resolved against a (possibly historical) JSON schema,
# using the `ElasticGraph` metadata stored on each property.
class JSONSchemaResolver
  def initialize(state, json_schema, old_type_name_by_current_name)
    @state = state
    @old_type_name_by_current_name = old_type_name_by_current_name

    # Lazily built and cached per type: the `ElasticGraph` metadata of the type's properties,
    # keyed by `nameInIndex`. Properties without metadata are left out.
    @meta_by_old_type_and_name_in_index = ::Hash.new do |cache, type_name|
      properties = json_schema.fetch("$defs").fetch(type_name).fetch("properties")

      cache[type_name] = properties.each_with_object({}) do |(_property_name, prop), meta_by_name_in_index|
        metadata = prop["ElasticGraph"]
        meta_by_name_in_index[metadata.fetch("nameInIndex")] = metadata if metadata
      end
    end
  end

  # Indicates if the given `field_path` is (1) necessary and (2) missing from the JSON schema, indicating a problem.
  #
  # - Returns `false` if the given `field_path` is present in the JSON schema.
  # - Returns `false` if the parent type of `field_path` has not been retained in this JSON schema version
  #   (in that case, the field path is not necessary).
  # - Otherwise, returns `true` since the field path is both necessary and missing.
  def necessary_path_missing?(field_path)
    parent_type = field_path.first_part.parent_type.name

    field_path.path_parts.each do |path_part|
      part_missing = necessary_path_part_missing?(parent_type, path_part.name_in_index) do |meta|
        # The part was found: its (unwrapped) type becomes the parent of the next part.
        parent_type = @state.type_ref(meta.fetch("type")).fully_unwrapped.name
      end

      return true if part_missing
    end

    false
  end

  private

  # Returns `true` when `parent_type` is present in this JSON schema version but has no field with
  # the given `name_in_index`. When the field is found, its metadata is yielded and `false` is returned.
  def necessary_path_part_missing?(parent_type, name_in_index)
    old_type_name = @old_type_name_by_current_name[parent_type]
    return false unless old_type_name

    meta = @meta_by_old_type_and_name_in_index[old_type_name][name_in_index]
    return true unless meta

    yield meta
    false
  end
end
228
+ end
229
+
230
+ MissingNecessaryField = ::Data.define(:field_type, :fully_qualified_path)
231
+ end
232
+ end
233
+ end
234
+ end
@@ -0,0 +1,53 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/constants"
10
+ require "elastic_graph/support/hash_util"
11
+
12
+ module ElasticGraph
13
+ module SchemaDefinition
14
+ module Indexing
15
# To support filtering on the `count` of a list field, we need to index the counts as we ingest
# events. This is responsible for defining the mapping for the special `__counts` field in which
# we store the list counts.
#
# @private
module ListCountsMapping
  # Builds the `__counts` field mapping for the given `for_type`. Returns a new `mapping_hash` with
  # the extra `__counts` field merged into it, or `mapping_hash` itself when the type has no list
  # paths that need their counts indexed.
  def self.merged_into(mapping_hash, for_type:)
    counts_properties = {}

    for_type.indexing_fields_by_name_in_index.values.each do |field|
      field.paths_to_lists_for_count_indexing.each do |path|
        # We chose the `integer` type here because:
        #
        # - While we expect datasets with more documents than the max integer value (~2B), we don't expect
        #   individual documents to have any list fields with more elements than can fit in an integer.
        # - Using `long` would allow for much larger counts, but we don't want to take up double the
        #   storage space for this.
        #
        # Note that `new_list_filter_input_type` (in `schema_definition/factory.rb`) relies on this, and
        # has chosen to use `IntFilterInput` (rather than `JsonSafeLongFilterInput`) for filtering these count values.
        # If we change the mapping type here, we should re-evaluate the filter used there.
        counts_properties[path] = {"type" => "integer"}
      end
    end

    return mapping_hash if counts_properties.empty?

    counts_mapping = {"properties" => {LIST_COUNTS_FIELD => {"properties" => counts_properties}}}
    Support::HashUtil.deep_merge(mapping_hash, counts_mapping)
  end
end
51
+ end
52
+ end
53
+ end