elasticgraph-schema_definition 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +7 -0
  4. data/elasticgraph-schema_definition.gemspec +26 -0
  5. data/lib/elastic_graph/schema_definition/api.rb +359 -0
  6. data/lib/elastic_graph/schema_definition/factory.rb +506 -0
  7. data/lib/elastic_graph/schema_definition/indexing/derived_fields/append_only_set.rb +79 -0
  8. data/lib/elastic_graph/schema_definition/indexing/derived_fields/field_initializer_support.rb +59 -0
  9. data/lib/elastic_graph/schema_definition/indexing/derived_fields/immutable_value.rb +99 -0
  10. data/lib/elastic_graph/schema_definition/indexing/derived_fields/min_or_max_value.rb +62 -0
  11. data/lib/elastic_graph/schema_definition/indexing/derived_indexed_type.rb +346 -0
  12. data/lib/elastic_graph/schema_definition/indexing/event_envelope.rb +74 -0
  13. data/lib/elastic_graph/schema_definition/indexing/field.rb +181 -0
  14. data/lib/elastic_graph/schema_definition/indexing/field_reference.rb +51 -0
  15. data/lib/elastic_graph/schema_definition/indexing/field_type/enum.rb +65 -0
  16. data/lib/elastic_graph/schema_definition/indexing/field_type/object.rb +113 -0
  17. data/lib/elastic_graph/schema_definition/indexing/field_type/scalar.rb +51 -0
  18. data/lib/elastic_graph/schema_definition/indexing/field_type/union.rb +70 -0
  19. data/lib/elastic_graph/schema_definition/indexing/index.rb +318 -0
  20. data/lib/elastic_graph/schema_definition/indexing/json_schema_field_metadata.rb +34 -0
  21. data/lib/elastic_graph/schema_definition/indexing/json_schema_with_metadata.rb +234 -0
  22. data/lib/elastic_graph/schema_definition/indexing/list_counts_mapping.rb +53 -0
  23. data/lib/elastic_graph/schema_definition/indexing/relationship_resolver.rb +96 -0
  24. data/lib/elastic_graph/schema_definition/indexing/rollover_config.rb +25 -0
  25. data/lib/elastic_graph/schema_definition/indexing/update_target_factory.rb +54 -0
  26. data/lib/elastic_graph/schema_definition/indexing/update_target_resolver.rb +195 -0
  27. data/lib/elastic_graph/schema_definition/json_schema_pruner.rb +61 -0
  28. data/lib/elastic_graph/schema_definition/mixins/can_be_graphql_only.rb +31 -0
  29. data/lib/elastic_graph/schema_definition/mixins/has_derived_graphql_type_customizations.rb +119 -0
  30. data/lib/elastic_graph/schema_definition/mixins/has_directives.rb +65 -0
  31. data/lib/elastic_graph/schema_definition/mixins/has_documentation.rb +74 -0
  32. data/lib/elastic_graph/schema_definition/mixins/has_indices.rb +281 -0
  33. data/lib/elastic_graph/schema_definition/mixins/has_readable_to_s_and_inspect.rb +46 -0
  34. data/lib/elastic_graph/schema_definition/mixins/has_subtypes.rb +116 -0
  35. data/lib/elastic_graph/schema_definition/mixins/has_type_info.rb +181 -0
  36. data/lib/elastic_graph/schema_definition/mixins/implements_interfaces.rb +122 -0
  37. data/lib/elastic_graph/schema_definition/mixins/supports_default_value.rb +47 -0
  38. data/lib/elastic_graph/schema_definition/mixins/supports_filtering_and_aggregation.rb +267 -0
  39. data/lib/elastic_graph/schema_definition/mixins/verifies_graphql_name.rb +38 -0
  40. data/lib/elastic_graph/schema_definition/rake_tasks.rb +190 -0
  41. data/lib/elastic_graph/schema_definition/results.rb +404 -0
  42. data/lib/elastic_graph/schema_definition/schema_artifact_manager.rb +482 -0
  43. data/lib/elastic_graph/schema_definition/schema_elements/argument.rb +56 -0
  44. data/lib/elastic_graph/schema_definition/schema_elements/built_in_types.rb +1541 -0
  45. data/lib/elastic_graph/schema_definition/schema_elements/deprecated_element.rb +21 -0
  46. data/lib/elastic_graph/schema_definition/schema_elements/directive.rb +40 -0
  47. data/lib/elastic_graph/schema_definition/schema_elements/enum_type.rb +189 -0
  48. data/lib/elastic_graph/schema_definition/schema_elements/enum_value.rb +73 -0
  49. data/lib/elastic_graph/schema_definition/schema_elements/enum_value_namer.rb +89 -0
  50. data/lib/elastic_graph/schema_definition/schema_elements/enums_for_indexed_types.rb +82 -0
  51. data/lib/elastic_graph/schema_definition/schema_elements/field.rb +1085 -0
  52. data/lib/elastic_graph/schema_definition/schema_elements/field_path.rb +112 -0
  53. data/lib/elastic_graph/schema_definition/schema_elements/field_source.rb +16 -0
  54. data/lib/elastic_graph/schema_definition/schema_elements/graphql_sdl_enumerator.rb +113 -0
  55. data/lib/elastic_graph/schema_definition/schema_elements/input_field.rb +31 -0
  56. data/lib/elastic_graph/schema_definition/schema_elements/input_type.rb +60 -0
  57. data/lib/elastic_graph/schema_definition/schema_elements/interface_type.rb +72 -0
  58. data/lib/elastic_graph/schema_definition/schema_elements/list_counts_state.rb +40 -0
  59. data/lib/elastic_graph/schema_definition/schema_elements/object_type.rb +53 -0
  60. data/lib/elastic_graph/schema_definition/schema_elements/relationship.rb +218 -0
  61. data/lib/elastic_graph/schema_definition/schema_elements/scalar_type.rb +310 -0
  62. data/lib/elastic_graph/schema_definition/schema_elements/sort_order_enum_value.rb +36 -0
  63. data/lib/elastic_graph/schema_definition/schema_elements/sub_aggregation_path.rb +66 -0
  64. data/lib/elastic_graph/schema_definition/schema_elements/type_namer.rb +237 -0
  65. data/lib/elastic_graph/schema_definition/schema_elements/type_reference.rb +353 -0
  66. data/lib/elastic_graph/schema_definition/schema_elements/type_with_subfields.rb +579 -0
  67. data/lib/elastic_graph/schema_definition/schema_elements/union_type.rb +157 -0
  68. data/lib/elastic_graph/schema_definition/scripting/file_system_repository.rb +77 -0
  69. data/lib/elastic_graph/schema_definition/scripting/script.rb +48 -0
  70. data/lib/elastic_graph/schema_definition/scripting/scripts/field/as_day_of_week.painless +24 -0
  71. data/lib/elastic_graph/schema_definition/scripting/scripts/field/as_time_of_day.painless +41 -0
  72. data/lib/elastic_graph/schema_definition/scripting/scripts/filter/by_time_of_day.painless +22 -0
  73. data/lib/elastic_graph/schema_definition/scripting/scripts/update/index_data.painless +93 -0
  74. data/lib/elastic_graph/schema_definition/state.rb +212 -0
  75. data/lib/elastic_graph/schema_definition/test_support.rb +113 -0
  76. metadata +513 -0
@@ -0,0 +1,318 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/schema_artifacts/runtime_metadata/index_definition"
10
+ require "elastic_graph/schema_artifacts/runtime_metadata/index_field"
11
+ require "elastic_graph/schema_definition/indexing/derived_indexed_type"
12
+ require "elastic_graph/schema_definition/indexing/list_counts_mapping"
13
+ require "elastic_graph/schema_definition/indexing/rollover_config"
14
+ require "elastic_graph/schema_definition/mixins/has_readable_to_s_and_inspect"
15
+ require "elastic_graph/schema_definition/schema_elements/field_path"
16
+ require "elastic_graph/support/hash_util"
17
+
18
+ module ElasticGraph
19
+ module SchemaDefinition
20
+ # Contains schema definition logic specific to indexing (such as JSON schema and mapping generation).
21
+ module Indexing
22
+ # Represents an index in a datastore. Defined within an indexed type. Modeled as a separate object to facilitate
23
+ # further customization of the index.
24
+ #
25
+ # @!attribute [r] name
26
+ # @return [String] name of the index
27
+ # @!attribute [r] default_sort_pairs
28
+ # @return [Array<(String, Symbol)>] (field name, direction) pairs for the default sort
29
+ # @!attribute [r] settings
30
+ # @return [Hash<String, Object>] datastore settings for the index
31
+ # @!attribute [r] schema_def_state
32
+ # @return [State] schema definition state
33
+ # @!attribute [r] indexed_type
34
+ # @return [SchemaElements::ObjectType, SchemaElements::InterfaceType, SchemaElements::UnionType] type backed by this index
35
+ # @!attribute [r] routing_field_path
36
+ # @return [Array<String>] path to the field used for shard routing
37
+ # @!attribute [r] rollover_config
38
+ # @return [RolloverConfig, nil] rollover configuration for the index
39
+ class Index < Struct.new(:name, :default_sort_pairs, :settings, :schema_def_state, :indexed_type, :routing_field_path, :rollover_config)
40
+ include Mixins::HasReadableToSAndInspect.new { |i| i.name }
41
+
42
+ # @param name [String] name of the index
43
+ # @param settings [Hash<String, Object>] datastore settings for the index
44
+ # @param schema_def_state [State] schema definition state
45
+ # @param indexed_type [SchemaElements::ObjectType, SchemaElements::InterfaceType, SchemaElements::UnionType] type backed by this index
46
+ # @yield [Index] the index, for further customization
47
+ # @api private
48
+ def initialize(name, settings, schema_def_state, indexed_type)
49
+ if name.include?(ROLLOVER_INDEX_INFIX_MARKER)
50
+ raise SchemaError, "`#{name}` is an invalid index definition name since it contains " \
51
+ "`#{ROLLOVER_INDEX_INFIX_MARKER}` which ElasticGraph treats as special."
52
+ end
53
+
54
+ settings = DEFAULT_SETTINGS.merge(Support::HashUtil.flatten_and_stringify_keys(settings, prefix: "index"))
55
+
56
+ super(name, [], settings, schema_def_state, indexed_type, [], nil)
57
+
58
+ # `id` is the field Elasticsearch/OpenSearch use for routing by default:
59
+ # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-routing-field.html
60
+ # By using it here, it will cause queries to pass a `routing` parameter when
61
+ # searching with id filtering on an index that does not use custom shard routing, giving
62
+ # us a nice efficiency boost.
63
+ self.routing_field_path = public_field_path("id", explanation: "indexed types must have an `id` field")
64
+
65
+ yield self if block_given?
66
+ end
67
+
68
+ # Specifies how documents in this index should sort by default, when no `orderBy` argument is provided to the GraphQL query.
69
+ #
70
+ # @note the field name strings can be dot-separated paths to nested fields, but all referenced
71
+ # fields must exist when this is called.
72
+ #
73
+ # @param field_name_direction_pairs [Array<(String, Symbol)>] pairs of field names and `:asc` or `:desc`
74
+ # @return [void]
75
+ #
76
+ # @example Sort on `name` (ascending) with `createdAt` (descending) as a tie-breaker
77
+ # ElasticGraph.define_schema do |schema|
78
+ # schema.object_type "Campaign" do |t|
79
+ # t.field "id", "ID!"
80
+ # t.field "name", "String"
81
+ # t.field "createdAt", "DateTime"
82
+ #
83
+ # t.index "campaigns" do |i|
84
+ # i.default_sort "name", :asc, "createdAt", :desc
85
+ # end
86
+ # end
87
+ # end
88
+ def default_sort(*field_name_direction_pairs)
89
+ self.default_sort_pairs = field_name_direction_pairs
90
+ end
91
+
92
+ # Causes this index to "rollover" at the provided `frequency` based on the value of the provided `timestamp_field_path_name`.
93
+ # This is particularly useful for time-series data. Partitioning the data into `hourly`, `daily`, `monthly` or `yearly` buckets
94
+ # allows for different index configurations, and can be necessary when a dataset is too large to fit in one index given
95
+ # Elasticsearch/OpenSearch limitations on the number of shards in one index. In addition, ElasticGraph optimizes queries which
96
+ # filter on the timestamp field to target the subset of the indices in which matching documents could reside.
97
+ #
98
+ # @note the timestamp field specified here **must be immutable**. To understand why, consider a `:yearly` rollover
99
+ # index used for data based on `createdAt`; if ElasticGraph ingests record `123` with a createdAt of `2023-12-31T23:59:59Z`, it
100
+ # will be indexed in the `2023` index. Later if it receives an update event for record `123` with a `createdAt` of
101
+ # `2024-01-01T00:00:00Z` (a mere one second later!), ElasticGraph will store the new version of the record in the `2024` index,
102
+ # and leave the old copy of the record in the `2023` index unchanged. It’ll have duplicates for that document.
103
+ # @note changing the `rollover` configuration on an existing index that already has data will result in duplicate documents
104
+ #
105
+ # @param frequency [:yearly, :monthly, :daily, :hourly] how often to rollover the index
106
+ # @param timestamp_field_path_name [String] dot-separated path to the timestamp field used for rollover. Note: all referenced
107
+ # fields must exist when this is called.
108
+ # @return [void]
109
+ #
110
+ # @example Define a `campaigns` index to rollover yearly based on `createdAt`
111
+ # ElasticGraph.define_schema do |schema|
112
+ # schema.object_type "Campaign" do |t|
113
+ # t.field "id", "ID!"
114
+ # t.field "name", "String"
115
+ # t.field "createdAt", "DateTime"
116
+ #
117
+ # t.index "campaigns" do |i|
118
+ # i.rollover :yearly, "createdAt"
119
+ # end
120
+ # end
121
+ # end
122
+ def rollover(frequency, timestamp_field_path_name)
123
+ timestamp_field_path = public_field_path(timestamp_field_path_name, explanation: "it is referenced as an index `rollover` field")
124
+
125
+ unless date_and_datetime_types.include?(timestamp_field_path.type.fully_unwrapped.name)
126
+ date_or_datetime_description = date_and_datetime_types.map { |t| "`#{t}`" }.join(" or ")
127
+ raise SchemaError, "rollover field `#{timestamp_field_path.full_description}` cannot be used for rollover since it is not a #{date_or_datetime_description} field."
128
+ end
129
+
130
+ if timestamp_field_path.type.list?
131
+ raise SchemaError, "rollover field `#{timestamp_field_path.full_description}` cannot be used for rollover since it is a list field."
132
+ end
133
+
134
+ timestamp_field_path.path_parts.each { |f| f.json_schema nullable: false }
135
+
136
+ self.rollover_config = RolloverConfig.new(
137
+ frequency: frequency,
138
+ timestamp_field_path: timestamp_field_path
139
+ )
140
+ end
141
+
142
+ # Configures the index to [route documents to shards](https://www.elastic.co/guide/en/elasticsearch/reference/8.15/mapping-routing-field.html)
143
+ # based on the specified field. ElasticGraph optimizes queries that filter on the shard routing field so that they only run on a
144
+ # subset of nodes instead of all nodes. This can make a big difference in query performance if queries usually filter on a certain
145
+ # field. Using an appropriate field for shard routing is often essential for horizontal scaling, as it avoids having every query
146
+ # hit every node, allowing additional nodes to increase query throughput.
147
+ #
148
+ # @note it is essential that the shards are well-balanced. If the data’s distribution is lopsided, using this feature can make
149
+ # performance worse.
150
+ # @note the routing field specified here **must be immutable**. If ElasticGraph receives an updated version of a document with a
151
+ # different routing value, it’ll write the new version of the document to a different shard and leave the copy on the old shard
152
+ # unchanged, leading to duplicates.
153
+ # @note changing the shard routing configuration on an existing index that already has data will result in duplicate documents
154
+ #
155
+ # @param routing_field_path_name [String] dot-separated path to the field used for shard routing. Note: all referenced
156
+ # fields must exist when this is called.
157
+ # @return [void]
158
+ #
159
+ # @example Define a `campaigns` index to shard on `organizationId`
160
+ # ElasticGraph.define_schema do |schema|
161
+ # schema.object_type "Campaign" do |t|
162
+ # t.field "id", "ID!"
163
+ # t.field "name", "String"
164
+ # t.field "organizationId", "ID"
165
+ #
166
+ # t.index "campaigns" do |i|
167
+ # i.route_with "organizationId"
168
+ # end
169
+ # end
170
+ # end
171
+ def route_with(routing_field_path_name)
172
+ routing_field_path = public_field_path(routing_field_path_name, explanation: "it is referenced as an index `route_with` field")
173
+
174
+ unless routing_field_path.type.leaf?
175
+ raise SchemaError, "shard routing field `#{routing_field_path.full_description}` cannot be used for routing since it is not a leaf field."
176
+ end
177
+
178
+ self.routing_field_path = routing_field_path
179
+
180
+ routing_field_path.path_parts[0..-2].each { |f| f.json_schema nullable: false }
181
+ routing_field_path.last_part.json_schema nullable: false, pattern: HAS_NON_WHITE_SPACE_REGEX
182
+ indexed_type.append_to_documentation "For more performant queries on this type, please filter on `#{routing_field_path_name}` if possible."
183
+ end
184
+
185
+ # @see #route_with
186
+ # @return [Boolean] whether or not this index uses custom shard routing
187
+ def uses_custom_routing?
188
+ routing_field_path.path_in_index != "id"
189
+ end
190
+
191
+ # @return [Hash<String, Object>] datastore configuration for this index for when it does not use rollover
192
+ def to_index_config
193
+ {
194
+ "aliases" => {},
195
+ "mappings" => mappings,
196
+ "settings" => settings
197
+ }.compact
198
+ end
199
+
200
+ # @return [Hash<String, Object>] datastore configuration for the index template that will be defined if rollover is used
201
+ def to_index_template_config
202
+ {
203
+ "index_patterns" => ["#{name}#{ROLLOVER_INDEX_INFIX_MARKER}*"],
204
+ "template" => {
205
+ "aliases" => {},
206
+ "mappings" => mappings,
207
+ "settings" => settings
208
+ }
209
+ }
210
+ end
211
+
212
+ # @return [SchemaArtifacts::RuntimeMetadata::IndexDefinition] runtime metadata for this index
213
+ def runtime_metadata
214
+ SchemaArtifacts::RuntimeMetadata::IndexDefinition.new(
215
+ route_with: routing_field_path.path_in_index,
216
+ rollover: rollover_config&.runtime_metadata,
217
+ current_sources: indexed_type.current_sources,
218
+ fields_by_path: indexed_type.index_field_runtime_metadata_tuples.to_h,
219
+ default_sort_fields: default_sort_pairs.each_slice(2).map do |(graphql_field_path_name, direction)|
220
+ SchemaArtifacts::RuntimeMetadata::SortField.new(
221
+ field_path: public_field_path(graphql_field_path_name, explanation: "it is referenced as an index `default_sort` field").path_in_index,
222
+ direction: direction
223
+ )
224
+ end
225
+ )
226
+ end
227
+
228
+ private
229
+
230
+ # A regex that requires at least one non-whitespace character.
231
+ # Note: this does not use the `\S` character class because it's recommended to use a small subset
232
+ # of Regex syntax:
233
+ #
234
+ # > The regular expression syntax used is from JavaScript (ECMA 262, specifically). However, that
235
+ # > complete syntax is not widely supported, therefore it is recommended that you stick to the subset
236
+ # > of that syntax described below.
237
+ #
238
+ # (From https://json-schema.org/understanding-json-schema/reference/regular_expressions.html)
239
+ HAS_NON_WHITE_SPACE_REGEX = "[^ \t\n]+"
240
+
241
+ DEFAULT_SETTINGS = {
242
+ "index.mapping.ignore_malformed" => false,
243
+ "index.mapping.coerce" => false,
244
+ "index.number_of_replicas" => 1,
245
+ "index.number_of_shards" => 1
246
+ }
247
+
248
+ def mappings
249
+ field_mappings = indexed_type
250
+ .to_indexing_field_type
251
+ .to_mapping
252
+ .except("type") # `type` is invalid at the mapping root because it always has to be an object.
253
+ .then { |mapping| ListCountsMapping.merged_into(mapping, for_type: indexed_type) }
254
+ .then do |fm|
255
+ Support::HashUtil.deep_merge(fm, {"properties" => {
256
+ "__sources" => {"type" => "keyword"},
257
+ "__versions" => {
258
+ "type" => "object",
259
+ # `__versions` is a map keyed by relationship name, with values that are maps keyed by id. Since it's not
260
+ # a static object with known fields, we need to use dynamic here. Passing `false` allows some level
261
+ # of dynamicness. As per https://www.elastic.co/guide/en/elasticsearch/reference/8.7/dynamic.html#dynamic-parameters:
262
+ #
263
+ # > New fields are ignored. These fields will not be indexed or searchable, but will still appear in the _source
264
+ # > field of returned hits. These fields will not be added to the mapping, and new fields must be added explicitly.
265
+ #
266
+ # We need `__versions` to be in `_source` (so that our update scripts can operate on it), but
267
+ # have no need for it to be searchable (as it's just an internal data structure used for indexing).
268
+ #
269
+ # Note: we intentionally set false as a string here, because that's how the datastore echoes it back
270
+ # to us when you query the mapping (even if you set it as a boolean). Our checks for index mapping
271
+ # consistency fail validation if we set it as a boolean since the datastore doesn't echo it back as
272
+ # a boolean.
273
+ "dynamic" => "false"
274
+ }
275
+ }})
276
+ end
277
+
278
+ {"dynamic" => "strict"}.merge(field_mappings).tap do |hash|
279
+ # If we are using custom shard routing, we want to require a `routing` value to be provided
280
+ # in every single index, get, delete or update request; otherwise the request might be
281
+ # made against the wrong shard.
282
+ hash["_routing"] = {"required" => true} if uses_custom_routing?
283
+ hash["_size"] = {"enabled" => true} if schema_def_state.index_document_sizes?
284
+ end
285
+ end
286
+
287
+ def public_field_path(public_path_string, explanation:)
288
+ parent_is_not_list = ->(parent_field) { !parent_field.type.list? }
289
+ resolver = SchemaElements::FieldPath::Resolver.new
290
+ resolved_path = resolver.resolve_public_path(indexed_type, public_path_string, &parent_is_not_list)
291
+ return resolved_path if resolved_path
292
+
293
+ path_parts = public_path_string.split(".")
294
+ error_msg = "Field `#{indexed_type.name}.#{public_path_string}` cannot be resolved, but #{explanation}."
295
+
296
+ # If it is a nested field path, the problem could be that a type has been referenced which does not exist, so mention that.
297
+ if path_parts.size > 1
298
+ error_msg += " Verify that all fields and types referenced by `#{public_path_string}` are defined."
299
+ end
300
+
301
+ # If the first part of the path doesn't resolve, the problem could be that the field is defined after the `index` call
302
+ # but it needs to be defined before it, so mention that.
303
+ if resolver.resolve_public_path(indexed_type, path_parts.first, &parent_is_not_list).nil?
304
+ error_msg += " Note: the `#{indexed_type.name}.#{path_parts.first}` definition must come before the `index` call."
305
+ end
306
+
307
+ raise SchemaError, error_msg
308
+ end
309
+
310
+ def date_and_datetime_types
311
+ @date_and_datetime_types ||= %w[Date DateTime].map do |type|
312
+ schema_def_state.type_namer.name_for(type)
313
+ end
314
+ end
315
+ end
316
+ end
317
+ end
318
+ end
@@ -0,0 +1,34 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ module ElasticGraph
10
+ module SchemaDefinition
11
+ module Indexing
12
+ # @!parse class JSONSchemaFieldMetadata; end
13
+ JSONSchemaFieldMetadata = ::Data.define(:type, :name_in_index)
14
+
15
+ # Metadata about an ElasticGraph field that needs to be stored in our versioned JSON schemas
16
+ # alongside the JSON schema fields.
17
+ #
18
+ # @!attribute [r] type
19
+ # @return [String] name of the ElasticGraph type for this field
20
+ # @!attribute [r] name_in_index
21
+ # @return [String] name of the field in the index
22
+ #
23
+ # @api private
24
+ class JSONSchemaFieldMetadata < ::Data
25
+ # @return [Hash<String, String>] hash form of the metadata that can be dumped in JSON schema
26
+ def to_dumpable_hash
27
+ {"type" => type, "nameInIndex" => name_in_index}
28
+ end
29
+
30
+ # @dynamic initialize, type, name_in_index
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,234 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/constants"
10
+
11
+ module ElasticGraph
12
+ module SchemaDefinition
13
+ module Indexing
14
+ # Represents the result of merging a JSON schema with metadata. The result includes both
15
+ # the merged JSON schema and a set of `missing_fields` indicating which fields' metadata
16
+ # could not be determined.
17
+ #
18
+ # @private
19
+ class JSONSchemaWithMetadata < ::Data.define(
20
+ # The JSON schema.
21
+ :json_schema,
22
+ # A set of fields (in the form `Type.field`) that were needed but not found.
23
+ :missing_fields,
24
+ # A set of type names that were needed but not found.
25
+ :missing_types,
26
+ # A set of `DeprecatedElement` objects that create conflicting definitions.
27
+ :definition_conflicts,
28
+ # A set of fields that have been deleted but that must be retained (e.g. for custom shard routing or rollover)
29
+ :missing_necessary_fields
30
+ )
31
+ def json_schema_version
32
+ json_schema.fetch(JSON_SCHEMA_VERSION_KEY)
33
+ end
34
+
35
+ # Responsible for building `JSONSchemaWithMetadata` instances.
36
+ #
37
+ # @private
38
+ class Merger
39
+ # @dynamic unused_deprecated_elements
40
+ attr_reader :unused_deprecated_elements
41
+
42
+ def initialize(schema_def_results)
43
+ @field_metadata_by_type_and_field_name = schema_def_results.json_schema_field_metadata_by_type_and_field_name
44
+ @renamed_types_by_old_name = schema_def_results.state.renamed_types_by_old_name
45
+ @deleted_types_by_old_name = schema_def_results.state.deleted_types_by_old_name
46
+ @renamed_fields_by_type_name_and_old_field_name = schema_def_results.state.renamed_fields_by_type_name_and_old_field_name
47
+ @deleted_fields_by_type_name_and_old_field_name = schema_def_results.state.deleted_fields_by_type_name_and_old_field_name
48
+ @state = schema_def_results.state
49
+ @derived_indexing_type_names = schema_def_results.derived_indexing_type_names
50
+
51
+ @unused_deprecated_elements = (
52
+ @renamed_types_by_old_name.values +
53
+ @deleted_types_by_old_name.values +
54
+ @renamed_fields_by_type_name_and_old_field_name.values.flat_map(&:values) +
55
+ @deleted_fields_by_type_name_and_old_field_name.values.flat_map(&:values)
56
+ ).to_set
57
+ end
58
+
59
+ def merge_metadata_into(json_schema)
60
+ missing_fields = ::Set.new
61
+ missing_types = ::Set.new
62
+ definition_conflicts = ::Set.new
63
+ old_type_name_by_current_name = {} # : ::Hash[String, String]
64
+
65
+ defs = json_schema.fetch("$defs").to_h do |type_name, type_def|
66
+ if type_name != EVENT_ENVELOPE_JSON_SCHEMA_NAME && (properties = type_def["properties"])
67
+ current_type_name = determine_current_type_name(
68
+ type_name,
69
+ missing_types: missing_types,
70
+ definition_conflicts: definition_conflicts
71
+ )
72
+
73
+ if current_type_name
74
+ old_type_name_by_current_name[current_type_name] = type_name
75
+ end
76
+
77
+ properties = properties.to_h do |field_name, prop|
78
+ unless field_name == "__typename"
79
+ field_metadata = current_type_name&.then do |name|
80
+ field_metadata_for(
81
+ name,
82
+ field_name,
83
+ missing_fields: missing_fields,
84
+ definition_conflicts: definition_conflicts
85
+ )
86
+ end
87
+
88
+ prop = prop.merge({"ElasticGraph" => field_metadata&.to_dumpable_hash})
89
+ end
90
+
91
+ [field_name, prop]
92
+ end
93
+
94
+ type_def = type_def.merge({"properties" => properties})
95
+ end
96
+
97
+ [type_name, type_def]
98
+ end
99
+
100
+ json_schema = json_schema.merge("$defs" => defs)
101
+
102
+ JSONSchemaWithMetadata.new(
103
+ json_schema: json_schema,
104
+ missing_fields: missing_fields,
105
+ missing_types: missing_types,
106
+ definition_conflicts: definition_conflicts,
107
+ missing_necessary_fields: identify_missing_necessary_fields(json_schema, old_type_name_by_current_name)
108
+ )
109
+ end
110
+
111
+ private
112
+
113
+ # Given a historical `type_name`, determines (and returns) the current name for that type.
114
+ def determine_current_type_name(type_name, missing_types:, definition_conflicts:)
115
+ exists_currently = @field_metadata_by_type_and_field_name.key?(type_name)
116
+ deleted = @deleted_types_by_old_name[type_name]&.tap { |elem| @unused_deprecated_elements.delete(elem) }
117
+ renamed = @renamed_types_by_old_name[type_name]&.tap { |elem| @unused_deprecated_elements.delete(elem) }
118
+
119
+ if [exists_currently, deleted, renamed].count(&:itself) > 1
120
+ definition_conflicts.merge([deleted, renamed].compact)
121
+ end
122
+
123
+ return type_name if exists_currently
124
+ return nil if deleted
125
+ return renamed.name if renamed
126
+
127
+ missing_types << type_name
128
+ nil
129
+ end
130
+
131
+ # Given a historical `type_name` and `field_name` determines (and returns) the field metadata for it.
132
+ def field_metadata_for(type_name, field_name, missing_fields:, definition_conflicts:)
133
+ full_name = "#{type_name}.#{field_name}"
134
+
135
+ current_meta = @field_metadata_by_type_and_field_name.dig(type_name, field_name)
136
+ deleted = @deleted_fields_by_type_name_and_old_field_name.dig(type_name, field_name)&.tap do |elem|
137
+ @unused_deprecated_elements.delete(elem)
138
+ end
139
+ renamed = @renamed_fields_by_type_name_and_old_field_name.dig(type_name, field_name)&.tap do |elem|
140
+ @unused_deprecated_elements.delete(elem)
141
+ end
142
+
143
+ if [current_meta, deleted, renamed].count(&:itself) > 1
144
+ definition_conflicts.merge([deleted, renamed].compact.map { |elem| elem.with(name: full_name) })
145
+ end
146
+
147
+ return current_meta if current_meta
148
+ return nil if deleted
149
+ return @field_metadata_by_type_and_field_name.dig(type_name, renamed.name) if renamed
150
+
151
+ missing_fields << full_name
152
+ nil
153
+ end
154
+
155
+ def identify_missing_necessary_fields(json_schema, old_type_name_by_current_name)
156
+ json_schema_resolver = JSONSchemaResolver.new(@state, json_schema, old_type_name_by_current_name)
157
+ version = json_schema.fetch(JSON_SCHEMA_VERSION_KEY)
158
+
159
+ types_to_check = @state.object_types_by_name.values.select do |type|
160
+ type.indexed? && !@derived_indexing_type_names.include?(type.name)
161
+ end
162
+
163
+ types_to_check.flat_map do |object_type|
164
+ object_type.indices.flat_map do |index_def|
165
+ identify_missing_necessary_fields_for_index_def(object_type, index_def, json_schema_resolver, version)
166
+ end
167
+ end
168
+ end
169
+
170
+ def identify_missing_necessary_fields_for_index_def(object_type, index_def, json_schema_resolver, json_schema_version)
171
+ {
172
+ "routing" => index_def.routing_field_path,
173
+ "rollover" => index_def.rollover_config&.timestamp_field_path
174
+ }.compact.filter_map do |field_type, field_path|
175
+ if json_schema_resolver.necessary_path_missing?(field_path)
176
+ # The JSON schema v#{json_schema_version} artifact has no field that maps to the #{field_type} path of `#{field_path.fully_qualified_path_in_index}`.
177
+
178
+ MissingNecessaryField.new(
179
+ field_type: field_type,
180
+ fully_qualified_path: field_path.fully_qualified_path_in_index
181
+ )
182
+ end
183
+ end
184
+ end
185
+
186
+ class JSONSchemaResolver
187
+ def initialize(state, json_schema, old_type_name_by_current_name)
188
+ @state = state
189
+ @old_type_name_by_current_name = old_type_name_by_current_name
190
+ @meta_by_old_type_and_name_in_index = ::Hash.new do |hash, type_name|
191
+ properties = json_schema.fetch("$defs").fetch(type_name).fetch("properties")
192
+
193
+ hash[type_name] = properties.filter_map do |name, prop|
194
+ if (metadata = prop["ElasticGraph"])
195
+ [metadata.fetch("nameInIndex"), metadata]
196
+ end
197
+ end.to_h
198
+ end
199
+ end
200
+
201
+ # Indicates if the given `field_path` is (1) necessary and (2) missing from the JSON schema, indicating a problem.
202
+ #
203
+ # - Returns `false` if the given `field_path` is present in the JSON schema.
204
+ # - Returns `false` if the parent type of `field_path` has not been retained in this JSON schema version
205
+ # (in that case, the field path is not necessary).
206
+ # - Otherwise, returns `true` since the field path is both necessary and missing.
207
+ def necessary_path_missing?(field_path)
208
+ parent_type = field_path.first_part.parent_type.name
209
+
210
+ field_path.path_parts.any? do |path_part|
211
+ necessary_path_part_missing?(parent_type, path_part.name_in_index) do |meta|
212
+ parent_type = @state.type_ref(meta.fetch("type")).fully_unwrapped.name
213
+ end
214
+ end
215
+ end
216
+
217
+ private
218
+
219
+ def necessary_path_part_missing?(parent_type, name_in_index)
220
+ old_type_name = @old_type_name_by_current_name[parent_type]
221
+ return false unless old_type_name
222
+
223
+ meta = @meta_by_old_type_and_name_in_index.dig(old_type_name, name_in_index)
224
+ yield meta if meta
225
+ !meta
226
+ end
227
+ end
228
+ end
229
+
230
+ MissingNecessaryField = ::Data.define(:field_type, :fully_qualified_path)
231
+ end
232
+ end
233
+ end
234
+ end
@@ -0,0 +1,53 @@
1
+ # Copyright 2024 Block, Inc.
2
+ #
3
+ # Use of this source code is governed by an MIT-style
4
+ # license that can be found in the LICENSE file or at
5
+ # https://opensource.org/licenses/MIT.
6
+ #
7
+ # frozen_string_literal: true
8
+
9
+ require "elastic_graph/constants"
10
+ require "elastic_graph/support/hash_util"
11
+
12
+ module ElasticGraph
13
+ module SchemaDefinition
14
+ module Indexing
15
+ # To support filtering on the `count` of a list field, we need to index the counts as we ingest
16
+ # events. This is responsible for defining the mapping for the special `__counts` field in which
17
+ # we store the list counts.
18
+ #
19
+ # @private
20
+ module ListCountsMapping
21
+ # Builds the `__counts` field mapping for the given `for_type`. Returns a new `mapping_hash` with
22
+ # the extra `__counts` field merged into it.
23
+ def self.merged_into(mapping_hash, for_type:)
24
+ counts_properties = for_type.indexing_fields_by_name_in_index.values.flat_map do |field|
25
+ field.paths_to_lists_for_count_indexing.map do |path|
26
+ # We chose the `integer` type here because:
27
+ #
28
+ # - While we expect datasets with more documents than the max integer value (~2B), we don't expect
29
+ # individual documents to have any list fields with more elements than can fit in an integer.
30
+ # - Using `long` would allow for much larger counts, but we don't want to take up double the
31
+ # storage space for this.
32
+ #
33
+ # Note that `new_list_filter_input_type` (in `schema_definition/factory.rb`) relies on this, and
34
+ # has chosen to use `IntFilterInput` (rather than `JsonSafeLongFilterInput`) for filtering these count values.
35
+ # If we change the mapping type here, we should re-evaluate the filter used there.
36
+ [path, {"type" => "integer"}]
37
+ end
38
+ end.to_h
39
+
40
+ return mapping_hash if counts_properties.empty?
41
+
42
+ Support::HashUtil.deep_merge(mapping_hash, {
43
+ "properties" => {
44
+ LIST_COUNTS_FIELD => {
45
+ "properties" => counts_properties
46
+ }
47
+ }
48
+ })
49
+ end
50
+ end
51
+ end
52
+ end
53
+ end