elasticgraph-schema_definition 0.18.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +7 -0
- data/elasticgraph-schema_definition.gemspec +26 -0
- data/lib/elastic_graph/schema_definition/api.rb +359 -0
- data/lib/elastic_graph/schema_definition/factory.rb +506 -0
- data/lib/elastic_graph/schema_definition/indexing/derived_fields/append_only_set.rb +79 -0
- data/lib/elastic_graph/schema_definition/indexing/derived_fields/field_initializer_support.rb +59 -0
- data/lib/elastic_graph/schema_definition/indexing/derived_fields/immutable_value.rb +99 -0
- data/lib/elastic_graph/schema_definition/indexing/derived_fields/min_or_max_value.rb +62 -0
- data/lib/elastic_graph/schema_definition/indexing/derived_indexed_type.rb +346 -0
- data/lib/elastic_graph/schema_definition/indexing/event_envelope.rb +74 -0
- data/lib/elastic_graph/schema_definition/indexing/field.rb +181 -0
- data/lib/elastic_graph/schema_definition/indexing/field_reference.rb +51 -0
- data/lib/elastic_graph/schema_definition/indexing/field_type/enum.rb +65 -0
- data/lib/elastic_graph/schema_definition/indexing/field_type/object.rb +113 -0
- data/lib/elastic_graph/schema_definition/indexing/field_type/scalar.rb +51 -0
- data/lib/elastic_graph/schema_definition/indexing/field_type/union.rb +70 -0
- data/lib/elastic_graph/schema_definition/indexing/index.rb +318 -0
- data/lib/elastic_graph/schema_definition/indexing/json_schema_field_metadata.rb +34 -0
- data/lib/elastic_graph/schema_definition/indexing/json_schema_with_metadata.rb +234 -0
- data/lib/elastic_graph/schema_definition/indexing/list_counts_mapping.rb +53 -0
- data/lib/elastic_graph/schema_definition/indexing/relationship_resolver.rb +96 -0
- data/lib/elastic_graph/schema_definition/indexing/rollover_config.rb +25 -0
- data/lib/elastic_graph/schema_definition/indexing/update_target_factory.rb +54 -0
- data/lib/elastic_graph/schema_definition/indexing/update_target_resolver.rb +195 -0
- data/lib/elastic_graph/schema_definition/json_schema_pruner.rb +61 -0
- data/lib/elastic_graph/schema_definition/mixins/can_be_graphql_only.rb +31 -0
- data/lib/elastic_graph/schema_definition/mixins/has_derived_graphql_type_customizations.rb +119 -0
- data/lib/elastic_graph/schema_definition/mixins/has_directives.rb +65 -0
- data/lib/elastic_graph/schema_definition/mixins/has_documentation.rb +74 -0
- data/lib/elastic_graph/schema_definition/mixins/has_indices.rb +281 -0
- data/lib/elastic_graph/schema_definition/mixins/has_readable_to_s_and_inspect.rb +46 -0
- data/lib/elastic_graph/schema_definition/mixins/has_subtypes.rb +116 -0
- data/lib/elastic_graph/schema_definition/mixins/has_type_info.rb +181 -0
- data/lib/elastic_graph/schema_definition/mixins/implements_interfaces.rb +122 -0
- data/lib/elastic_graph/schema_definition/mixins/supports_default_value.rb +47 -0
- data/lib/elastic_graph/schema_definition/mixins/supports_filtering_and_aggregation.rb +267 -0
- data/lib/elastic_graph/schema_definition/mixins/verifies_graphql_name.rb +38 -0
- data/lib/elastic_graph/schema_definition/rake_tasks.rb +190 -0
- data/lib/elastic_graph/schema_definition/results.rb +404 -0
- data/lib/elastic_graph/schema_definition/schema_artifact_manager.rb +482 -0
- data/lib/elastic_graph/schema_definition/schema_elements/argument.rb +56 -0
- data/lib/elastic_graph/schema_definition/schema_elements/built_in_types.rb +1541 -0
- data/lib/elastic_graph/schema_definition/schema_elements/deprecated_element.rb +21 -0
- data/lib/elastic_graph/schema_definition/schema_elements/directive.rb +40 -0
- data/lib/elastic_graph/schema_definition/schema_elements/enum_type.rb +189 -0
- data/lib/elastic_graph/schema_definition/schema_elements/enum_value.rb +73 -0
- data/lib/elastic_graph/schema_definition/schema_elements/enum_value_namer.rb +89 -0
- data/lib/elastic_graph/schema_definition/schema_elements/enums_for_indexed_types.rb +82 -0
- data/lib/elastic_graph/schema_definition/schema_elements/field.rb +1085 -0
- data/lib/elastic_graph/schema_definition/schema_elements/field_path.rb +112 -0
- data/lib/elastic_graph/schema_definition/schema_elements/field_source.rb +16 -0
- data/lib/elastic_graph/schema_definition/schema_elements/graphql_sdl_enumerator.rb +113 -0
- data/lib/elastic_graph/schema_definition/schema_elements/input_field.rb +31 -0
- data/lib/elastic_graph/schema_definition/schema_elements/input_type.rb +60 -0
- data/lib/elastic_graph/schema_definition/schema_elements/interface_type.rb +72 -0
- data/lib/elastic_graph/schema_definition/schema_elements/list_counts_state.rb +40 -0
- data/lib/elastic_graph/schema_definition/schema_elements/object_type.rb +53 -0
- data/lib/elastic_graph/schema_definition/schema_elements/relationship.rb +218 -0
- data/lib/elastic_graph/schema_definition/schema_elements/scalar_type.rb +310 -0
- data/lib/elastic_graph/schema_definition/schema_elements/sort_order_enum_value.rb +36 -0
- data/lib/elastic_graph/schema_definition/schema_elements/sub_aggregation_path.rb +66 -0
- data/lib/elastic_graph/schema_definition/schema_elements/type_namer.rb +237 -0
- data/lib/elastic_graph/schema_definition/schema_elements/type_reference.rb +353 -0
- data/lib/elastic_graph/schema_definition/schema_elements/type_with_subfields.rb +579 -0
- data/lib/elastic_graph/schema_definition/schema_elements/union_type.rb +157 -0
- data/lib/elastic_graph/schema_definition/scripting/file_system_repository.rb +77 -0
- data/lib/elastic_graph/schema_definition/scripting/script.rb +48 -0
- data/lib/elastic_graph/schema_definition/scripting/scripts/field/as_day_of_week.painless +24 -0
- data/lib/elastic_graph/schema_definition/scripting/scripts/field/as_time_of_day.painless +41 -0
- data/lib/elastic_graph/schema_definition/scripting/scripts/filter/by_time_of_day.painless +22 -0
- data/lib/elastic_graph/schema_definition/scripting/scripts/update/index_data.painless +93 -0
- data/lib/elastic_graph/schema_definition/state.rb +212 -0
- data/lib/elastic_graph/schema_definition/test_support.rb +113 -0
- metadata +513 -0
@@ -0,0 +1,318 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/schema_artifacts/runtime_metadata/index_definition"
|
10
|
+
require "elastic_graph/schema_artifacts/runtime_metadata/index_field"
|
11
|
+
require "elastic_graph/schema_definition/indexing/derived_indexed_type"
|
12
|
+
require "elastic_graph/schema_definition/indexing/list_counts_mapping"
|
13
|
+
require "elastic_graph/schema_definition/indexing/rollover_config"
|
14
|
+
require "elastic_graph/schema_definition/mixins/has_readable_to_s_and_inspect"
|
15
|
+
require "elastic_graph/schema_definition/schema_elements/field_path"
|
16
|
+
require "elastic_graph/support/hash_util"
|
17
|
+
|
18
|
+
module ElasticGraph
|
19
|
+
module SchemaDefinition
|
20
|
+
# Contains schema definition logic specific to indexing (such as JSON schema and mapping generation).
|
21
|
+
module Indexing
|
22
|
+
# Represents an index in a datastore. Defined within an indexed type. Modeled as a separate object to facilitate
|
23
|
+
# further customization of the index.
|
24
|
+
#
|
25
|
+
# @!attribute [r] name
|
26
|
+
# @return [String] name of the index
|
27
|
+
# @!attribute [r] default_sort_pairs
|
28
|
+
# @return [Array<(String, Symbol)>] (field name, direction) pairs for the default sort
|
29
|
+
# @!attribute [r] settings
|
30
|
+
# @return [Hash<(String, Object)>] datastore settings for the index
|
31
|
+
# @!attribute [r] schema_def_state
|
32
|
+
# @return [State] schema definition state
|
33
|
+
# @!attribute [r] indexed_type
|
34
|
+
# @return [SchemaElements::ObjectType, SchemaElements::InterfaceType, SchemaElements::UnionType] type backed by this index
|
35
|
+
# @!attribute [r] routing_field_path
|
36
|
+
# @return [Array<String>] path to the field used for shard routing
|
37
|
+
# @!attribute [r] rollover_config
|
38
|
+
# @return [RolloverConfig, nil] rollover configuration for the index
|
39
|
+
class Index < Struct.new(:name, :default_sort_pairs, :settings, :schema_def_state, :indexed_type, :routing_field_path, :rollover_config)
|
40
|
+
include Mixins::HasReadableToSAndInspect.new { |i| i.name }
|
41
|
+
|
42
|
+
# @param name [String] name of the index
|
43
|
+
# @param settings [Hash<(String, Object)>] datastore settings for the index
|
44
|
+
# @param schema_def_state [State] schema definition state
|
45
|
+
# @param indexed_type [SchemaElements::ObjectType, SchemaElements::InterfaceType, SchemaElements::UnionType] type backed by this index
|
46
|
+
# @yield [Index] the index, for further customization
|
47
|
+
# @api private
|
48
|
+
def initialize(name, settings, schema_def_state, indexed_type)
  # Names containing the rollover infix marker are rejected before anything else happens.
  if name.include?(ROLLOVER_INDEX_INFIX_MARKER)
    raise SchemaError, "`#{name}` is an invalid index definition name since it contains " \
      "`#{ROLLOVER_INDEX_INFIX_MARKER}` which ElasticGraph treats as special."
  end

  # Caller-provided settings are flattened (with an `index` prefix) and layered over the defaults.
  flattened_settings = Support::HashUtil.flatten_and_stringify_keys(settings, prefix: "index")
  merged_settings = DEFAULT_SETTINGS.merge(flattened_settings)

  # Struct members in order: name, default_sort_pairs, settings, schema_def_state, indexed_type,
  # routing_field_path (placeholder, assigned just below), rollover_config.
  super(name, [], merged_settings, schema_def_state, indexed_type, [], nil)

  # Elasticsearch/OpenSearch route on `id` by default:
  # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-routing-field.html
  # Recording it as the routing field lets queries that filter on id pass a `routing` parameter
  # even when the index does not use custom shard routing, which is a nice efficiency boost.
  self.routing_field_path = public_field_path("id", explanation: "indexed types must have an `id` field")

  yield self if block_given?
end
|
67
|
+
|
68
|
+
# Specifies how documents in this index should sort by default, when no `orderBy` argument is provided to the GraphQL query.
|
69
|
+
#
|
70
|
+
# @note the field name strings can be dot-separated paths to nested fields, but all referenced
|
71
|
+
# fields must exist when this is called.
|
72
|
+
#
|
73
|
+
# @param field_name_direction_pairs [Array<(String, Symbol)>] pairs of field names and `:asc` or `:desc`
|
74
|
+
# @return [void]
|
75
|
+
#
|
76
|
+
# @example Sort on `name` (ascending) with `createdAt` (descending) as a tie-breaker
|
77
|
+
# ElasticGraph.define_schema do |schema|
|
78
|
+
# schema.object_type "Campaign" do |t|
|
79
|
+
# t.field "id", "ID!"
|
80
|
+
# t.field "name", "String"
|
81
|
+
# t.field "createdAt", "DateTime"
|
82
|
+
#
|
83
|
+
# t.index "campaigns" do |i|
|
84
|
+
# i.default_sort "name", :asc, "createdAt", :desc
|
85
|
+
# end
|
86
|
+
# end
|
87
|
+
# end
|
88
|
+
def default_sort(*field_name_direction_pairs)
  # Stored as given (a flat list of alternating field names and directions).
  self[:default_sort_pairs] = field_name_direction_pairs
end
|
91
|
+
|
92
|
+
# Causes this index to "rollover" at the provided `frequency` based on the value of the provided `timestamp_field_path_name`.
|
93
|
+
# This is particularly useful for time-series data. Partitioning the data into `hourly`, `daily`, `monthly` or `yearly` buckets
|
94
|
+
# allows for different index configurations, and can be necessary when a dataset is too large to fit in one index given
|
95
|
+
# Elasticsearch/OpenSearch limitations on the number of shards in one index. In addition, ElasticGraph optimizes queries which
|
96
|
+
# filter on the timestamp field to target the subset of the indices in which matching documents could reside.
|
97
|
+
#
|
98
|
+
# @note the timestamp field specified here **must be immutable**. To understand why, consider a `:yearly` rollover
|
99
|
+
# index used for data based on `createdAt`; if ElasticGraph ingests record `123` with a createdAt of `2023-12-31T23:59:59Z`, it
|
100
|
+
# will be indexed in the `2023` index. Later if it receives an update event for record `123` with a `createdAt` of
|
101
|
+
# `2024-01-01T00:00:00Z` (a mere one second later!), ElasticGraph will store the new version of the payment in the `2024` index,
|
102
|
+
# and leave the old copy of the payment in the `2023` index unchanged. It’ll have duplicates for that document.
|
103
|
+
# @note changing the `rollover` configuration on an existing index that already has data will result in duplicate documents
|
104
|
+
#
|
105
|
+
# @param frequency [:yearly, :monthly, :daily, :hourly] how often to rollover the index
|
106
|
+
# @param timestamp_field_path_name [String] dot-separated path to the timestamp field used for rollover. Note: all referenced
|
107
|
+
# fields must exist when this is called.
|
108
|
+
# @return [void]
|
109
|
+
#
|
110
|
+
# @example Define a `campaigns` index to rollover yearly based on `createdAt`
|
111
|
+
# ElasticGraph.define_schema do |schema|
|
112
|
+
# schema.object_type "Campaign" do |t|
|
113
|
+
# t.field "id", "ID!"
|
114
|
+
# t.field "name", "String"
|
115
|
+
# t.field "createdAt", "DateTime"
|
116
|
+
#
|
117
|
+
# t.index "campaigns" do |i|
|
118
|
+
# i.rollover :yearly, "createdAt"
|
119
|
+
# end
|
120
|
+
# end
|
121
|
+
# end
|
122
|
+
def rollover(frequency, timestamp_field_path_name)
  timestamp_field_path = public_field_path(timestamp_field_path_name, explanation: "it is referenced as an index `rollover` field")
  timestamp_type = timestamp_field_path.type

  # The fully unwrapped type must be one of the (possibly renamed) Date/DateTime types.
  allowed_type_names = date_and_datetime_types
  unless allowed_type_names.include?(timestamp_type.fully_unwrapped.name)
    date_or_datetime_description = allowed_type_names.map { |type_name| "`#{type_name}`" }.join(" or ")
    raise SchemaError, "rollover field `#{timestamp_field_path.full_description}` cannot be used for rollover since it is not a #{date_or_datetime_description} field."
  end

  # A list of timestamps cannot be used for rollover.
  if timestamp_type.list?
    raise SchemaError, "rollover field `#{timestamp_field_path.full_description}` cannot be used for rollover since it is a list field."
  end

  # Every field along the path is marked non-nullable in the JSON schema.
  timestamp_field_path.path_parts.each do |path_part|
    path_part.json_schema nullable: false
  end

  self.rollover_config = RolloverConfig.new(frequency: frequency, timestamp_field_path: timestamp_field_path)
end
|
141
|
+
|
142
|
+
# Configures the index to [route documents to shards](https://www.elastic.co/guide/en/elasticsearch/reference/8.15/mapping-routing-field.html)
|
143
|
+
# based on the specified field. ElasticGraph optimizes queries that filter on the shard routing field so that they only run on a
|
144
|
+
# subset of nodes instead of all nodes. This can make a big difference in query performance if queries usually filter on a certain
|
145
|
+
# field. Using an appropriate field for shard routing is often essential for horizontal scaling, as it avoids having every query
|
146
|
+
# hit every node, allowing additional nodes to increase query throughput.
|
147
|
+
#
|
148
|
+
# @note it is essential that the shards are well-balanced. If the data’s distribution is lopsided, using this feature can make
|
149
|
+
# performance worse.
|
150
|
+
# @note the routing field specified here **must be immutable**. If ElasticGraph receives an updated version of a document with a
|
151
|
+
# different routing value, it’ll write the new version of the document to a different shard and leave the copy on the old shard
|
152
|
+
# unchanged, leading to duplicates.
|
153
|
+
# @note changing the shard routing configuration on an existing index that already has data will result in duplicate documents
|
154
|
+
#
|
155
|
+
# @param routing_field_path_name [String] dot-separated path to the field used for shard routing. Note: all referenced
|
156
|
+
# fields must exist when this is called.
|
157
|
+
# @return [void]
|
158
|
+
#
|
159
|
+
# @example Define a `campaigns` index to shard on `organizationId`
|
160
|
+
# ElasticGraph.define_schema do |schema|
|
161
|
+
# schema.object_type "Campaign" do |t|
|
162
|
+
# t.field "id", "ID!"
|
163
|
+
# t.field "name", "String"
|
164
|
+
# t.field "organizationId", "ID"
|
165
|
+
#
|
166
|
+
# t.index "campaigns" do |i|
|
167
|
+
# i.route_with "organizationId"
|
168
|
+
# end
|
169
|
+
# end
|
170
|
+
# end
|
171
|
+
def route_with(routing_field_path_name)
  resolved_path = public_field_path(routing_field_path_name, explanation: "it is referenced as an index `route_with` field")

  # Only leaf fields may be used for shard routing.
  unless resolved_path.type.leaf?
    raise SchemaError, "shard routing field `#{resolved_path.full_description}` cannot be used for routing since it is not a leaf field."
  end

  self.routing_field_path = resolved_path

  # Parent fields on the path must be non-null; the final field must additionally
  # contain at least one non-whitespace character.
  resolved_path.path_parts[0...-1].each do |parent_part|
    parent_part.json_schema nullable: false
  end
  resolved_path.last_part.json_schema nullable: false, pattern: HAS_NON_WHITE_SPACE_REGEX

  indexed_type.append_to_documentation "For more performant queries on this type, please filter on `#{routing_field_path_name}` if possible."
end
|
184
|
+
|
185
|
+
# @see #route_with
|
186
|
+
# @return [Boolean] whether or not this index uses custom shard routing
|
187
|
+
def uses_custom_routing?
  # `id` is the routing path assigned in `initialize`; any other path counts as custom routing.
  routing_path_in_index = routing_field_path.path_in_index
  routing_path_in_index != "id"
end
|
190
|
+
|
191
|
+
# @return [Hash<String, Object>] datastore configuration for this index for when it does not use rollover
|
192
|
+
def to_index_config
  index_config = {"aliases" => {}, "mappings" => mappings, "settings" => settings}
  # Entries whose value is nil are dropped.
  index_config.compact
end
|
199
|
+
|
200
|
+
# @return [Hash<String, Object>] datastore configuration for the index template that will be defined if rollover is used
|
201
|
+
def to_index_template_config
  # The pattern is the index name followed by the rollover infix marker and a wildcard.
  index_pattern = "#{name}#{ROLLOVER_INDEX_INFIX_MARKER}*"
  template = {"aliases" => {}, "mappings" => mappings, "settings" => settings}

  {"index_patterns" => [index_pattern], "template" => template}
end
|
211
|
+
|
212
|
+
# @return [SchemaArtifacts::RuntimeMetadata::IndexDefinition] runtime metadata for this index
|
213
|
+
def runtime_metadata
  route_with_path = routing_field_path.path_in_index
  rollover_metadata = rollover_config&.runtime_metadata
  current_sources = indexed_type.current_sources
  fields_by_path = indexed_type.index_field_runtime_metadata_tuples.to_h

  # `default_sort_pairs` is a flat list, so consume it two entries at a time: (field path name, direction).
  sort_fields = default_sort_pairs.each_slice(2).map do |(graphql_field_path_name, direction)|
    sort_field_path = public_field_path(graphql_field_path_name, explanation: "it is referenced as an index `default_sort` field")
    SchemaArtifacts::RuntimeMetadata::SortField.new(field_path: sort_field_path.path_in_index, direction: direction)
  end

  SchemaArtifacts::RuntimeMetadata::IndexDefinition.new(
    route_with: route_with_path,
    rollover: rollover_metadata,
    current_sources: current_sources,
    fields_by_path: fields_by_path,
    default_sort_fields: sort_fields
  )
end
|
227
|
+
|
228
|
+
private
|
229
|
+
|
230
|
+
# A regex that requires at least one non-whitespace character.
# Note: this does not use the `\S` character class because it's recommended to use a small subset
# of Regex syntax:
#
# > The regular expression syntax used is from JavaScript (ECMA 262, specifically). However, that
# > complete syntax is not widely supported, therefore it is recommended that you stick to the subset
# > of that syntax described below.
#
# (From https://json-schema.org/understanding-json-schema/reference/regular_expressions.html)
HAS_NON_WHITE_SPACE_REGEX = "[^ \t\n]+"

# Baseline datastore settings for every index. Frozen so that the shared defaults cannot be
# mutated by accident; `Hash#merge` still returns a fresh, unfrozen hash.
DEFAULT_SETTINGS = {
  "index.mapping.ignore_malformed" => false,
  "index.mapping.coerce" => false,
  "index.number_of_replicas" => 1,
  "index.number_of_shards" => 1
}.freeze
|
247
|
+
|
248
|
+
def mappings
  # `type` is invalid at the mapping root because it always has to be an object.
  type_mapping = indexed_type.to_indexing_field_type.to_mapping.except("type")
  mapping_with_list_counts = ListCountsMapping.merged_into(type_mapping, for_type: indexed_type)

  internal_properties = {
    "__sources" => {"type" => "keyword"},
    "__versions" => {
      "type" => "object",
      # __versions is a map keyed by relationship name, with values that are maps keyed by id. Since it's
      # not a static object with known fields, it needs a `dynamic` setting. As per
      # https://www.elastic.co/guide/en/elasticsearch/reference/8.7/dynamic.html#dynamic-parameters,
      # with `false`:
      #
      # > New fields are ignored. These fields will not be indexed or searchable, but will still appear in the _source
      # > field of returned hits. These fields will not be added to the mapping, and new fields must be added explicitly.
      #
      # `__versions` must be in `_source` (so that the update scripts can operate on it), but it does not
      # need to be searchable (it's just an internal data structure used for indexing).
      #
      # Note: false is intentionally a string here, because that's how the datastore echoes it back when
      # the mapping is queried (even if it was set as a boolean). The index mapping consistency checks
      # fail validation if it is set as a boolean.
      "dynamic" => "false"
    }
  }

  field_mappings = Support::HashUtil.deep_merge(mapping_with_list_counts, {"properties" => internal_properties})

  root_mapping = {"dynamic" => "strict"}.merge(field_mappings)

  # With custom shard routing, a `routing` value is required on every single index, get, delete or
  # update request; otherwise the request might be made against the wrong shard.
  root_mapping["_routing"] = {"required" => true} if uses_custom_routing?
  root_mapping["_size"] = {"enabled" => true} if schema_def_state.index_document_sizes?

  root_mapping
end
|
286
|
+
|
287
|
+
# Resolves `public_path_string` against `indexed_type` and returns the resolved path.
# Raises `SchemaError` (with a message that includes `explanation`) when it cannot be resolved.
def public_field_path(public_path_string, explanation:)
  # Passed as the block to the resolver; truthy only for parent fields that are not lists.
  non_list_parent = ->(parent_field) { !parent_field.type.list? }
  resolver = SchemaElements::FieldPath::Resolver.new

  if (resolved = resolver.resolve_public_path(indexed_type, public_path_string, &non_list_parent))
    return resolved
  end

  first_part, *nested_parts = public_path_string.split(".")
  error_msg = +"Field `#{indexed_type.name}.#{public_path_string}` cannot be resolved, but #{explanation}."

  # If it is a nested field path, the problem could be that a type has been referenced which does not exist, so mention that.
  unless nested_parts.empty?
    error_msg << " Verify that all fields and types referenced by `#{public_path_string}` are defined."
  end

  # If the first part of the path doesn't resolve, the problem could be that the field is defined after the `index` call
  # but it needs to be defined before it, so mention that.
  if resolver.resolve_public_path(indexed_type, first_part, &non_list_parent).nil?
    error_msg << " Note: the `#{indexed_type.name}.#{first_part}` definition must come before the `index` call."
  end

  raise SchemaError, error_msg
end
|
309
|
+
|
310
|
+
# Memoized names of the `Date` and `DateTime` types, as produced by the schema's type namer.
def date_and_datetime_types
  @date_and_datetime_types ||= ["Date", "DateTime"].map do |standard_name|
    schema_def_state.type_namer.name_for(standard_name)
  end
end
|
315
|
+
end
|
316
|
+
end
|
317
|
+
end
|
318
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
module ElasticGraph
|
10
|
+
module SchemaDefinition
|
11
|
+
module Indexing
|
12
|
+
# @!parse class JSONSchemaFieldMetadata; end
|
13
|
+
JSONSchemaFieldMetadata = ::Data.define(:type, :name_in_index)

# Metadata about an ElasticGraph field that needs to be stored in our versioned JSON schemas
# alongside the JSON schema fields.
#
# @!attribute [r] type
#   @return [String] name of the ElasticGraph type for this field
# @!attribute [r] name_in_index
#   @return [String] name of the field in the index
#
# @api private
class JSONSchemaFieldMetadata < ::Data
  # @return [Hash<String, String>] hash form of the metadata that can be dumped in JSON schema
  def to_dumpable_hash
    [
      ["type", type],
      ["nameInIndex", name_in_index]
    ].to_h
  end

  # @dynamic initialize, type, name_in_index
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,234 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/constants"
|
10
|
+
|
11
|
+
module ElasticGraph
|
12
|
+
module SchemaDefinition
|
13
|
+
module Indexing
|
14
|
+
# Represents the result of merging a JSON schema with metadata. The result includes both
|
15
|
+
# the merged JSON schema and sets (such as `missing_fields`) indicating which fields metadata
|
16
|
+
# could not be determined for.
|
17
|
+
#
|
18
|
+
# @private
|
19
|
+
class JSONSchemaWithMetadata < ::Data.define(
|
20
|
+
# The JSON schema.
|
21
|
+
:json_schema,
|
22
|
+
# A set of fields (in the form `Type.field`) that were needed but not found.
|
23
|
+
:missing_fields,
|
24
|
+
# A set of type names that were needed but not found.
|
25
|
+
:missing_types,
|
26
|
+
# A set of `DeprecatedElement` objects that create conflicting definitions.
|
27
|
+
:definition_conflicts,
|
28
|
+
# A set of fields that have been deleted but that must be retained (e.g. for custom shard routing or rollover)
|
29
|
+
:missing_necessary_fields
|
30
|
+
)
|
31
|
+
# Returns the value stored under `JSON_SCHEMA_VERSION_KEY` in the JSON schema.
# Raises `KeyError` if that key is absent.
def json_schema_version
  schema = json_schema
  schema.fetch(JSON_SCHEMA_VERSION_KEY)
end
|
34
|
+
|
35
|
+
# Responsible for building `JSONSchemaWithMetadata` instances.
|
36
|
+
#
|
37
|
+
# @private
|
38
|
+
class Merger
|
39
|
+
# @dynamic unused_deprecated_elements
|
40
|
+
attr_reader :unused_deprecated_elements
|
41
|
+
|
42
|
+
def initialize(schema_def_results)
  @field_metadata_by_type_and_field_name = schema_def_results.json_schema_field_metadata_by_type_and_field_name

  state = schema_def_results.state
  @renamed_types_by_old_name = state.renamed_types_by_old_name
  @deleted_types_by_old_name = state.deleted_types_by_old_name
  @renamed_fields_by_type_name_and_old_field_name = state.renamed_fields_by_type_name_and_old_field_name
  @deleted_fields_by_type_name_and_old_field_name = state.deleted_fields_by_type_name_and_old_field_name
  @state = state
  @derived_indexing_type_names = schema_def_results.derived_indexing_type_names

  # Every deprecated element starts out "unused"; elements are removed from this set as
  # they are looked up while merging metadata.
  deprecated_element_lists = [
    @renamed_types_by_old_name.values,
    @deleted_types_by_old_name.values,
    @renamed_fields_by_type_name_and_old_field_name.values.flat_map(&:values),
    @deleted_fields_by_type_name_and_old_field_name.values.flat_map(&:values)
  ]
  @unused_deprecated_elements = deprecated_element_lists.reduce(:+).to_set
end
|
58
|
+
|
59
|
+
# Returns a `JSONSchemaWithMetadata` whose JSON schema has an `ElasticGraph` entry merged into each
# property (other than `__typename`) of each `$defs` type (other than the event envelope), along
# with the problems collected while doing so.
def merge_metadata_into(json_schema)
  missing_fields = ::Set.new
  missing_types = ::Set.new
  definition_conflicts = ::Set.new
  old_type_name_by_current_name = {} # : ::Hash[String, String]

  defs_with_metadata = json_schema.fetch("$defs").to_h do |type_name, type_def|
    # The event envelope, and any definition without `properties`, pass through untouched.
    next [type_name, type_def] if type_name == EVENT_ENVELOPE_JSON_SCHEMA_NAME

    properties = type_def["properties"]
    next [type_name, type_def] unless properties

    current_type_name = determine_current_type_name(
      type_name,
      missing_types: missing_types,
      definition_conflicts: definition_conflicts
    )
    old_type_name_by_current_name[current_type_name] = type_name if current_type_name

    properties_with_metadata = properties.to_h do |field_name, prop|
      next [field_name, prop] if field_name == "__typename"

      # Without a current type name there is nothing to look up, so the metadata is nil.
      field_metadata =
        if current_type_name
          field_metadata_for(
            current_type_name,
            field_name,
            missing_fields: missing_fields,
            definition_conflicts: definition_conflicts
          )
        end

      [field_name, prop.merge({"ElasticGraph" => field_metadata&.to_dumpable_hash})]
    end

    [type_name, type_def.merge({"properties" => properties_with_metadata})]
  end

  merged_schema = json_schema.merge("$defs" => defs_with_metadata)

  JSONSchemaWithMetadata.new(
    json_schema: merged_schema,
    missing_fields: missing_fields,
    missing_types: missing_types,
    definition_conflicts: definition_conflicts,
    missing_necessary_fields: identify_missing_necessary_fields(merged_schema, old_type_name_by_current_name)
  )
end
|
110
|
+
|
111
|
+
private
|
112
|
+
|
113
|
+
# Given a historical `type_name`, determines (and returns) the current name for that type.
|
114
|
+
def determine_current_type_name(type_name, missing_types:, definition_conflicts:)
  exists_currently = @field_metadata_by_type_and_field_name.key?(type_name)

  # Any deprecated element found for this name is no longer considered unused.
  deleted = @deleted_types_by_old_name[type_name]
  @unused_deprecated_elements.delete(deleted) unless deleted.nil?
  renamed = @renamed_types_by_old_name[type_name]
  @unused_deprecated_elements.delete(renamed) unless renamed.nil?

  # More than one of (currently exists, deleted, renamed) applying is a conflict.
  applicable = [exists_currently, deleted, renamed].count { |candidate| candidate }
  definition_conflicts.merge([deleted, renamed].compact) if applicable > 1

  if exists_currently
    type_name
  elsif deleted
    nil
  elsif renamed
    renamed.name
  else
    missing_types << type_name
    nil
  end
end
|
130
|
+
|
131
|
+
# Given a historical `type_name` and `field_name` determines (and returns) the field metadata for it.
|
132
|
+
def field_metadata_for(type_name, field_name, missing_fields:, definition_conflicts:)
  full_name = "#{type_name}.#{field_name}"

  current_meta = @field_metadata_by_type_and_field_name.dig(type_name, field_name)

  # Any deprecated element found for this field is no longer considered unused.
  deleted = @deleted_fields_by_type_name_and_old_field_name.dig(type_name, field_name)
  @unused_deprecated_elements.delete(deleted) unless deleted.nil?
  renamed = @renamed_fields_by_type_name_and_old_field_name.dig(type_name, field_name)
  @unused_deprecated_elements.delete(renamed) unless renamed.nil?

  # More than one of (currently exists, deleted, renamed) applying is a conflict; the conflicting
  # elements are recorded under the fully qualified `Type.field` name.
  applicable = [current_meta, deleted, renamed].count { |candidate| candidate }
  if applicable > 1
    conflicting = [deleted, renamed].compact.map { |elem| elem.with(name: full_name) }
    definition_conflicts.merge(conflicting)
  end

  if current_meta
    current_meta
  elsif deleted
    nil
  elsif renamed
    # A renamed field uses the metadata of the field named by the rename element.
    @field_metadata_by_type_and_field_name.dig(type_name, renamed.name)
  else
    missing_fields << full_name
    nil
  end
end
|
154
|
+
|
155
|
+
def identify_missing_necessary_fields(json_schema, old_type_name_by_current_name)
  # Collects, across every index definition of every indexed (non-derived) object type, the
  # results of `identify_missing_necessary_fields_for_index_def` for the given `json_schema`.
  resolver = JSONSchemaResolver.new(@state, json_schema, old_type_name_by_current_name)
  schema_version = json_schema.fetch(JSON_SCHEMA_VERSION_KEY)

  # Skip types that are not indexed, and types listed in `@derived_indexing_type_names`.
  candidate_types = @state.object_types_by_name.values.reject do |type|
    !type.indexed? || @derived_indexing_type_names.include?(type.name)
  end

  candidate_types.flat_map do |type|
    type.indices.flat_map do |index_def|
      identify_missing_necessary_fields_for_index_def(type, index_def, resolver, schema_version)
    end
  end
end
|
169
|
+
|
170
|
+
def identify_missing_necessary_fields_for_index_def(object_type, index_def, json_schema_resolver, json_schema_version)
  # Checks the routing and rollover-timestamp field paths of `index_def` (whichever are configured)
  # and returns a `MissingNecessaryField` for each one that `json_schema_resolver` reports as
  # necessary but missing. Returns an empty array when nothing is missing.
  #
  # Note: `object_type` and `json_schema_version` are accepted but not used by the code in this method.
  necessary_paths = {
    "routing" => index_def.routing_field_path,
    "rollover" => index_def.rollover_config&.timestamp_field_path
  }

  necessary_paths.each_with_object([]) do |(field_type, field_path), missing|
    # A `nil` path means the index has no such field configured, so there is nothing to check.
    next if field_path.nil?
    next unless json_schema_resolver.necessary_path_missing?(field_path)

    missing << MissingNecessaryField.new(
      field_type: field_type,
      fully_qualified_path: field_path.fully_qualified_path_in_index
    )
  end
end
|
185
|
+
|
186
|
+
# Answers whether field paths of the current schema can be resolved against a given
# (possibly older) JSON schema, using the `ElasticGraph` metadata stored on each property.
class JSONSchemaResolver
  def initialize(state, json_schema, old_type_name_by_current_name)
    @state = state
    @old_type_name_by_current_name = old_type_name_by_current_name

    # Lazily built, per-type index of property metadata keyed by its `nameInIndex`.
    # Properties without `ElasticGraph` metadata are left out.
    @meta_by_old_type_and_name_in_index = ::Hash.new do |cache, old_type_name|
      type_definition = json_schema.fetch("$defs").fetch(old_type_name)

      cache[old_type_name] = type_definition.fetch("properties").each_with_object({}) do |(_property_name, property), by_name_in_index|
        metadata = property["ElasticGraph"]
        next unless metadata

        by_name_in_index[metadata.fetch("nameInIndex")] = metadata
      end
    end
  end

  # Indicates if the given `field_path` is (1) necessary and (2) missing from the JSON schema, indicating a problem.
  #
  # - Returns `false` if the given `field_path` is present in the JSON schema.
  # - Returns `false` if the parent type of `field_path` has not been retained in this JSON schema version
  #   (in that case, the field path is not necessary).
  # - Otherwise, returns `true` since the field path is both necessary and missing.
  def necessary_path_missing?(field_path)
    current_parent = field_path.first_part.parent_type.name

    field_path.path_parts.each do |path_part|
      part_missing = necessary_path_part_missing?(current_parent, path_part.name_in_index) do |metadata|
        # The part resolved; its (unwrapped) type becomes the parent of the next part.
        current_parent = @state.type_ref(metadata.fetch("type")).fully_unwrapped.name
      end

      return true if part_missing
    end

    false
  end

  private

  # Returns `true` when `parent_type` has an old name in this JSON schema but has no property
  # indexed as `name_in_index`. Yields the property metadata when the property is found.
  def necessary_path_part_missing?(parent_type, name_in_index)
    old_type_name = @old_type_name_by_current_name[parent_type]
    return false unless old_type_name

    metadata = @meta_by_old_type_and_name_in_index[old_type_name][name_in_index]
    return true unless metadata

    yield metadata
    false
  end
end
|
228
|
+
end
|
229
|
+
|
230
|
+
# Value object describing a necessary field path that is missing from a JSON schema.
# `field_type` is the kind of path ("routing" or "rollover", as built by
# `identify_missing_necessary_fields_for_index_def`) and `fully_qualified_path` is the
# path's fully qualified location in the index.
MissingNecessaryField = ::Data.define(:field_type, :fully_qualified_path)
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|
234
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Copyright 2024 Block, Inc.
|
2
|
+
#
|
3
|
+
# Use of this source code is governed by an MIT-style
|
4
|
+
# license that can be found in the LICENSE file or at
|
5
|
+
# https://opensource.org/licenses/MIT.
|
6
|
+
#
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require "elastic_graph/constants"
|
10
|
+
require "elastic_graph/support/hash_util"
|
11
|
+
|
12
|
+
module ElasticGraph
  module SchemaDefinition
    module Indexing
      # Filtering on the `count` of a list field requires the counts to be indexed while events are
      # ingested. This module defines the mapping of the special `__counts` field which holds those
      # list counts.
      #
      # @private
      module ListCountsMapping
        # Builds the `__counts` field mapping for the given `for_type` and returns the result of
        # deep-merging it into `mapping_hash`. When `for_type` has no list paths to count,
        # `mapping_hash` itself is returned.
        def self.merged_into(mapping_hash, for_type:)
          counts_properties = {}

          for_type.indexing_fields_by_name_in_index.each_value do |field|
            field.paths_to_lists_for_count_indexing.each do |path|
              # The `integer` type was chosen here because:
              #
              # - While datasets may hold more documents than the max integer value (~2B), individual
              #   documents are not expected to have list fields with more elements than fit in an integer.
              # - `long` would allow for much larger counts, but would take up double the storage space.
              #
              # Note that `new_list_filter_input_type` (in `schema_definition/factory.rb`) relies on this, and
              # has chosen to use `IntFilterInput` (rather than `JsonSafeLongFilterInput`) for filtering these
              # count values. If the mapping type changes here, the filter used there should be re-evaluated.
              counts_properties[path] = {"type" => "integer"}
            end
          end

          return mapping_hash if counts_properties.empty?

          counts_mapping = {
            "properties" => {
              LIST_COUNTS_FIELD => {"properties" => counts_properties}
            }
          }

          Support::HashUtil.deep_merge(mapping_hash, counts_mapping)
        end
      end
    end
  end
end
|