structurize 2.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +64 -0
- avrotize/__main__.py +6 -0
- avrotize/_version.py +34 -0
- avrotize/asn1toavro.py +160 -0
- avrotize/avrotize.py +152 -0
- avrotize/avrotocpp.py +483 -0
- avrotize/avrotocsharp.py +1075 -0
- avrotize/avrotocsv.py +121 -0
- avrotize/avrotodatapackage.py +173 -0
- avrotize/avrotodb.py +1383 -0
- avrotize/avrotogo.py +476 -0
- avrotize/avrotographql.py +197 -0
- avrotize/avrotoiceberg.py +210 -0
- avrotize/avrotojava.py +2156 -0
- avrotize/avrotojs.py +250 -0
- avrotize/avrotojsons.py +481 -0
- avrotize/avrotojstruct.py +345 -0
- avrotize/avrotokusto.py +364 -0
- avrotize/avrotomd.py +137 -0
- avrotize/avrotools.py +168 -0
- avrotize/avrotoparquet.py +208 -0
- avrotize/avrotoproto.py +359 -0
- avrotize/avrotopython.py +624 -0
- avrotize/avrotorust.py +435 -0
- avrotize/avrotots.py +598 -0
- avrotize/avrotoxsd.py +344 -0
- avrotize/cddltostructure.py +1841 -0
- avrotize/commands.json +3337 -0
- avrotize/common.py +834 -0
- avrotize/constants.py +72 -0
- avrotize/csvtoavro.py +132 -0
- avrotize/datapackagetoavro.py +76 -0
- avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
- avrotize/dependencies/typescript/node22/package.json +16 -0
- avrotize/dependency_resolver.py +348 -0
- avrotize/dependency_version.py +432 -0
- avrotize/jsonstoavro.py +2167 -0
- avrotize/jsonstostructure.py +2642 -0
- avrotize/jstructtoavro.py +878 -0
- avrotize/kstructtoavro.py +93 -0
- avrotize/kustotoavro.py +455 -0
- avrotize/parquettoavro.py +157 -0
- avrotize/proto2parser.py +498 -0
- avrotize/proto3parser.py +403 -0
- avrotize/prototoavro.py +382 -0
- avrotize/structuretocddl.py +597 -0
- avrotize/structuretocpp.py +697 -0
- avrotize/structuretocsharp.py +2295 -0
- avrotize/structuretocsv.py +365 -0
- avrotize/structuretodatapackage.py +659 -0
- avrotize/structuretodb.py +1125 -0
- avrotize/structuretogo.py +720 -0
- avrotize/structuretographql.py +502 -0
- avrotize/structuretoiceberg.py +355 -0
- avrotize/structuretojava.py +853 -0
- avrotize/structuretojsons.py +498 -0
- avrotize/structuretokusto.py +639 -0
- avrotize/structuretomd.py +322 -0
- avrotize/structuretoproto.py +764 -0
- avrotize/structuretopython.py +772 -0
- avrotize/structuretorust.py +714 -0
- avrotize/structuretots.py +653 -0
- avrotize/structuretoxsd.py +679 -0
- avrotize/xsdtoavro.py +413 -0
- structurize-2.19.0.dist-info/METADATA +107 -0
- structurize-2.19.0.dist-info/RECORD +70 -0
- structurize-2.19.0.dist-info/WHEEL +5 -0
- structurize-2.19.0.dist-info/entry_points.txt +2 -0
- structurize-2.19.0.dist-info/licenses/LICENSE +201 -0
- structurize-2.19.0.dist-info/top_level.txt +1 -0
avrotize/jsonstoavro.py
ADDED
|
@@ -0,0 +1,2167 @@
|
|
|
1
|
+
""" JSON to Avro schema converter. """
|
|
2
|
+
|
|
3
|
+
# pylint: disable=too-many-lines, line-too-long, too-many-branches, too-many-statements, too-many-locals, too-many-nested-blocks, too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-boolean-expressions
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import copy
|
|
8
|
+
import urllib
|
|
9
|
+
from urllib.parse import ParseResult, urlparse, unquote
|
|
10
|
+
from typing import Any, Dict, List, Tuple
|
|
11
|
+
import jsonpointer
|
|
12
|
+
from jsonpointer import JsonPointerException
|
|
13
|
+
import requests
|
|
14
|
+
|
|
15
|
+
from avrotize.common import avro_name, avro_namespace, find_schema_node, generic_type, set_schema_node
|
|
16
|
+
from avrotize.dependency_resolver import inline_dependencies_of, sort_messages_by_dependencies
|
|
17
|
+
|
|
18
|
+
# Avro primitive type names (per the Avro specification); used throughout the
# converter to tell primitives apart from named/complex types.
primitive_types = ['null', 'string', 'int',
                   'long', 'float', 'double', 'boolean', 'bytes']
22
|
+
class JsonToAvroConverter:
|
|
23
|
+
"""
|
|
24
|
+
Converts JSON schema to Avro schema.
|
|
25
|
+
|
|
26
|
+
Attributes:
|
|
27
|
+
imported_types: A dictionary of imported type schemas.
|
|
28
|
+
root_namespace: The namespace for the root schema.
|
|
29
|
+
max_recursion_depth: The maximum recursion depth.
|
|
30
|
+
types_with_unmerged_types: A list of types with unmerged types.
|
|
31
|
+
content_cache: A dictionary for caching fetched URLs.
|
|
32
|
+
utility_namespace: The namespace for utility types.
|
|
33
|
+
maximize_compatiblity: A flag to maximize compatibility.
|
|
34
|
+
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
def __init__(self) -> None:
|
|
38
|
+
self.imported_types: Dict[Any, Any] = {}
|
|
39
|
+
self.root_namespace = 'example.com'
|
|
40
|
+
self.max_recursion_depth = 40
|
|
41
|
+
self.types_with_unmerged_types: List[dict] = []
|
|
42
|
+
self.content_cache: Dict[str, str] = {}
|
|
43
|
+
self.utility_namespace = 'utility.vasters.com'
|
|
44
|
+
self.split_top_level_records = False
|
|
45
|
+
self.root_class_name = 'document'
|
|
46
|
+
|
|
47
|
+
def is_empty_type(self, avro_type):
|
|
48
|
+
"""
|
|
49
|
+
Check if the Avro type is an empty type.
|
|
50
|
+
|
|
51
|
+
Parameters:
|
|
52
|
+
avro_type (any): The Avro type to check.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
bool: True if the Avro type is empty, False otherwise.
|
|
56
|
+
"""
|
|
57
|
+
if len(avro_type) == 0:
|
|
58
|
+
return True
|
|
59
|
+
if isinstance(avro_type, list):
|
|
60
|
+
return all(self.is_empty_type(t) for t in avro_type)
|
|
61
|
+
if isinstance(avro_type, dict):
|
|
62
|
+
if not 'type' in avro_type:
|
|
63
|
+
return True
|
|
64
|
+
if (avro_type['type'] == 'record' and (not 'fields' in avro_type or len(avro_type['fields']) == 0)) or \
|
|
65
|
+
(avro_type['type'] == 'enum' and (not 'symbols' in avro_type or len(avro_type['symbols']) == 0)) or \
|
|
66
|
+
(avro_type['type'] == 'array' and (not 'items' in avro_type or not avro_type['items'])) or \
|
|
67
|
+
(avro_type['type'] == 'map' and (not 'values' in avro_type or not avro_type['values'])):
|
|
68
|
+
return True
|
|
69
|
+
return False
|
|
70
|
+
|
|
71
|
+
def is_empty_json_type(self, json_type):
|
|
72
|
+
"""
|
|
73
|
+
Check if the JSON type is an empty type.
|
|
74
|
+
|
|
75
|
+
Parameters:
|
|
76
|
+
json_type (any): The JSON type to check.
|
|
77
|
+
|
|
78
|
+
Returns:
|
|
79
|
+
bool: True if the JSON type is empty, False otherwise.
|
|
80
|
+
"""
|
|
81
|
+
if len(json_type) == 0:
|
|
82
|
+
return True
|
|
83
|
+
if isinstance(json_type, list):
|
|
84
|
+
return all(self.is_empty_json_type(t) for t in json_type)
|
|
85
|
+
if isinstance(json_type, dict):
|
|
86
|
+
if not 'type' in json_type:
|
|
87
|
+
return True
|
|
88
|
+
return False
|
|
89
|
+
|
|
90
|
+
def detect_discriminated_union(self, json_type: dict):
    """
    Detect the discriminated-union pattern: an allOf with if/then
    conditionals keyed off a 'type' property that has an enum.

    Parameters:
        json_type (dict): The JSON schema object to check.

    Returns:
        list | None: The discriminator enum values when the pattern is
        detected, otherwise None.
    """
    if not isinstance(json_type, dict) or 'allOf' not in json_type:
        return None

    # The discriminator must be a 'type' property with enum values.
    properties = json_type.get('properties', {})
    if 'type' not in properties:
        return None
    type_prop = properties.get('type', {})
    if 'enum' not in type_prop:
        return None
    discriminator_values = type_prop['enum']

    # At least one allOf branch must be an if/then conditional.
    for branch in json_type['allOf']:
        if isinstance(branch, dict) and 'if' in branch and 'then' in branch:
            if len(discriminator_values) > 0:
                return discriminator_values
            break

    return None
def handle_inline_conditional_schema(self, json_type: dict) -> Tuple[bool, dict]:
|
|
126
|
+
"""
|
|
127
|
+
Handle inline if/then/else conditional schemas by converting them to appropriate structures.
|
|
128
|
+
|
|
129
|
+
Supports the following patterns:
|
|
130
|
+
1. Type-based conditional: if {properties: {type: {enum: [X]}}}, then {...}, else {...}
|
|
131
|
+
- Converted to oneOf with discriminated variants
|
|
132
|
+
2. Field presence conditional: if {properties: {field: {...}}, required: [field]}
|
|
133
|
+
- Merged into comprehensive type (Avro handles optional fields naturally)
|
|
134
|
+
|
|
135
|
+
Parameters:
|
|
136
|
+
json_type (dict): The JSON schema object to process
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
Tuple[bool, dict]: (was_handled, modified_json_type)
|
|
140
|
+
"""
|
|
141
|
+
if not isinstance(json_type, dict) or 'if' not in json_type:
|
|
142
|
+
return (False, json_type)
|
|
143
|
+
|
|
144
|
+
if_clause = json_type.get('if', {})
|
|
145
|
+
then_clause = json_type.get('then', {})
|
|
146
|
+
else_clause = json_type.get('else', None)
|
|
147
|
+
|
|
148
|
+
# Check for type-based discriminator pattern
|
|
149
|
+
# if: {properties: {type: {enum: ["X"]}}}
|
|
150
|
+
if (isinstance(if_clause, dict) and
|
|
151
|
+
'properties' in if_clause and
|
|
152
|
+
'type' in if_clause['properties']):
|
|
153
|
+
|
|
154
|
+
type_prop = if_clause['properties']['type']
|
|
155
|
+
if isinstance(type_prop, dict) and 'enum' in type_prop:
|
|
156
|
+
# This is a type-based conditional - convert to oneOf
|
|
157
|
+
return self._convert_type_conditional_to_oneof(json_type, if_clause, then_clause, else_clause)
|
|
158
|
+
|
|
159
|
+
# Check for field presence pattern
|
|
160
|
+
# if: {properties: {field: {...}}, required: [field]}
|
|
161
|
+
if (isinstance(if_clause, dict) and
|
|
162
|
+
'properties' in if_clause and
|
|
163
|
+
'required' in if_clause):
|
|
164
|
+
# This is a field presence conditional - merge all branches
|
|
165
|
+
return self._merge_conditional_branches(json_type, then_clause, else_clause)
|
|
166
|
+
|
|
167
|
+
# Unsupported pattern
|
|
168
|
+
return (False, json_type)
|
|
169
|
+
|
|
170
|
+
def _convert_type_conditional_to_oneof(self, json_type: dict, if_clause: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
|
|
171
|
+
"""
|
|
172
|
+
Convert a type-based conditional schema to oneOf structure.
|
|
173
|
+
|
|
174
|
+
Example:
|
|
175
|
+
Input: {type: object, properties: {type: {enum: [image, host]}}, if: {...}, then: {...}, else: {...}}
|
|
176
|
+
Output: {oneOf: [then_merged_with_base, else_merged_with_base]}
|
|
177
|
+
"""
|
|
178
|
+
# Create a base type without the conditional parts
|
|
179
|
+
base_type = {}
|
|
180
|
+
for key, value in json_type.items():
|
|
181
|
+
if key not in ('if', 'then', 'else'):
|
|
182
|
+
base_type[key] = copy.deepcopy(value)
|
|
183
|
+
|
|
184
|
+
oneof_variants = []
|
|
185
|
+
|
|
186
|
+
# Process then clause
|
|
187
|
+
if then_clause:
|
|
188
|
+
then_variant = self._merge_conditional_branch(base_type, then_clause)
|
|
189
|
+
oneof_variants.append(then_variant)
|
|
190
|
+
|
|
191
|
+
# Process else clause (which may contain nested if/then/else)
|
|
192
|
+
if else_clause:
|
|
193
|
+
if 'if' in else_clause:
|
|
194
|
+
# Recursive handling of nested conditional
|
|
195
|
+
handled, processed_else = self.handle_inline_conditional_schema(else_clause)
|
|
196
|
+
if handled and 'oneOf' in processed_else:
|
|
197
|
+
# Flatten nested oneOf
|
|
198
|
+
for variant in processed_else['oneOf']:
|
|
199
|
+
merged = self._merge_conditional_branch(base_type, variant)
|
|
200
|
+
oneof_variants.append(merged)
|
|
201
|
+
else:
|
|
202
|
+
else_variant = self._merge_conditional_branch(base_type, else_clause)
|
|
203
|
+
oneof_variants.append(else_variant)
|
|
204
|
+
else:
|
|
205
|
+
else_variant = self._merge_conditional_branch(base_type, else_clause)
|
|
206
|
+
oneof_variants.append(else_variant)
|
|
207
|
+
|
|
208
|
+
if len(oneof_variants) > 0:
|
|
209
|
+
result = copy.deepcopy(base_type)
|
|
210
|
+
# Remove properties since they'll be in the variants
|
|
211
|
+
if 'properties' in result:
|
|
212
|
+
del result['properties']
|
|
213
|
+
if 'additionalProperties' in result:
|
|
214
|
+
del result['additionalProperties']
|
|
215
|
+
if 'required' in result:
|
|
216
|
+
del result['required']
|
|
217
|
+
result['oneOf'] = oneof_variants
|
|
218
|
+
return (True, result)
|
|
219
|
+
|
|
220
|
+
return (False, json_type)
|
|
221
|
+
|
|
222
|
+
def _merge_conditional_branches(self, json_type: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
|
|
223
|
+
"""
|
|
224
|
+
Merge conditional branches for field presence patterns.
|
|
225
|
+
Avro handles optional fields naturally, so we can merge all properties.
|
|
226
|
+
"""
|
|
227
|
+
result = {}
|
|
228
|
+
for key, value in json_type.items():
|
|
229
|
+
if key not in ('if', 'then', 'else'):
|
|
230
|
+
result[key] = copy.deepcopy(value)
|
|
231
|
+
|
|
232
|
+
# Merge properties from then clause
|
|
233
|
+
if then_clause and 'properties' in then_clause:
|
|
234
|
+
if 'properties' not in result:
|
|
235
|
+
result['properties'] = {}
|
|
236
|
+
for prop_name, prop_def in then_clause['properties'].items():
|
|
237
|
+
if prop_name not in result['properties']:
|
|
238
|
+
result['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
239
|
+
|
|
240
|
+
# Merge properties from else clause
|
|
241
|
+
if else_clause and 'properties' in else_clause:
|
|
242
|
+
if 'properties' not in result:
|
|
243
|
+
result['properties'] = {}
|
|
244
|
+
for prop_name, prop_def in else_clause['properties'].items():
|
|
245
|
+
if prop_name not in result['properties']:
|
|
246
|
+
result['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
247
|
+
|
|
248
|
+
return (True, result)
|
|
249
|
+
|
|
250
|
+
def _merge_conditional_branch(self, base: dict, branch: dict) -> dict:
|
|
251
|
+
"""Merge a conditional branch with the base type."""
|
|
252
|
+
result = copy.deepcopy(base)
|
|
253
|
+
|
|
254
|
+
if not branch:
|
|
255
|
+
return result
|
|
256
|
+
|
|
257
|
+
# Merge properties
|
|
258
|
+
if 'properties' in branch:
|
|
259
|
+
if 'properties' not in result:
|
|
260
|
+
result['properties'] = {}
|
|
261
|
+
for prop_name, prop_def in branch['properties'].items():
|
|
262
|
+
result['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
263
|
+
|
|
264
|
+
# Merge additionalProperties
|
|
265
|
+
if 'additionalProperties' in branch:
|
|
266
|
+
result['additionalProperties'] = branch['additionalProperties']
|
|
267
|
+
|
|
268
|
+
# Merge required (union of required fields)
|
|
269
|
+
if 'required' in branch:
|
|
270
|
+
if 'required' not in result:
|
|
271
|
+
result['required'] = []
|
|
272
|
+
for req in branch['required']:
|
|
273
|
+
if req not in result['required']:
|
|
274
|
+
result['required'].append(req)
|
|
275
|
+
|
|
276
|
+
return result
|
|
277
|
+
|
|
278
|
+
def flatten_union(self, type_list: list) -> list:
|
|
279
|
+
"""
|
|
280
|
+
Flatten the list of types in a union into a single list.
|
|
281
|
+
|
|
282
|
+
Args:
|
|
283
|
+
type_list (list): The list of types in a union.
|
|
284
|
+
|
|
285
|
+
Returns:
|
|
286
|
+
list: The flattened list of types.
|
|
287
|
+
|
|
288
|
+
"""
|
|
289
|
+
flat_list = []
|
|
290
|
+
for t in type_list:
|
|
291
|
+
if isinstance(t, list):
|
|
292
|
+
inner = self.flatten_union(t)
|
|
293
|
+
for u in inner:
|
|
294
|
+
if not u in flat_list:
|
|
295
|
+
flat_list.append(u)
|
|
296
|
+
elif not t in flat_list:
|
|
297
|
+
flat_list.append(t)
|
|
298
|
+
# consolidate array type instances
|
|
299
|
+
array_type = None
|
|
300
|
+
map_type = None
|
|
301
|
+
flat_list_1 = []
|
|
302
|
+
for t in flat_list:
|
|
303
|
+
if isinstance(t, dict) and 'type' in t and t['type'] == 'array' and 'items' in t:
|
|
304
|
+
if not array_type:
|
|
305
|
+
array_type = t
|
|
306
|
+
flat_list_1.append(t)
|
|
307
|
+
else:
|
|
308
|
+
array_type = self.merge_avro_schemas([array_type, t], [])
|
|
309
|
+
elif isinstance(t, dict) and 'type' in t and t['type'] == 'map' and 'values' in t:
|
|
310
|
+
if not map_type:
|
|
311
|
+
map_type = t
|
|
312
|
+
flat_list_1.append(t)
|
|
313
|
+
else:
|
|
314
|
+
map_type = self.merge_avro_schemas([map_type, t], [])
|
|
315
|
+
elif not t in flat_list_1:
|
|
316
|
+
flat_list_1.append(t)
|
|
317
|
+
return flat_list_1
|
|
318
|
+
|
|
319
|
+
# pylint: disable=dangerous-default-value
def merge_avro_schemas(self, schemas: list, avro_schemas: list, type_name: str | None = None, deps: List[str] = []) -> str | list | dict:
    """Merge multiple Avro type schemas into one.

    Args:
        schemas: Avro type schemas (type-name strings, union lists, or dicts)
            to merge, in order.
        avro_schemas: Known named schemas, used to resolve string references.
        type_name: Optional explicit name to assign to the merged type.
        deps: Accumulator for dependency names collected while merging.
            NOTE(review): this mutable default is shared across calls that
            omit the argument — the pylint disable above suggests that is
            deliberate, but verify that cross-call accumulation is intended.

    Returns:
        str | list | dict: The merged Avro schema; incompatible inputs
        produce a union (list) via split_merge.
    """

    def split_merge(schema1, schema2, schema_list, offset):
        """ return the continuing schema merges of incompatible schemas """
        # Schemas after position `offset` still need to be merged into
        # each side of the split.
        remaining_schemas = schema_list[offset +
                                        1:] if len(schema_list) > offset else []
        # Hoist 'dependencies' out of both sides into the shared accumulator.
        if isinstance(schema2, dict) and 'dependencies' in schema2:
            deps.extend(schema2['dependencies'])
            del schema2['dependencies']
        if isinstance(schema1, dict) and 'dependencies' in schema1:
            deps.extend(schema1['dependencies'])
            del schema1['dependencies']
        # Merge the remainder into each incompatible side independently.
        schema1_merged = self.merge_avro_schemas(
            [schema2] + remaining_schemas, avro_schemas, type_name, deps)
        schema2_merged = self.merge_avro_schemas(
            [schema1] + remaining_schemas, avro_schemas, type_name, deps)
        if not self.is_empty_type(schema1_merged) and not self.is_empty_type(schema2_merged):
            # Both sides carry content: the result is a union of the two.
            return self.flatten_union([schema1_merged, schema2_merged])
        else:
            if not self.is_empty_type(schema1_merged):
                return schema1_merged
            if not self.is_empty_type(schema2_merged):
                return schema2_merged
            # if both are empty, we'll return an empty record
            return {'type': 'record', 'fields': []}

    merged_schema: dict = {}
    if len(schemas) == 1:
        # Nothing to merge; hand the single schema back unchanged.
        return schemas[0]
    if type_name:
        self.set_avro_type_value(merged_schema, 'name', type_name)
    for i, schema in enumerate(schemas):
        # Work on a copy so the caller's schemas are never mutated.
        schema = copy.deepcopy(schema)
        if isinstance(schema, dict) and 'dependencies' in schema:
            deps1: List[str] = merged_schema.get('dependencies', [])
            deps1.extend(schema['dependencies'])
            merged_schema['dependencies'] = deps1
        if (isinstance(schema, list) or isinstance(schema, dict)) and len(schema) == 0:
            # Empty list/dict contributes nothing.
            continue
        if isinstance(schema, str):
            # A string is a reference to a named schema; inline it if known,
            # otherwise keep the reference as the type.
            sch = next(
                (s for s in avro_schemas if s.get('name') == schema), None)
            if sch:
                merged_schema.update(sch)
            else:
                merged_schema['type'] = schema
        elif isinstance(schema, list):
            # the incoming schema is a list, so it's a union
            if 'type' not in merged_schema:
                merged_schema['type'] = schema
            else:
                if isinstance(merged_schema['type'], list):
                    merged_schema['type'].extend(schema)
                else:
                    if isinstance(merged_schema['type'], str):
                        # Named/complex types cannot absorb a union in place;
                        # fall back to the split-merge path.
                        if merged_schema['type'] == 'record' or merged_schema['type'] == 'enum' or merged_schema['type'] == 'fixed' \
                                or merged_schema['type'] == 'map' or merged_schema['type'] == 'array':
                            return split_merge(merged_schema, schema, schemas, i)
                        else:
                            # Promote the scalar type to a union.
                            merged_schema['type'] = [merged_schema['type']]
                    else:
                        merged_schema['type'].extend(schema)
        elif schema and ('type' not in schema or 'type' not in merged_schema):
            # One side is still untyped; a plain dict merge suffices.
            merged_schema.update(schema)
        elif schema:
            if 'type' in merged_schema and schema['type'] != merged_schema['type']:
                # Incompatible kinds: split into a union.
                return split_merge(merged_schema, schema, schemas, i)
            if not type_name:
                # Derive a combined name from both sides.
                self.set_avro_type_value(merged_schema, 'name', avro_name(
                    merged_schema.get('name', '') + schema.get('name', '')))
            if 'fields' in schema:
                if 'fields' in merged_schema:
                    for field in schema['fields']:
                        # NOTE(review): membership is tested on the whole field
                        # dict, so a field with the same name but a different
                        # type is APPENDED rather than merged; the else branch
                        # (merge-by-name) only runs for exact duplicates.
                        # Verify this is the intended behavior.
                        if field not in merged_schema['fields']:
                            merged_schema['fields'].append(field)
                        else:
                            merged_schema_field = next(
                                f for f in merged_schema['fields'] if f.get('name') == field.get('name'))
                            if merged_schema_field['type'] != field['type']:
                                merged_schema_field['type'] = [
                                    field['type'], merged_schema_field['type']]
                            if 'doc' in field and 'doc' not in merged_schema_field:
                                merged_schema_field['doc'] = field['doc']
                else:
                    merged_schema['fields'] = schema['fields']
    # Arrays and maps are anonymous in Avro and must not carry a namespace.
    if self.is_avro_complex_type(merged_schema) and 'namespace' in merged_schema:
        if merged_schema['type'] in ['array', 'map']:
            del merged_schema['namespace']
    return merged_schema
def merge_json_schemas(self, json_schemas: list[dict], intersect: bool = False) -> dict:
    """
    Merge multiple JSON schemas into one.

    Args:
        json_schemas (list[dict]): A list of JSON schemas to be merged.
        intersect (bool, optional): If True, only keep the intersection of the required fields. Defaults to False.

    Returns:
        dict: The merged JSON schema.
    """

    def merge_structures(schema1: dict, schema2: dict) -> dict | list:
        """ merge two JSON dicts recursively """
        # Different declared types cannot be unified structurally; return
        # both as alternatives.
        if 'type' in schema1 and 'type' in schema2 and schema1['type'] != schema2['type']:
            return [schema1, schema2]
        # Copy so neither input is mutated by the merge.
        schema1 = copy.deepcopy(schema1)
        for key in schema2:
            if key not in schema1:
                schema1[key] = schema2[key]
            elif isinstance(schema1[key], dict) and isinstance(schema2[key], dict):
                # Recurse into nested objects.
                schema1[key] = merge_structures(schema1[key], schema2[key])
            elif isinstance(schema1[key], list) and isinstance(schema2[key], list):
                schema1[key].extend(schema2[key])
            elif schema1[key] == schema2[key]:
                # Identical values need no action.
                continue
            else:
                # Conflicting scalars: collect them into a list.
                if isinstance(schema1[key], list):
                    if schema2[key] not in schema1[key]:
                        schema1[key].append(schema2[key])
                else:
                    schema1[key] = [schema1[key], schema2[key]]
        return schema1

    merged_type: dict = {}

    for json_schema in json_schemas:
        if 'type' not in json_schema or 'type' not in merged_type:
            # Untyped side: merge key by key.
            for key in json_schema:
                if not key in merged_type:
                    merged_type[key] = copy.deepcopy(json_schema[key])
                else:
                    if key == 'required':
                        # Union of required names.
                        merged_type[key] = list(
                            set(merged_type[key]).union(set(json_schema[key])))
                    # NOTE(review): the following is `if`, not `elif`, so a
                    # 'required' key also falls through into the chain below
                    # (hitting the list/list item-append branch). Verify
                    # whether an `elif` was intended here.
                    if key == 'name' or key == 'title' or key == 'description':
                        # Identifier-ish strings are concatenated.
                        merged_type[key] = merged_type[key] + \
                            json_schema[key]
                    elif isinstance(merged_type[key], dict):
                        merged_type[key] = merge_structures(
                            merged_type[key], copy.deepcopy(json_schema[key]))
                    elif isinstance(merged_type[key], list) and isinstance(json_schema[key], list):
                        for item in json_schema[key]:
                            if item not in merged_type[key]:
                                merged_type[key].append(item)
                    else:
                        if merged_type[key] is None:
                            merged_type[key] = json_schema[key]
                        else:
                            # Conflicting scalar values become a list.
                            merged_type[key] = [merged_type[key],
                                                copy.deepcopy(json_schema[key])]
        else:
            # Both sides are typed: merge the well-known keywords explicitly.
            if 'type' in merged_type and json_schema['type'] != merged_type['type']:
                # Collect differing type names into a union.
                if isinstance(merged_type['type'], str):
                    merged_type['type'] = [merged_type['type']]
                merged_type['type'].append(json_schema['type'])
            if 'required' in json_schema:
                if 'required' in merged_type:
                    merged_type['required'] = list(
                        set(merged_type['required']).union(set(json_schema['required'])))
                else:
                    merged_type['required'] = json_schema['required']
            if 'name' in json_schema:
                if 'name' in merged_type:
                    merged_type['name'] = merged_type.get(
                        'name', '') + json_schema['name']
                else:
                    merged_type['name'] = json_schema['name']
            if 'properties' in json_schema:
                if 'properties' in merged_type:
                    for prop in json_schema['properties']:
                        if prop in merged_type['properties']:
                            # Same property on both sides: merge recursively.
                            merged_type['properties'][prop] = merge_structures(
                                merged_type['properties'][prop], copy.deepcopy(json_schema['properties'][prop]))
                        else:
                            merged_type['properties'][prop] = json_schema['properties'][prop]
                else:
                    merged_type['properties'] = json_schema['properties']
            if 'enum' in json_schema:
                if 'enum' in merged_type:
                    # Union of enum symbols.
                    merged_type['enum'] = list(
                        set(merged_type['enum']).union(set(json_schema['enum'])))
                else:
                    merged_type['enum'] = json_schema['enum']
            if 'format' in json_schema:
                if 'format' in merged_type:
                    # Formats are concatenated, matching name/title handling.
                    merged_type['format'] = merged_type['format'] + \
                        json_schema['format']
                else:
                    merged_type['format'] = json_schema['format']

    if intersect:
        # only keep the intersection of the required fields
        if 'required' in merged_type:
            new_required = merged_type['required']
            for json_schema in json_schemas:
                new_required = list(set(new_required).intersection(
                    set(json_schema.get('required', []))))
            merged_type['required'] = new_required

    return merged_type
def ensure_type(self, type: dict | str | list) -> dict | str | list:
|
|
524
|
+
"""
|
|
525
|
+
Ensures that the given type is valid by adding a 'type' field if it is missing.
|
|
526
|
+
|
|
527
|
+
Args:
|
|
528
|
+
type (dict | str | list): The type to ensure.
|
|
529
|
+
|
|
530
|
+
Returns:
|
|
531
|
+
dict | str | list: The ensured type.
|
|
532
|
+
"""
|
|
533
|
+
if isinstance(type, str) or isinstance(type, list) or 'type' in type:
|
|
534
|
+
return type
|
|
535
|
+
|
|
536
|
+
type['type'] = generic_type()
|
|
537
|
+
return type
|
|
538
|
+
|
|
539
|
+
def json_schema_primitive_to_avro_type(self, json_primitive: str | list, format: str | None, enum: list | None, record_name: str, field_name: str, namespace: str, dependencies: list) -> str | dict[str, Any] | list:
|
|
540
|
+
"""
|
|
541
|
+
Convert a JSON-schema primitive type to Avro primitive type.
|
|
542
|
+
|
|
543
|
+
Args:
|
|
544
|
+
json_primitive (str | list): The JSON-schema primitive type to be converted.
|
|
545
|
+
format (str | None): The format of the JSON primitive type, if applicable.
|
|
546
|
+
enum (list | None): The list of enum values, if applicable.
|
|
547
|
+
record_name (str): The name of the record.
|
|
548
|
+
field_name (str): The name of the field.
|
|
549
|
+
namespace (str): The namespace of the Avro type.
|
|
550
|
+
dependencies (list): The list of dependencies.
|
|
551
|
+
|
|
552
|
+
Returns:
|
|
553
|
+
str | dict[str,Any] | list: The converted Avro primitive type.
|
|
554
|
+
|
|
555
|
+
"""
|
|
556
|
+
if isinstance(json_primitive, list):
|
|
557
|
+
if enum:
|
|
558
|
+
# Handle mixed-type enums properly using the dedicated helper
|
|
559
|
+
return self.create_enum_for_mixed_types(
|
|
560
|
+
field_name + '_1',
|
|
561
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
562
|
+
enum,
|
|
563
|
+
json_primitive
|
|
564
|
+
)
|
|
565
|
+
else:
|
|
566
|
+
union = []
|
|
567
|
+
for item in json_primitive:
|
|
568
|
+
enum2 = item.get('enum') if isinstance(
|
|
569
|
+
item, dict) else None
|
|
570
|
+
format2 = item.get('format') if isinstance(
|
|
571
|
+
item, dict) else None
|
|
572
|
+
avro_primitive = self.json_schema_primitive_to_avro_type(
|
|
573
|
+
item, format2, enum2, record_name, field_name, self.compose_namespace(namespace, record_name, field_name), dependencies)
|
|
574
|
+
union.append(avro_primitive)
|
|
575
|
+
return union
|
|
576
|
+
|
|
577
|
+
if json_primitive == 'string':
|
|
578
|
+
avro_primitive = 'string'
|
|
579
|
+
elif json_primitive == 'integer':
|
|
580
|
+
avro_primitive = 'int'
|
|
581
|
+
if format == 'int64':
|
|
582
|
+
avro_primitive = 'long'
|
|
583
|
+
elif json_primitive == 'number':
|
|
584
|
+
avro_primitive = 'float'
|
|
585
|
+
elif json_primitive == 'boolean':
|
|
586
|
+
avro_primitive = 'boolean'
|
|
587
|
+
elif not format:
|
|
588
|
+
if isinstance(json_primitive, str):
|
|
589
|
+
dependencies.append(json_primitive)
|
|
590
|
+
avro_primitive = json_primitive
|
|
591
|
+
|
|
592
|
+
# if you've got { 'type': 'string', 'format': ['date-time', 'duration'] }, I'm sorry
|
|
593
|
+
if format and isinstance(format, str):
|
|
594
|
+
if format in ('date-time', 'date'):
|
|
595
|
+
avro_primitive = {'type': 'int', 'logicalType': 'date'}
|
|
596
|
+
elif format in ('time'):
|
|
597
|
+
avro_primitive = {'type': 'int', 'logicalType': 'time-millis'}
|
|
598
|
+
elif format in ('duration'):
|
|
599
|
+
avro_primitive = {'type': 'fixed',
|
|
600
|
+
'size': 12, 'logicalType': 'duration'}
|
|
601
|
+
elif format in ('uuid'):
|
|
602
|
+
avro_primitive = {'type': 'string', 'logicalType': 'uuid'}
|
|
603
|
+
|
|
604
|
+
return avro_primitive
|
|
605
|
+
|
|
606
|
+
def fetch_content(self, url: str | ParseResult):
|
|
607
|
+
"""
|
|
608
|
+
Fetches the content from the specified URL.
|
|
609
|
+
|
|
610
|
+
Args:
|
|
611
|
+
url (str or ParseResult): The URL to fetch the content from.
|
|
612
|
+
|
|
613
|
+
Returns:
|
|
614
|
+
str: The fetched content.
|
|
615
|
+
|
|
616
|
+
Raises:
|
|
617
|
+
requests.RequestException: If there is an error while making the HTTP request.
|
|
618
|
+
Exception: If there is an error while reading the file.
|
|
619
|
+
|
|
620
|
+
"""
|
|
621
|
+
# Parse the URL to determine the scheme
|
|
622
|
+
if isinstance(url, str):
|
|
623
|
+
parsed_url = urlparse(url)
|
|
624
|
+
else:
|
|
625
|
+
parsed_url = url
|
|
626
|
+
|
|
627
|
+
if parsed_url.geturl() in self.content_cache:
|
|
628
|
+
return self.content_cache[parsed_url.geturl()]
|
|
629
|
+
scheme = parsed_url.scheme
|
|
630
|
+
|
|
631
|
+
# Handle HTTP and HTTPS URLs
|
|
632
|
+
if scheme in ['http', 'https']:
|
|
633
|
+
response = requests.get(url if isinstance(
|
|
634
|
+
url, str) else parsed_url.geturl(), timeout=30)
|
|
635
|
+
# Raises an HTTPError if the response status code is 4XX/5XX
|
|
636
|
+
response.raise_for_status()
|
|
637
|
+
self.content_cache[parsed_url.geturl()] = response.text
|
|
638
|
+
return response.text
|
|
639
|
+
|
|
640
|
+
# Handle file URLs
|
|
641
|
+
elif scheme == 'file':
|
|
642
|
+
# Remove the leading 'file://' from the path for compatibility
|
|
643
|
+
file_path = parsed_url.netloc
|
|
644
|
+
if not file_path:
|
|
645
|
+
file_path = parsed_url.path
|
|
646
|
+
# On Windows, a file URL might start with a '/' but it's not part of the actual path
|
|
647
|
+
if os.name == 'nt' and file_path.startswith('/'):
|
|
648
|
+
file_path = file_path[1:]
|
|
649
|
+
with open(file_path, 'r', encoding='utf-8') as file:
|
|
650
|
+
text = file.read()
|
|
651
|
+
self.content_cache[parsed_url.geturl()] = text
|
|
652
|
+
return text
|
|
653
|
+
else:
|
|
654
|
+
raise NotImplementedError(f'Unsupported URL scheme: {scheme}')
|
|
655
|
+
|
|
656
|
+
def resolve_reference(self, json_type: dict, base_uri: str, json_doc: dict) -> Tuple[dict, dict]:
    """
    Resolve a JSON Pointer reference or a JSON $ref reference.

    Args:
        json_type (dict): The JSON type containing the reference.
        base_uri (str): The base URI of the JSON document.
        json_doc (dict): The JSON document containing the reference.

    Returns:
        Tuple[dict, dict]: A tuple containing the resolved JSON schema and the
            JSON schema document it was resolved against.

    Raises:
        Exception: If there is an error decoding JSON from the reference.
        Exception: If there is an error resolving the JSON Pointer reference.
    """
    try:
        ref = json_type['$ref']
        content = None
        url = urlparse(ref)
        if url.scheme:
            # Absolute URL: fetch the referenced document directly.
            content = self.fetch_content(ref)
        elif url.path:
            # Relative path: resolve against the base URI first.
            file_uri = self.compose_uri(base_uri, url)
            content = self.fetch_content(file_uri)
        if content:
            try:
                json_schema_doc = json_schema = json.loads(content)
                # resolve the JSON Pointer reference, if any.
                # Fix: unquote the fragment for consistency with the
                # in-document branch below (fragments may be percent-encoded).
                if url.fragment:
                    json_schema = jsonpointer.resolve_pointer(
                        json_schema, unquote(url.fragment))
                return json_schema, json_schema_doc
            except json.JSONDecodeError as decode_error:
                # Fix: chain the original decode error so the traceback is
                # not lost.
                raise Exception(
                    f'Error decoding JSON from {ref}') from decode_error

        if url.fragment:
            # In-document reference: resolve the pointer against json_doc.
            json_pointer = unquote(url.fragment)
            ref_schema = jsonpointer.resolve_pointer(
                json_doc, json_pointer)
            if ref_schema:
                return ref_schema, json_doc
    except JsonPointerException as e:
        # Fix: explicitly chain the pointer error for easier debugging
        # (previously 'e' was bound but unused).
        raise Exception(
            f'Error resolving JSON Pointer reference for {base_uri}') from e
    # No resolvable reference: hand back the inputs unchanged.
    return json_type, json_doc
def compose_uri(self, base_uri, url):
    """
    Combine a base URI with a (possibly relative) URL reference.

    Args:
        base_uri (str): The URI of the referencing document.
        url (str or ParseResult): The reference to resolve against base_uri.

    Returns:
        str: The absolute URI for the reference.
    """
    if isinstance(url, str):
        url = urlparse(url)
    if url.scheme:
        # The reference is already absolute.
        return url.geturl()
    if not url.path and not url.netloc:
        # Fragment-only (or empty) reference: stay on the base document.
        return base_uri
    if base_uri.startswith('file'):
        parsed_file_uri = urlparse(base_uri)
        # Renamed from 'dir' to avoid shadowing the builtin.
        base_dir = os.path.dirname(
            parsed_file_uri.netloc if parsed_file_uri.netloc else parsed_file_uri.path)
        filename = os.path.join(base_dir, url.path)
        # Fix: interpolate the computed filename (the previous f-string was a
        # garbled literal and left 'filename' unused).
        file_uri = f'file://{filename}'
    else:
        # combine the base URI with the URL
        file_uri = urllib.parse.urljoin(base_uri, url.geturl())
    return file_uri
def get_field_type_name(self, field: dict) -> str:
    """Return a human-readable name for the declared type of an Avro field."""
    declared = field['type']
    if isinstance(declared, str):
        # A plain string is already the type name.
        return declared
    if isinstance(declared, list):
        # Union: join the name of every member type.
        def member_name(member):
            if isinstance(member, str):
                return member
            if isinstance(member, dict):
                return self.get_field_type_name(member)
            return 'union'
        return ', '.join(member_name(m) for m in declared)
    if isinstance(declared, dict) and 'type' in declared:
        # Inline complex type: report its own 'type' attribute.
        return declared['type']
    return 'union'
def json_type_to_avro_type(self, json_type: str | dict, record_name: str, field_name: str, namespace: str, dependencies: list, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth=1) -> dict | list | str:
|
|
740
|
+
"""Convert a JSON type to Avro type."""
|
|
741
|
+
|
|
742
|
+
try:
|
|
743
|
+
if recursion_depth >= self.max_recursion_depth:
|
|
744
|
+
print(
|
|
745
|
+
f'WARNING: Maximum recursion depth reached for {record_name} at field {field_name}')
|
|
746
|
+
return generic_type()
|
|
747
|
+
|
|
748
|
+
avro_type: list | dict | str = {}
|
|
749
|
+
local_name = avro_name(field_name if field_name else record_name)
|
|
750
|
+
hasAnyOf = isinstance(json_type, dict) and 'anyOf' in json_type
|
|
751
|
+
|
|
752
|
+
if isinstance(json_type, dict):
|
|
753
|
+
|
|
754
|
+
json_object_type = json_type.get('type')
|
|
755
|
+
# Check if the type is already an Avro schema (e.g., shared discriminator enum)
|
|
756
|
+
# This happens when a discriminated union property was pre-set with an Avro type
|
|
757
|
+
if isinstance(json_object_type, dict) and 'type' in json_object_type and json_object_type.get('type') in ['enum', 'record', 'fixed', 'array', 'map']:
|
|
758
|
+
return self.post_check_avro_type(dependencies, json_object_type)
|
|
759
|
+
if isinstance(json_object_type, list):
|
|
760
|
+
# if the 'type' is a list, we map it back to a string
|
|
761
|
+
# if the list has only one item or if the list has two items
|
|
762
|
+
# and one of them is 'null'
|
|
763
|
+
# otherwise, we will construct and inject a oneOf type
|
|
764
|
+
# and split the type
|
|
765
|
+
|
|
766
|
+
# Special case: if we have a mixed-type enum (e.g., type: ["string", "integer"] with enum),
|
|
767
|
+
# handle it directly here to avoid duplicate processing
|
|
768
|
+
if 'enum' in json_type and any(t in json_object_type for t in ['string', 'integer', 'int']):
|
|
769
|
+
has_null = 'null' in json_object_type
|
|
770
|
+
avro_type = self.create_enum_for_mixed_types(
|
|
771
|
+
local_name + '_1',
|
|
772
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
773
|
+
json_type['enum'],
|
|
774
|
+
json_object_type
|
|
775
|
+
)
|
|
776
|
+
if 'description' in json_type and isinstance(avro_type, dict):
|
|
777
|
+
avro_type['doc'] = json_type['description']
|
|
778
|
+
elif 'description' in json_type and isinstance(avro_type, list):
|
|
779
|
+
# For unions, we can't set doc directly - it will be set on the field
|
|
780
|
+
pass
|
|
781
|
+
return self.post_check_avro_type(dependencies, avro_type)
|
|
782
|
+
|
|
783
|
+
if len(json_object_type) == 1:
|
|
784
|
+
json_object_type = json_object_type[0]
|
|
785
|
+
elif len(json_object_type) == 2 and 'null' in json_object_type:
|
|
786
|
+
if json_object_type[0] == 'null':
|
|
787
|
+
json_object_type = json_object_type[1]
|
|
788
|
+
else:
|
|
789
|
+
json_object_type = json_object_type[0]
|
|
790
|
+
else:
|
|
791
|
+
oneof = []
|
|
792
|
+
for option in json_object_type:
|
|
793
|
+
if not option == 'null':
|
|
794
|
+
oneof.append({
|
|
795
|
+
'type': option
|
|
796
|
+
})
|
|
797
|
+
if len(oneof) > 0:
|
|
798
|
+
del json_type['type']
|
|
799
|
+
json_type['oneOf'] = oneof
|
|
800
|
+
|
|
801
|
+
if 'if' in json_type or 'then' in json_type or 'else' in json_type or 'dependentSchemas' in json_type or 'dependentRequired' in json_type:
|
|
802
|
+
# Try to handle the conditional schema pattern
|
|
803
|
+
conditional_handled = False
|
|
804
|
+
if 'if' in json_type:
|
|
805
|
+
conditional_handled, json_type = self.handle_inline_conditional_schema(json_type)
|
|
806
|
+
|
|
807
|
+
if not conditional_handled:
|
|
808
|
+
# Only warn for patterns we can't handle
|
|
809
|
+
remaining_conditionals = []
|
|
810
|
+
if 'if' in json_type:
|
|
811
|
+
remaining_conditionals.append('if/then/else')
|
|
812
|
+
if 'dependentSchemas' in json_type:
|
|
813
|
+
remaining_conditionals.append('dependentSchemas')
|
|
814
|
+
if 'dependentRequired' in json_type:
|
|
815
|
+
remaining_conditionals.append('dependentRequired')
|
|
816
|
+
|
|
817
|
+
if remaining_conditionals:
|
|
818
|
+
print(
|
|
819
|
+
f'WARNING: Conditional schema pattern ({", ".join(remaining_conditionals)}) is not fully supported and will be simplified.')
|
|
820
|
+
|
|
821
|
+
if 'if' in json_type:
|
|
822
|
+
del json_type['if']
|
|
823
|
+
if 'then' in json_type:
|
|
824
|
+
del json_type['then']
|
|
825
|
+
if 'else' in json_type:
|
|
826
|
+
del json_type['else']
|
|
827
|
+
if 'dependentSchemas' in json_type:
|
|
828
|
+
del json_type['dependentSchemas']
|
|
829
|
+
if 'dependentRequired' in json_type:
|
|
830
|
+
del json_type['dependentRequired']
|
|
831
|
+
|
|
832
|
+
base_type = json_type.copy()
|
|
833
|
+
if 'oneOf' in base_type:
|
|
834
|
+
del base_type['oneOf']
|
|
835
|
+
if 'anyOf' in base_type:
|
|
836
|
+
del base_type['anyOf']
|
|
837
|
+
if 'allOf' in base_type:
|
|
838
|
+
del base_type['allOf']
|
|
839
|
+
json_types = []
|
|
840
|
+
|
|
841
|
+
if 'allOf' in json_type:
|
|
842
|
+
# Check if this is a discriminated union pattern
|
|
843
|
+
discriminated_union_types = self.detect_discriminated_union(json_type)
|
|
844
|
+
|
|
845
|
+
if discriminated_union_types:
|
|
846
|
+
# Generate separate types for each discriminated variant
|
|
847
|
+
base_props = json_type.get('properties', {})
|
|
848
|
+
discriminator_field = 'type' # The discriminator field
|
|
849
|
+
discriminator_enum = base_props.get(discriminator_field, {}).get('enum', [])
|
|
850
|
+
|
|
851
|
+
# Create a shared enum type for the discriminator field that all variants will reference
|
|
852
|
+
shared_discriminator_enum = None
|
|
853
|
+
if discriminator_enum:
|
|
854
|
+
shared_discriminator_enum = self.create_enum_type(
|
|
855
|
+
discriminator_field,
|
|
856
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
857
|
+
discriminator_enum
|
|
858
|
+
)
|
|
859
|
+
|
|
860
|
+
for allof_item in json_type['allOf']:
|
|
861
|
+
if not (isinstance(allof_item, dict) and 'if' in allof_item and 'then' in allof_item):
|
|
862
|
+
continue
|
|
863
|
+
|
|
864
|
+
# Extract the discriminator value from the if clause
|
|
865
|
+
if_clause = allof_item['if']
|
|
866
|
+
discriminator_value = None
|
|
867
|
+
if (isinstance(if_clause, dict) and
|
|
868
|
+
'properties' in if_clause and
|
|
869
|
+
discriminator_field in if_clause['properties']):
|
|
870
|
+
disc_prop = if_clause['properties'][discriminator_field]
|
|
871
|
+
if 'enum' in disc_prop and len(disc_prop['enum']) > 0:
|
|
872
|
+
discriminator_value = disc_prop['enum'][0]
|
|
873
|
+
|
|
874
|
+
if not discriminator_value:
|
|
875
|
+
continue
|
|
876
|
+
|
|
877
|
+
# Resolve the then clause reference
|
|
878
|
+
then_clause = allof_item['then']
|
|
879
|
+
if isinstance(then_clause, dict) and '$ref' in then_clause:
|
|
880
|
+
resolved_type, _ = self.resolve_reference(then_clause, base_uri, json_schema)
|
|
881
|
+
|
|
882
|
+
# Create a new type combining base properties and resolved type
|
|
883
|
+
variant_type = copy.deepcopy(resolved_type)
|
|
884
|
+
|
|
885
|
+
# Set the variant type name to the discriminator value
|
|
886
|
+
variant_type['title'] = discriminator_value
|
|
887
|
+
|
|
888
|
+
# Preserve description from base type if variant doesn't have one
|
|
889
|
+
if 'description' not in variant_type and 'description' in base_type:
|
|
890
|
+
variant_type['description'] = base_type['description']
|
|
891
|
+
|
|
892
|
+
# Merge base properties into the variant
|
|
893
|
+
if 'properties' not in variant_type:
|
|
894
|
+
variant_type['properties'] = {}
|
|
895
|
+
|
|
896
|
+
for prop_name, prop_def in base_props.items():
|
|
897
|
+
if prop_name not in variant_type['properties']:
|
|
898
|
+
# For non-discriminator fields, copy the property definition
|
|
899
|
+
if prop_name != discriminator_field:
|
|
900
|
+
variant_type['properties'][prop_name] = copy.deepcopy(prop_def)
|
|
901
|
+
|
|
902
|
+
# Set discriminator field to reference the shared enum type
|
|
903
|
+
if shared_discriminator_enum:
|
|
904
|
+
variant_type['properties'][discriminator_field] = {
|
|
905
|
+
'type': shared_discriminator_enum,
|
|
906
|
+
'default': discriminator_value,
|
|
907
|
+
'const': discriminator_value,
|
|
908
|
+
'discriminator': True
|
|
909
|
+
}
|
|
910
|
+
else:
|
|
911
|
+
# Fallback if no enum was found
|
|
912
|
+
variant_type['properties'][discriminator_field] = {
|
|
913
|
+
'type': 'string',
|
|
914
|
+
'default': discriminator_value,
|
|
915
|
+
'const': discriminator_value,
|
|
916
|
+
'discriminator': True
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
# Add union annotation to indicate this is part of a discriminated union
|
|
920
|
+
variant_type['union'] = record_name
|
|
921
|
+
|
|
922
|
+
json_types.append(variant_type)
|
|
923
|
+
else:
|
|
924
|
+
# Original allOf merging logic for non-discriminated unions
|
|
925
|
+
type_list = [copy.deepcopy(base_type)]
|
|
926
|
+
for allof_option in json_type['allOf']:
|
|
927
|
+
while isinstance(allof_option, dict) and '$ref' in allof_option:
|
|
928
|
+
resolved_json_type, resolved_schema = self.resolve_reference(
|
|
929
|
+
allof_option, base_uri, json_schema)
|
|
930
|
+
del allof_option['$ref']
|
|
931
|
+
allof_option = self.merge_json_schemas(
|
|
932
|
+
[allof_option, resolved_json_type])
|
|
933
|
+
type_list.append(copy.deepcopy(allof_option))
|
|
934
|
+
merged_type = self.merge_json_schemas(
|
|
935
|
+
type_list, intersect=False)
|
|
936
|
+
json_types.append(merged_type)
|
|
937
|
+
|
|
938
|
+
if 'oneOf' in json_type:
|
|
939
|
+
# if the json type is a oneOf, we create a type union of all types
|
|
940
|
+
if len(json_types) == 0:
|
|
941
|
+
type_to_process = copy.deepcopy(base_type)
|
|
942
|
+
else:
|
|
943
|
+
type_to_process = copy.deepcopy(json_types.pop())
|
|
944
|
+
json_types = []
|
|
945
|
+
oneof = json_type['oneOf']
|
|
946
|
+
if len(json_types) == 0:
|
|
947
|
+
for oneof_option in oneof:
|
|
948
|
+
if isinstance(oneof_option, dict) and 'type' in oneof_option and 'type' in type_to_process and not type_to_process.get('type') == oneof_option.get('type'):
|
|
949
|
+
# we can't merge these due to conflicting types, so we pass the option-type on as-is
|
|
950
|
+
json_types.append(oneof_option)
|
|
951
|
+
else:
|
|
952
|
+
json_types.append(self.merge_json_schemas(
|
|
953
|
+
[type_to_process, oneof_option], intersect=True))
|
|
954
|
+
else:
|
|
955
|
+
new_json_types = []
|
|
956
|
+
for oneof_option in oneof:
|
|
957
|
+
for json_type_option in json_types:
|
|
958
|
+
json_type_option = self.merge_json_schemas(
|
|
959
|
+
[json_type_option, oneof_option], intersect=True)
|
|
960
|
+
new_json_types.append(json_type_option)
|
|
961
|
+
json_types = new_json_types
|
|
962
|
+
|
|
963
|
+
if 'anyOf' in json_type:
|
|
964
|
+
types_to_process = json_types.copy() if len(json_types) > 0 else [
|
|
965
|
+
copy.deepcopy(base_type)]
|
|
966
|
+
json_types = []
|
|
967
|
+
for type_to_process in types_to_process:
|
|
968
|
+
type_list = [copy.deepcopy(type_to_process)]
|
|
969
|
+
# anyOf is a list of types where any number from 1 to all
|
|
970
|
+
# may match the data. Trouble with anyOf is that it doesn't
|
|
971
|
+
# really have a semantic interpretation in the context of Avro.
|
|
972
|
+
for anyof_option in json_type['anyOf']:
|
|
973
|
+
if isinstance(anyof_option, dict) and '$ref' in anyof_option:
|
|
974
|
+
# if we have a ref, we can't merge into the base type, so we pass it on as-is.
|
|
975
|
+
# into the JSON type list
|
|
976
|
+
json_types.append(copy.deepcopy(anyof_option))
|
|
977
|
+
else:
|
|
978
|
+
type_list.append(copy.deepcopy(anyof_option))
|
|
979
|
+
merged_type = self.merge_json_schemas(
|
|
980
|
+
type_list, intersect=False)
|
|
981
|
+
json_types.append(merged_type)
|
|
982
|
+
|
|
983
|
+
if len(json_types) > 0:
|
|
984
|
+
if len(json_types) == 1:
|
|
985
|
+
avro_type = self.json_type_to_avro_type(
|
|
986
|
+
json_types[0], record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
987
|
+
if isinstance(avro_type, dict) and self.is_empty_type(avro_type) and not 'allOf' in json_type:
|
|
988
|
+
avro_type['type'] = generic_type()
|
|
989
|
+
avro_type = self.post_check_avro_type(
|
|
990
|
+
dependencies, avro_type)
|
|
991
|
+
return avro_type
|
|
992
|
+
else:
|
|
993
|
+
try:
|
|
994
|
+
record_stack.append(
|
|
995
|
+
field_name if field_name else record_name)
|
|
996
|
+
subtypes = []
|
|
997
|
+
count = 1
|
|
998
|
+
type_deps: List[str] = []
|
|
999
|
+
for json_type_option in json_types:
|
|
1000
|
+
if isinstance(json_type_option, dict) and '$ref' in json_type_option:
|
|
1001
|
+
ref = json_type_option['$ref']
|
|
1002
|
+
if ref in self.imported_types:
|
|
1003
|
+
avro_subtype = self.imported_types[ref]
|
|
1004
|
+
subtypes.append(avro_subtype)
|
|
1005
|
+
type_deps.append(avro_subtype)
|
|
1006
|
+
continue
|
|
1007
|
+
|
|
1008
|
+
subtype_deps: List[str] = []
|
|
1009
|
+
# Use title from discriminated union if available, otherwise generate numbered name
|
|
1010
|
+
if isinstance(json_type_option, dict) and 'title' in json_type_option:
|
|
1011
|
+
sub_field_name = avro_name(json_type_option['title'])
|
|
1012
|
+
elif not isinstance(json_type_option, dict) or not '$ref' in json_type_option:
|
|
1013
|
+
sub_field_name = avro_name(local_name + '_' + str(count))
|
|
1014
|
+
else:
|
|
1015
|
+
sub_field_name = None
|
|
1016
|
+
avro_subtype = self.json_type_to_avro_type(
|
|
1017
|
+
json_type_option, record_name, sub_field_name, namespace, subtype_deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1018
|
+
if not avro_subtype:
|
|
1019
|
+
continue
|
|
1020
|
+
if isinstance(avro_subtype, dict) and 'name' in avro_subtype and 'type' in avro_subtype and (avro_subtype['type'] == 'record' or avro_subtype['type'] == 'enum'):
|
|
1021
|
+
# we have a standalone record or enum so we need to add it to the schema at the top-level
|
|
1022
|
+
# and reference it as a dependency from the parent type if it's not already been added.
|
|
1023
|
+
existing_type = next((t for t in avro_schema if t.get('name') == avro_subtype['name'] and t.get(
|
|
1024
|
+
'namespace') == avro_subtype.get('namespace')), None)
|
|
1025
|
+
if not existing_type:
|
|
1026
|
+
if subtype_deps:
|
|
1027
|
+
if not 'dependencies' in avro_subtype:
|
|
1028
|
+
avro_subtype['dependencies'] = subtype_deps
|
|
1029
|
+
else:
|
|
1030
|
+
avro_subtype['dependencies'].extend(
|
|
1031
|
+
subtype_deps)
|
|
1032
|
+
if self.is_empty_type(avro_subtype):
|
|
1033
|
+
print(
|
|
1034
|
+
f'WARN: Standalone type {avro_subtype["name"]} is empty')
|
|
1035
|
+
if avro_subtype['type'] != 'enum' and avro_subtype['type'] != 'record' and avro_subtype['type'] != 'fixed':
|
|
1036
|
+
raise ValueError(
|
|
1037
|
+
f'WARN: Standalone type {avro_subtype["name"]} is not a record or enum or fixed type')
|
|
1038
|
+
avro_schema.append(avro_subtype)
|
|
1039
|
+
full_name = self.get_qualified_name(
|
|
1040
|
+
avro_subtype)
|
|
1041
|
+
subtype_deps = [full_name]
|
|
1042
|
+
avro_subtype = full_name
|
|
1043
|
+
if isinstance(avro_subtype, dict) and 'dependencies' in avro_subtype:
|
|
1044
|
+
subtype_deps.extend(
|
|
1045
|
+
avro_subtype['dependencies'])
|
|
1046
|
+
del avro_subtype['dependencies']
|
|
1047
|
+
if len(subtype_deps) > 0:
|
|
1048
|
+
type_deps.extend(subtype_deps)
|
|
1049
|
+
if not self.is_empty_type(avro_subtype):
|
|
1050
|
+
if isinstance(avro_subtype, list):
|
|
1051
|
+
subtypes.extend(
|
|
1052
|
+
copy.deepcopy(avro_subtype))
|
|
1053
|
+
else:
|
|
1054
|
+
subtypes.append(
|
|
1055
|
+
copy.deepcopy(avro_subtype))
|
|
1056
|
+
count += 1
|
|
1057
|
+
if len(type_deps) > 0:
|
|
1058
|
+
dependencies.extend(type_deps)
|
|
1059
|
+
if len(subtypes) == 1:
|
|
1060
|
+
return self.post_check_avro_type(dependencies, subtypes[0])
|
|
1061
|
+
finally:
|
|
1062
|
+
record_stack.pop()
|
|
1063
|
+
|
|
1064
|
+
if hasAnyOf:
|
|
1065
|
+
# if all subtypes are strings, they are either primitive types or type references
|
|
1066
|
+
# which means there's nothing to merge, so we'll return the list of types
|
|
1067
|
+
if all([isinstance(st, str) for st in subtypes]):
|
|
1068
|
+
return self.post_check_avro_type(dependencies, subtypes)
|
|
1069
|
+
|
|
1070
|
+
# we now has a list of types that may match the data, but this would be
|
|
1071
|
+
# an Avro union which is mutually exclusive. We will merge this list
|
|
1072
|
+
# into a record type in postprocessing when all types are available
|
|
1073
|
+
if not isinstance(avro_type, dict):
|
|
1074
|
+
avro_type = {}
|
|
1075
|
+
avro_type['unmerged_types'] = subtypes
|
|
1076
|
+
avro_type['type'] = 'record'
|
|
1077
|
+
avro_type['name'] = avro_name(local_name)
|
|
1078
|
+
if local_name != avro_name(local_name):
|
|
1079
|
+
avro_type['altnames'] = { 'json': local_name }
|
|
1080
|
+
avro_type['namespace'] = namespace
|
|
1081
|
+
avro_type['fields'] = []
|
|
1082
|
+
if 'description' in json_type:
|
|
1083
|
+
avro_type['doc'] = json_type['description']
|
|
1084
|
+
json_type = {}
|
|
1085
|
+
else:
|
|
1086
|
+
return self.post_check_avro_type(dependencies, subtypes)
|
|
1087
|
+
|
|
1088
|
+
if 'properties' in json_type and not 'type' in json_type:
|
|
1089
|
+
json_type['type'] = 'object'
|
|
1090
|
+
|
|
1091
|
+
if 'description' in json_type and isinstance(avro_type, dict):
|
|
1092
|
+
avro_type['doc'] = json_type['description']
|
|
1093
|
+
|
|
1094
|
+
if 'title' in json_type and isinstance(avro_type, dict):
|
|
1095
|
+
self.set_avro_type_value(
|
|
1096
|
+
avro_type, 'name', avro_name(json_type['title']))
|
|
1097
|
+
|
|
1098
|
+
# first, pull in any referenced definitions and merge with this schema
|
|
1099
|
+
if '$ref' in json_type:
|
|
1100
|
+
# the $ref can indeed be a list as a result from a prior allOf/anyOf merge
|
|
1101
|
+
# if that is so, we will copy the type and process each $ref separately
|
|
1102
|
+
# and return the result as a list of types
|
|
1103
|
+
if isinstance(json_type['$ref'], list):
|
|
1104
|
+
types = []
|
|
1105
|
+
for ref in json_type['$ref']:
|
|
1106
|
+
json_type_copy = copy.deepcopy(json_type)
|
|
1107
|
+
json_type_copy['$ref'] = ref
|
|
1108
|
+
types.append(self.json_type_to_avro_type(json_type_copy, record_name, field_name, namespace,
|
|
1109
|
+
dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
|
|
1110
|
+
return self.post_check_avro_type(dependencies, types)
|
|
1111
|
+
|
|
1112
|
+
ref = json_type['$ref']
|
|
1113
|
+
if ref in self.imported_types:
|
|
1114
|
+
# reference was already resolved, so we can resolve the reference simply by returning the type
|
|
1115
|
+
type_ref = copy.deepcopy(self.imported_types[ref])
|
|
1116
|
+
if isinstance(type_ref, str):
|
|
1117
|
+
dependencies.append(type_ref)
|
|
1118
|
+
return self.post_check_avro_type(dependencies, type_ref)
|
|
1119
|
+
else:
|
|
1120
|
+
new_base_uri = self.compose_uri(
|
|
1121
|
+
base_uri, json_type['$ref'])
|
|
1122
|
+
resolved_json_type, resolved_schema = self.resolve_reference(
|
|
1123
|
+
json_type, base_uri, json_schema)
|
|
1124
|
+
if self.is_empty_json_type(json_type):
|
|
1125
|
+
# it's a standalone reference, so will import the type into the schema
|
|
1126
|
+
# and reference it like it was in the same file
|
|
1127
|
+
type_name = record_name
|
|
1128
|
+
type_namespace = namespace
|
|
1129
|
+
parsed_ref = urlparse(ref)
|
|
1130
|
+
if parsed_ref.fragment:
|
|
1131
|
+
type_name = avro_name(
|
|
1132
|
+
parsed_ref.fragment.split('/')[-1])
|
|
1133
|
+
sub_namespace = self.compose_namespace(
|
|
1134
|
+
*parsed_ref.fragment.split('/')[2:-1])
|
|
1135
|
+
type_namespace = self.compose_namespace(
|
|
1136
|
+
self.root_namespace, sub_namespace)
|
|
1137
|
+
|
|
1138
|
+
# registering in imported_types ahead of resolving to prevent circular references.
|
|
1139
|
+
# we only cache the type if it's forseeable that it is usable as a standalone type
|
|
1140
|
+
# which means that it must be either a record or an enum or a fixed type when converted
|
|
1141
|
+
# to Avro. That means we look for the presence of 'type', 'properties', 'allOf', 'anyOf',
|
|
1142
|
+
# and 'enum' in the resolved type.
|
|
1143
|
+
if resolved_json_type and (('type' in resolved_json_type and resolved_json_type['type'] == 'object') or 'properties' in resolved_json_type or 'enum' in resolved_json_type or
|
|
1144
|
+
'allOf' in resolved_json_type or 'anyOf' in resolved_json_type):
|
|
1145
|
+
self.imported_types[ref] = self.compose_namespace(
|
|
1146
|
+
type_namespace, type_name)
|
|
1147
|
+
# resolve type
|
|
1148
|
+
deps: List[str] = []
|
|
1149
|
+
resolved_avro_type: dict | list | str | None = self.json_type_to_avro_type(
|
|
1150
|
+
resolved_json_type, type_name, '', type_namespace, deps, resolved_schema, new_base_uri, avro_schema, [], recursion_depth + 1)
|
|
1151
|
+
if isinstance(resolved_avro_type, str):
|
|
1152
|
+
dependencies.extend(deps)
|
|
1153
|
+
return self.post_check_avro_type(dependencies, resolved_avro_type)
|
|
1154
|
+
if isinstance(resolved_avro_type, list) or (not isinstance(resolved_avro_type, dict) or (not resolved_avro_type.get('type') == 'record' and not resolved_avro_type.get('type') == 'enum')):
|
|
1155
|
+
if isinstance(resolved_avro_type, dict) and not 'type' in resolved_avro_type:
|
|
1156
|
+
if isinstance(avro_type, dict):
|
|
1157
|
+
# the resolved type didn't have a type and avro_type is a dict,
|
|
1158
|
+
# so we assume it's a mixin into the type we found
|
|
1159
|
+
avro_type.update(resolved_avro_type)
|
|
1160
|
+
resolved_avro_type = None
|
|
1161
|
+
else:
|
|
1162
|
+
# no 'type' definition for this field and we can't mix into the avro type,
|
|
1163
|
+
# so we fallback to a generic type
|
|
1164
|
+
print(
|
|
1165
|
+
f"WARNING: no 'type' definition for {ref} in record {record_name}: {json.dumps(resolved_avro_type)}")
|
|
1166
|
+
resolved_avro_type = generic_type()
|
|
1167
|
+
elif isinstance(avro_type, str) and resolved_avro_type:
|
|
1168
|
+
# this is a plain type reference
|
|
1169
|
+
avro_type = resolved_avro_type
|
|
1170
|
+
self.imported_types[ref] = avro_type
|
|
1171
|
+
resolved_avro_type = None
|
|
1172
|
+
if resolved_avro_type:
|
|
1173
|
+
# this is not a record type that can stand on its own,
|
|
1174
|
+
# so we remove the cached type entry
|
|
1175
|
+
# and pass it on as an inline type
|
|
1176
|
+
dependencies.extend(deps)
|
|
1177
|
+
if ref in self.imported_types:
|
|
1178
|
+
del self.imported_types[ref]
|
|
1179
|
+
avro_type = self.merge_avro_schemas(
|
|
1180
|
+
[avro_type, resolved_avro_type], avro_schema, local_name)
|
|
1181
|
+
if isinstance(avro_type, dict) and 'name' in avro_type and not self.is_standalone_avro_type(avro_type):
|
|
1182
|
+
del avro_type['name']
|
|
1183
|
+
return self.post_check_avro_type(dependencies, avro_type)
|
|
1184
|
+
else:
|
|
1185
|
+
avro_type = resolved_avro_type
|
|
1186
|
+
self.imported_types[ref] = copy.deepcopy(
|
|
1187
|
+
avro_type)
|
|
1188
|
+
|
|
1189
|
+
if len(deps) > 0:
|
|
1190
|
+
if isinstance(avro_type, dict):
|
|
1191
|
+
avro_type['dependencies'] = deps
|
|
1192
|
+
else:
|
|
1193
|
+
dependencies.extend(deps)
|
|
1194
|
+
|
|
1195
|
+
if self.is_standalone_avro_type(avro_type):
|
|
1196
|
+
self.register_type(avro_schema, avro_type)
|
|
1197
|
+
full_name = self.get_qualified_name(avro_type)
|
|
1198
|
+
if ref in self.imported_types:
|
|
1199
|
+
# update the import reference to the resolved type if it's cached
|
|
1200
|
+
self.imported_types[ref] = full_name
|
|
1201
|
+
dependencies.append(full_name)
|
|
1202
|
+
avro_type = full_name
|
|
1203
|
+
else:
|
|
1204
|
+
del json_type['$ref']
|
|
1205
|
+
# it's a reference within a definition, so we will turn this into an inline type
|
|
1206
|
+
if isinstance(resolved_json_type, dict) and 'type' in resolved_json_type and json_type.get('type') and not json_type['type'] == resolved_json_type['type']:
|
|
1207
|
+
# the types conflict, so we can't merge them
|
|
1208
|
+
type1 = self.json_type_to_avro_type(
|
|
1209
|
+
json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1210
|
+
type2 = self.json_type_to_avro_type(resolved_json_type, record_name, field_name, namespace,
|
|
1211
|
+
dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1212
|
+
# if either of the types are empty, use just the other one
|
|
1213
|
+
if not self.is_empty_type(type1) and not self.is_empty_type(type2):
|
|
1214
|
+
return self.flatten_union([type1, type2])
|
|
1215
|
+
if not self.is_empty_type(type1):
|
|
1216
|
+
avro_type = type1
|
|
1217
|
+
if isinstance(avro_type, list):
|
|
1218
|
+
return self.post_check_avro_type(dependencies, avro_type)
|
|
1219
|
+
if not self.is_empty_type(type2):
|
|
1220
|
+
avro_type = type2
|
|
1221
|
+
if isinstance(avro_type, list):
|
|
1222
|
+
return self.post_check_avro_type(dependencies, avro_type)
|
|
1223
|
+
json_type = {}
|
|
1224
|
+
else:
|
|
1225
|
+
json_type = self.merge_json_schemas(
|
|
1226
|
+
[json_type, resolved_json_type])
|
|
1227
|
+
avro_type = self.json_type_to_avro_type(
|
|
1228
|
+
json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1229
|
+
json_type = {}
|
|
1230
|
+
if ref in self.imported_types:
|
|
1231
|
+
# update the import reference to the resolved type if it's cached
|
|
1232
|
+
if isinstance(avro_type, dict) and 'name' in avro_type:
|
|
1233
|
+
self.imported_types[ref] = avro_type['name']
|
|
1234
|
+
else:
|
|
1235
|
+
self.imported_types[ref] = avro_type
|
|
1236
|
+
|
|
1237
|
+
# if 'const' is present, make this an enum
|
|
1238
|
+
if 'const' in json_type:
|
|
1239
|
+
const_list = json_type['const'] if isinstance(
|
|
1240
|
+
json_type['const'], list) else [json_type['const']]
|
|
1241
|
+
avro_type = self.merge_avro_schemas([avro_type, self.create_enum_type(
|
|
1242
|
+
local_name, namespace, const_list)], avro_schema, local_name)
|
|
1243
|
+
if json_object_type or 'enum' in json_type:
|
|
1244
|
+
if json_object_type == 'array':
|
|
1245
|
+
if isinstance(json_type, dict) and 'items' in json_type:
|
|
1246
|
+
deps = []
|
|
1247
|
+
item_type = self.json_type_to_avro_type(
|
|
1248
|
+
json_type['items'], record_name, field_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1249
|
+
if self.is_standalone_avro_type(item_type):
|
|
1250
|
+
if isinstance(item_type, dict) and len(deps) > 0:
|
|
1251
|
+
item_type['dependencies'] = deps
|
|
1252
|
+
self.register_type(avro_schema, item_type)
|
|
1253
|
+
dependencies.append(
|
|
1254
|
+
self.get_qualified_name(item_type))
|
|
1255
|
+
else:
|
|
1256
|
+
dependencies.extend(deps)
|
|
1257
|
+
if isinstance(item_type, dict) and not 'type' in item_type:
|
|
1258
|
+
item_type = generic_type()
|
|
1259
|
+
elif isinstance(item_type, str) and not item_type in primitive_types:
|
|
1260
|
+
dependencies.append(item_type)
|
|
1261
|
+
else: # not a standalone type, but has a type definition, so we unwind that here
|
|
1262
|
+
item_type = self.post_check_avro_type(
|
|
1263
|
+
dependencies, item_type)
|
|
1264
|
+
avro_type = self.merge_avro_schemas(
|
|
1265
|
+
[avro_type, self.create_array_type(item_type)], avro_schema, '')
|
|
1266
|
+
else:
|
|
1267
|
+
avro_type = self.merge_avro_schemas(
|
|
1268
|
+
[avro_type, self.create_array_type(generic_type())], avro_schema, '')
|
|
1269
|
+
elif json_object_type and (json_object_type == 'object' or 'object' in json_object_type):
|
|
1270
|
+
avro_record_type = self.json_schema_object_to_avro_record(
|
|
1271
|
+
local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
|
|
1272
|
+
if isinstance(avro_record_type, list):
|
|
1273
|
+
for record_entry in avro_record_type:
|
|
1274
|
+
self.lift_dependencies_from_type(
|
|
1275
|
+
record_entry, dependencies)
|
|
1276
|
+
avro_type = self.merge_avro_schemas([avro_type, avro_record_type], avro_schema, avro_type.get(
|
|
1277
|
+
'name', local_name) if isinstance(avro_type, dict) else local_name)
|
|
1278
|
+
self.lift_dependencies_from_type(
|
|
1279
|
+
avro_type, dependencies)
|
|
1280
|
+
elif 'enum' in json_type:
|
|
1281
|
+
# Handle enums with proper type handling for mixed string/int enums
|
|
1282
|
+
enum_values = json_type['enum']
|
|
1283
|
+
schema_type = json_type.get('type', 'string')
|
|
1284
|
+
|
|
1285
|
+
# For pure string enums with valid symbols, use simple enum without suffix
|
|
1286
|
+
string_values = [v for v in enum_values if isinstance(v, str) and v]
|
|
1287
|
+
int_values = [v for v in enum_values if isinstance(v, int)]
|
|
1288
|
+
|
|
1289
|
+
if not int_values and string_values:
|
|
1290
|
+
# Pure string enum
|
|
1291
|
+
if not self.enum_symbols_need_string_fallback(string_values):
|
|
1292
|
+
# Simple case: valid symbols, just create enum
|
|
1293
|
+
avro_type = self.create_enum_type(
|
|
1294
|
+
local_name,
|
|
1295
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
1296
|
+
string_values
|
|
1297
|
+
)
|
|
1298
|
+
else:
|
|
1299
|
+
# Symbols need prefixing, use helper with string fallback
|
|
1300
|
+
avro_type = self.create_enum_for_mixed_types(
|
|
1301
|
+
local_name,
|
|
1302
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
1303
|
+
enum_values,
|
|
1304
|
+
schema_type
|
|
1305
|
+
)
|
|
1306
|
+
# Register any embedded enum types in the union
|
|
1307
|
+
self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
|
|
1308
|
+
else:
|
|
1309
|
+
# Mixed or int-only enum, use helper
|
|
1310
|
+
avro_type = self.create_enum_for_mixed_types(
|
|
1311
|
+
local_name + '_1',
|
|
1312
|
+
self.compose_namespace(namespace, record_name + '_types'),
|
|
1313
|
+
enum_values,
|
|
1314
|
+
schema_type
|
|
1315
|
+
)
|
|
1316
|
+
# Register any embedded enum types in the union
|
|
1317
|
+
self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
|
|
1318
|
+
else:
|
|
1319
|
+
avro_type = self.json_schema_primitive_to_avro_type(json_object_type, json_type.get(
|
|
1320
|
+
'format'), json_type.get('enum'), record_name, field_name, namespace, dependencies)
|
|
1321
|
+
else:
|
|
1322
|
+
if isinstance(json_type, dict):
|
|
1323
|
+
avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(json_type, json_type.get('format'), json_type.get(
|
|
1324
|
+
'enum'), record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)
|
|
1325
|
+
else:
|
|
1326
|
+
avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(
|
|
1327
|
+
json_type, None, None, record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)
|
|
1328
|
+
|
|
1329
|
+
if isinstance(avro_type, dict) and 'name' in avro_type and 'type' in avro_type and not (avro_type['type'] in ['array', 'map']):
|
|
1330
|
+
if not 'namespace' in avro_type:
|
|
1331
|
+
avro_type['namespace'] = namespace
|
|
1332
|
+
existing_type = next((t for t in avro_schema if t.get(
|
|
1333
|
+
'name') == avro_type['name'] and t.get('namespace') == avro_type.get('namespace')), None)
|
|
1334
|
+
if existing_type:
|
|
1335
|
+
existing_type_name = self.get_qualified_name(existing_type)
|
|
1336
|
+
if not existing_type_name in dependencies:
|
|
1337
|
+
dependencies.append(existing_type_name)
|
|
1338
|
+
return existing_type_name
|
|
1339
|
+
self.set_avro_type_value(avro_type, 'name', local_name)
|
|
1340
|
+
|
|
1341
|
+
# post-check on the avro type: if the type is a dict, and the 'type' is not
|
|
1342
|
+
# a record, enum, fixed, array, or map, we will just return the basic type
|
|
1343
|
+
# and push its dependencies up the stack
|
|
1344
|
+
avro_type = self.post_check_avro_type(dependencies, avro_type)
|
|
1345
|
+
|
|
1346
|
+
if isinstance(avro_type, dict) and 'unmerged_types' in avro_type:
|
|
1347
|
+
self.types_with_unmerged_types.append(avro_type)
|
|
1348
|
+
|
|
1349
|
+
return avro_type
|
|
1350
|
+
except RecursionError as e:
|
|
1351
|
+
print(
|
|
1352
|
+
f"Recursion error while processing {namespace}:{record_name}:{field_name} with recursion depth {recursion_depth}")
|
|
1353
|
+
raise e
|
|
1354
|
+
|
|
1355
|
+
def post_check_avro_type(self, dependencies, avro_type):
    """Unwrap a trivial type wrapper and push its dependencies up the stack.

    If avro_type is a dict whose 'type' is not one of the structural Avro
    kinds (array, map, record, enum, fixed), it is only a wrapper around a
    basic type: its recorded 'dependencies' are lifted into *dependencies*
    and the inner 'type' value is returned instead. Anything else is
    returned unchanged.

    FIX: the original condition contained `isinstance(avro_type, list)`
    inside a branch already guarded by `isinstance(avro_type, dict)`, which
    is always False; the dead check has been removed (behavior unchanged).
    """
    if isinstance(avro_type, dict) and 'type' in avro_type and avro_type['type'] not in ['array', 'map', 'record', 'enum', 'fixed']:
        if 'dependencies' in avro_type:
            # bubble the wrapper's dependencies up to the caller
            dependencies.extend(avro_type['dependencies'])
        avro_type = avro_type['type']
    return avro_type
|
|
1362
|
+
|
|
1363
|
+
def register_type(self, avro_schema, avro_type) -> bool:
    """Register a named type in the top-level Avro schema list.

    Returns True when the type is (or already was) registered under the same
    qualified name, False when it is not a standalone named type and
    therefore cannot be added to the schema.
    """
    for candidate in avro_schema:
        if candidate.get('name') == avro_type['name'] and candidate.get('namespace') == avro_type.get('namespace'):
            # a type with the same qualified name is already present
            return True
    # warn about empty standalone types unless they still carry unmerged parts
    if self.is_empty_type(avro_type) and 'unmerged_types' not in avro_type:
        print(f'WARN: Standalone type {avro_type["name"]} is empty')
    if not self.is_standalone_avro_type(avro_type):
        return False
    avro_schema.append(avro_type)
    return True
|
|
1377
|
+
|
|
1378
|
+
def register_embedded_types_in_union(self, avro_type, avro_schema, dependencies):
    """
    Register any embedded named types (enum, record, fixed) found within a union type.
    This ensures that enum types created by create_enum_for_mixed_types are properly
    registered in the schema and can be referenced by name. Each successfully
    registered member is replaced in-place by its qualified-name reference and
    recorded in *dependencies*.
    """
    if not isinstance(avro_type, list):
        return
    for idx, branch in enumerate(avro_type):
        named = isinstance(branch, dict) and branch.get('type') in ('enum', 'record', 'fixed')
        if not named:
            continue
        # Register the embedded type; on success, swap the inline
        # definition for a by-name reference.
        if not self.register_type(avro_schema, branch):
            continue
        qualified = self.get_qualified_name(branch)
        avro_type[idx] = qualified
        if qualified not in dependencies:
            dependencies.append(qualified)
|
|
1394
|
+
|
|
1395
|
+
def has_composition_keywords(self, json_object: dict) -> bool:
    """Tell whether the node uses schema composition (allOf, oneOf, anyOf)."""
    if not isinstance(json_object, dict):
        return False
    return any(keyword in json_object for keyword in ('allOf', 'oneOf', 'anyOf'))
|
|
1398
|
+
|
|
1399
|
+
def has_enum_keyword(self, json_object: dict) -> bool:
    """Tell whether the node declares an enumeration via the 'enum' keyword."""
    if not isinstance(json_object, dict):
        return False
    return 'enum' in json_object
|
|
1402
|
+
|
|
1403
|
+
def is_array_object(self, json_object: dict) -> bool:
    """Tell whether the node explicitly declares JSON type 'array'."""
    if not isinstance(json_object, dict):
        return False
    return json_object.get('type') == 'array'
|
|
1406
|
+
|
|
1407
|
+
def is_standalone_avro_type(self, avro_type: dict | list | str) -> bool:
|
|
1408
|
+
"""Check if the Avro type is a standalone type."""
|
|
1409
|
+
return isinstance(avro_type, dict) and 'type' in avro_type and (avro_type['type'] in ['record', 'enum', 'fixed'])
|
|
1410
|
+
|
|
1411
|
+
def is_avro_complex_type(self, avro_type: dict) -> bool:
    """Tell whether avro_type is one of Avro's complex kinds
    (record, enum, fixed, array, map)."""
    if 'type' not in avro_type:
        return False
    return avro_type['type'] in {'record', 'enum', 'fixed', 'array', 'map'}
|
|
1414
|
+
|
|
1415
|
+
def set_avro_type_value(self, avro_type: dict | list | str, name: str, value: dict | list | str):
|
|
1416
|
+
"""Set a value in an Avro type."""
|
|
1417
|
+
if isinstance(avro_type, dict):
|
|
1418
|
+
if name == 'namespace' or name == 'name':
|
|
1419
|
+
if 'type' in avro_type:
|
|
1420
|
+
if not (avro_type['type'] in ['record', 'enum', 'fixed']):
|
|
1421
|
+
return
|
|
1422
|
+
avro_type[name] = value
|
|
1423
|
+
|
|
1424
|
+
def create_avro_record(self, name: str, namespace: str, fields: list) -> dict:
    """Build a bare Avro record declaration with the given fields.

    The record name is sanitized through avro_name(); the namespace and
    field list are taken as-is.
    """
    record = {'type': 'record'}
    record['name'] = avro_name(name)
    record['namespace'] = namespace
    record['fields'] = fields
    return record
|
|
1432
|
+
|
|
1433
|
+
def create_wrapper_record(self, wrapper_name: str, wrapper_namespace: str, wrapper_field: str, dependencies: list, avro_type: list | str | dict) -> dict:
    """Wrap avro_type in a single-field record so that a union (or other
    non-standalone type) can appear at the top level of an Avro schema."""
    payload_field = {
        'name': wrapper_field,
        'type': avro_type
    }
    wrapper = self.create_avro_record(wrapper_name, wrapper_namespace, [payload_field])
    if dependencies:
        wrapper['dependencies'] = dependencies
    return wrapper
|
|
1444
|
+
|
|
1445
|
+
def create_enum_type(self, name: str, namespace: str, symbols: list) -> dict:
    """Create an Avro enum type.

    The symbol list may have been merged by composition, so it is flattened
    into a unique list first. Symbols are then sanitized with avro_name().

    FIX: sanitization can make two distinct inputs collide (e.g. 'a-b' and
    'a_b' both become 'a_b'); Avro requires enum symbols to be unique, so the
    sanitized list is deduplicated again, preserving first-seen order.
    """
    # the symbol list may have been merged by composition so we flatten it to have a unique list
    symbols = self.flatten_union(symbols)
    sanitized = [avro_name(s) for s in symbols]
    return {
        'type': 'enum',
        'name': name,
        'namespace': namespace,
        # dict.fromkeys dedupes while keeping order, unlike list(set(...))
        'symbols': list(dict.fromkeys(sanitized))
    }
|
|
1455
|
+
|
|
1456
|
+
def enum_symbols_need_string_fallback(self, symbols: list) -> bool:
    """
    Check if any enum symbols will be transformed by avro_name().
    If symbols are prefixed (e.g., "1" -> "_1"), we need a string fallback
    in the union to handle original JSON values during deserialization.
    """
    return any(
        isinstance(symbol, str) and symbol and avro_name(symbol) != symbol
        for symbol in symbols
    )
|
|
1467
|
+
|
|
1468
|
+
def create_enum_for_mixed_types(self, name: str, namespace: str, enum_values: list, json_types: list) -> dict | list:
|
|
1469
|
+
"""
|
|
1470
|
+
Create an Avro type for enums with mixed or special type requirements.
|
|
1471
|
+
|
|
1472
|
+
Handles:
|
|
1473
|
+
- Pure string enum with valid symbols -> enum
|
|
1474
|
+
- Pure string enum with prefixed symbols -> [enum, string]
|
|
1475
|
+
- Pure int enum -> int (with doc hint about allowed values)
|
|
1476
|
+
- Mixed string/int enum -> [enum, string, int]
|
|
1477
|
+
|
|
1478
|
+
Args:
|
|
1479
|
+
name: The enum type name
|
|
1480
|
+
namespace: The namespace for the enum
|
|
1481
|
+
enum_values: The list of enum values from JSON Schema
|
|
1482
|
+
json_types: The JSON Schema type(s), e.g., "string", "integer", or ["string", "integer"]
|
|
1483
|
+
|
|
1484
|
+
Returns:
|
|
1485
|
+
Avro type: either an enum dict, a primitive string, or a union list
|
|
1486
|
+
"""
|
|
1487
|
+
if not isinstance(json_types, list):
|
|
1488
|
+
json_types = [json_types]
|
|
1489
|
+
|
|
1490
|
+
# Normalize type names
|
|
1491
|
+
has_string = 'string' in json_types
|
|
1492
|
+
has_int = 'integer' in json_types or 'int' in json_types
|
|
1493
|
+
has_null = 'null' in json_types
|
|
1494
|
+
|
|
1495
|
+
# Separate string and int enum values
|
|
1496
|
+
string_values = [v for v in enum_values if isinstance(v, str) and v]
|
|
1497
|
+
int_values = [v for v in enum_values if isinstance(v, int)]
|
|
1498
|
+
|
|
1499
|
+
# Pure integer enum case
|
|
1500
|
+
if has_int and not has_string and not string_values:
|
|
1501
|
+
# Just use int - no enum type needed for pure int enums
|
|
1502
|
+
# The doc will contain the allowed values hint
|
|
1503
|
+
result = 'int'
|
|
1504
|
+
if has_null:
|
|
1505
|
+
result = ['null', result]
|
|
1506
|
+
return result
|
|
1507
|
+
|
|
1508
|
+
# Build the enum from string values (or string representations of all values)
|
|
1509
|
+
if string_values:
|
|
1510
|
+
enum_symbols = list(set(string_values))
|
|
1511
|
+
else:
|
|
1512
|
+
# No string values but has_string type - shouldn't happen normally
|
|
1513
|
+
enum_symbols = []
|
|
1514
|
+
|
|
1515
|
+
if not enum_symbols:
|
|
1516
|
+
# No valid enum symbols, fall back to primitive types
|
|
1517
|
+
union = []
|
|
1518
|
+
if has_null:
|
|
1519
|
+
union.append('null')
|
|
1520
|
+
if has_string:
|
|
1521
|
+
union.append('string')
|
|
1522
|
+
if has_int:
|
|
1523
|
+
union.append('int')
|
|
1524
|
+
return union if len(union) > 1 else (union[0] if union else 'string')
|
|
1525
|
+
|
|
1526
|
+
# Create the enum type
|
|
1527
|
+
avro_enum = self.create_enum_type(name, namespace, enum_symbols)
|
|
1528
|
+
|
|
1529
|
+
# Determine if we need additional types in union
|
|
1530
|
+
needs_string_fallback = self.enum_symbols_need_string_fallback(enum_symbols)
|
|
1531
|
+
|
|
1532
|
+
# Build the union
|
|
1533
|
+
union = []
|
|
1534
|
+
if has_null:
|
|
1535
|
+
union.append('null')
|
|
1536
|
+
union.append(avro_enum)
|
|
1537
|
+
|
|
1538
|
+
# Add string fallback if symbols were prefixed OR if this is a mixed type enum
|
|
1539
|
+
if needs_string_fallback or has_int:
|
|
1540
|
+
union.append('string')
|
|
1541
|
+
|
|
1542
|
+
# Add int if the schema allows integers
|
|
1543
|
+
if has_int:
|
|
1544
|
+
union.append('int')
|
|
1545
|
+
|
|
1546
|
+
# Return enum directly if no union needed
|
|
1547
|
+
if len(union) == 1:
|
|
1548
|
+
return union[0]
|
|
1549
|
+
|
|
1550
|
+
return union
|
|
1551
|
+
|
|
1552
|
+
def create_array_type(self, items: list | dict | str) -> dict:
|
|
1553
|
+
"""Create an Avro array type."""
|
|
1554
|
+
return {
|
|
1555
|
+
'type': 'array',
|
|
1556
|
+
'items': items
|
|
1557
|
+
}
|
|
1558
|
+
|
|
1559
|
+
def create_map_type(self, values: list | dict | str) -> dict:
|
|
1560
|
+
"""Create an Avro map type."""
|
|
1561
|
+
return {
|
|
1562
|
+
'type': 'map',
|
|
1563
|
+
'values': values
|
|
1564
|
+
}
|
|
1565
|
+
|
|
1566
|
+
def nullable(self, avro_type: list | dict | str) -> list | dict | str:
|
|
1567
|
+
"""Wrap a type in a union with null."""
|
|
1568
|
+
if isinstance(avro_type, list):
|
|
1569
|
+
cp = avro_type.copy()
|
|
1570
|
+
cp.insert(0, 'null')
|
|
1571
|
+
return cp
|
|
1572
|
+
return ['null', avro_type]
|
|
1573
|
+
|
|
1574
|
+
def merge_description_into_doc(self, source_json: dict, target_avro: dict | list | str):
|
|
1575
|
+
"""Merge a description in JSON into Avro doc."""
|
|
1576
|
+
if isinstance(source_json, dict) and 'description' in source_json and isinstance(target_avro, dict):
|
|
1577
|
+
target_avro['doc'] = target_avro['doc'] + ", " + \
|
|
1578
|
+
source_json['description'] if 'doc' in target_avro else source_json['description']
|
|
1579
|
+
|
|
1580
|
+
def merge_dependencies_into_parent(self, dependencies: list, child_type: dict | list | str, parent_type: dict | list | str):
    """Merge dependencies from a child type into a parent type.

    Lifts the child's recorded 'dependencies' into *dependencies* and stores
    the combined entries on the parent.

    FIX: previously, when the parent already carried a 'dependencies' list,
    the lifted entries were only appended to the caller's local list and
    never written back to the parent, so the child's dependencies were
    silently dropped from the parent type.
    """
    self.lift_dependencies_from_type(child_type, dependencies)
    if len(dependencies) > 0 and isinstance(parent_type, dict):
        if 'dependencies' in parent_type:
            # keep the parent's existing entries and add the newly lifted ones
            parent_type['dependencies'].extend(dependencies)
        else:
            parent_type['dependencies'] = dependencies
|
|
1588
|
+
|
|
1589
|
+
def lift_dependencies_from_type(self, child_type: dict | list | str, dependencies: list):
|
|
1590
|
+
"""Lift all dependencies from a type and return a new type with the dependencies lifted."""
|
|
1591
|
+
if isinstance(child_type, dict):
|
|
1592
|
+
if 'dependencies' in child_type:
|
|
1593
|
+
dependencies.extend(child_type['dependencies'])
|
|
1594
|
+
del child_type['dependencies']
|
|
1595
|
+
|
|
1596
|
+
def compose_namespace(self, *names) -> str:
    """Join the non-empty name parts with '.', sanitizing each part
    through avro_namespace() first."""
    parts = [avro_namespace(part) for part in names if part]
    return '.'.join(parts)
|
|
1599
|
+
|
|
1600
|
+
def get_qualified_name(self, avro_type):
    """Return the namespace-qualified name ('ns.Name') of an Avro type dict."""
    type_namespace = avro_type.get('namespace', '')
    type_name = avro_type.get('name', '')
    return self.compose_namespace(type_namespace, type_name)
|
|
1603
|
+
|
|
1604
|
+
def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth: int = 1) -> dict | list | str | None:
    """Convert a JSON schema object declaration to an Avro record.

    Dispatches on the shape of json_object:
    - composition keywords (allOf/oneOf/anyOf) -> union, wrapped in a record
      when the merged result is not itself a record;
    - 'enum' -> Avro enum;
    - top-level 'array' -> array wrapped in a record (arrays cannot stand
      alone in an Avro schema);
    - otherwise treated as an object: 'properties' become record fields, and
      patternProperties/additionalProperties are folded into an alternate
      map representation returned alongside the record.

    Circular references are broken via record_stack: a record already in
    progress is returned as a '<name>_ref' wrapper referencing it by name.

    Returns a dict (record/map/array), a list ([record, alternate map]),
    a str (primitive or reference), or None when a composition node reduces
    to a plain reference/primitive.
    """
    dependencies: List[str] = []
    avro_type: list | dict | str = {}

    # handle top-level allOf, anyOf, oneOf
    if self.has_composition_keywords(json_object):
        # we will merge allOf, oneOf, anyOf into a union record type
        type = self.json_type_to_avro_type(
            json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
        if isinstance(type, str):
            # we are skipping references and primitives
            return None
        if isinstance(type, list):
            # we should have a union type
            avro_type = self.create_wrapper_record(
                name+"_union", self.utility_namespace, 'options', [], type)
        elif isinstance(type, dict) and 'type' in type and type['type'] != 'record':
            # merge the type into a record type if it's not a record type
            print(
                f'INFO: Standalone type {name} is being wrapped in a record')
            avro_type = self.create_wrapper_record(avro_name(type.get(
                'name', name)+'_wrapper'), self.utility_namespace, 'value', type.get('dependencies', []), type)
        else:
            avro_type = type
        # add external dependencies to the record
        self.merge_dependencies_into_parent(dependencies, type, avro_type)
        self.merge_description_into_doc(json_object, avro_type)
        # return the union type
        return avro_type

    if self.has_enum_keyword(json_object):
        # this is an enum
        avro_enum = self.create_enum_type(
            avro_name(name), namespace, json_object['enum'])
        self.merge_description_into_doc(json_object, avro_enum)
        return avro_enum

    if self.is_array_object(json_object):
        # this is an array, which can't be standalone in Avro, so we will wraps it into a record
        # and include the type as an inline
        print(
            f'WARN: Standalone array type {name} will be wrapped in a record')
        deps: List[str] = []
        array_type = self.json_type_to_avro_type(json_object, name, avro_name(
            name), namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
        avro_array = self.create_wrapper_record(
            avro_name(name+'_wrapper'), self.utility_namespace, 'items', [], array_type)
        self.merge_description_into_doc(json_object, avro_array)
        self.merge_dependencies_into_parent(deps, array_type, avro_array)
        return avro_array

    # at this point, we have to assume that we have a JSON schema object
    title = json_object.get('title')
    record_name = avro_name(name if name else title if title else None)
    if record_name is None:
        raise ValueError(
            f"Cannot determine record name for json_object {json_object}")
    if len(record_stack) > 0:
        # if we have a record stack, we need to add the current name to
        # the namespace since nested types are disambiguated by their namespace
        namespace = self.compose_namespace(
            namespace, record_stack[-1] + "_types")
    # at this point we have a record type
    avro_record = self.create_avro_record(record_name, namespace, [])
    # Check if this record has a 'union' annotation from discriminated union pattern
    if 'union' in json_object:
        avro_record['union'] = json_object['union']
    # we need to prevent circular dependencies, so we will maintain a stack of the in-progress
    # records and will resolve the cycle as we go. if this record is already in the stack, we will
    # just return a reference to a record that contains this record
    if record_name in record_stack:
        # to break the cycle, we will use a containment type that references
        # the record that is being defined
        print(
            f'WARN: Circular dependency found for record {record_name}. Creating {record_name}_ref.')
        ref_name = avro_name(record_name + '_ref')
        return self.create_wrapper_record(ref_name, namespace, record_name, [], self.compose_namespace(namespace, record_name))
    try:
        # enter the record stack scope for this record
        record_stack.append(record_name)
        # collect the required fields so we can make those fields non-null
        required_fields = json_object.get('required', [])

        # parallel list of fields where standalone types are by-name references
        field_refs = []
        if 'properties' in json_object and isinstance(json_object['properties'], dict):
            # add the properties as fields
            for field_name, json_field_types in json_object['properties'].items():
                if isinstance(json_field_types, bool):
                    # for "propertyname": true, we skip. schema bug.
                    continue
                if not isinstance(json_field_types, list):
                    json_field_types = [json_field_types]
                field_type_list = []
                field_ref_type_list = []
                const = None
                default = None
                description = None
                discriminator = None
                for json_field_type in json_field_types:
                    # skip fields with an bad or empty type
                    if not isinstance(json_field_type, dict):
                        continue
                    field_name = avro_name(field_name)
                    # last const wins if there are multiple
                    const = json_field_type.get('const', const)
                    # last default wins if there are multiple
                    default_value = json_field_type.get('default')
                    # only scalar defaults are carried over; dict/list defaults are dropped
                    if default_value and not isinstance(default_value, dict) and not isinstance(default_value, list):
                        default = default_value
                    # get the description from the field type
                    description = json_field_type.get('description', description)
                    # check for discriminator annotation
                    discriminator = json_field_type.get('discriminator', discriminator)
                    # convert the JSON-type field to an Avro-type field
                    avro_field_ref_type = avro_field_type = self.ensure_type(self.json_type_to_avro_type(
                        json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
                    if isinstance(avro_field_type, list):
                        avro_field_type = self.flatten_union(
                            avro_field_type)
                        avro_field_ref_type = avro_field_type
                    elif isinstance(avro_field_type, dict):
                        self.lift_dependencies_from_type(
                            avro_field_type, dependencies)
                        # if the first call gave us a global type that got added to the schema, this call will give us a reference
                        if self.is_standalone_avro_type(avro_field_type):
                            avro_field_ref_type = self.get_qualified_name(
                                avro_field_type)
                    if avro_field_type is None:
                        # None type is a problem
                        raise ValueError(
                            f"avro_field_type is None for field {field_name}")
                    if isinstance(avro_field_type, dict) and 'type' in avro_field_type and not self.is_avro_complex_type(avro_field_type):
                        # if the field type is a basic type, inline it
                        avro_field_type = avro_field_type['type']
                    field_type_list.append(avro_field_type)
                    field_ref_type_list.append(avro_field_ref_type)

                # a single candidate type is used directly; multiple become a union
                effective_field_type = field_type_list[0] if len(
                    field_type_list) == 1 else field_type_list
                effective_field_ref_type = field_ref_type_list[0] if len(
                    field_ref_type_list) == 1 else field_ref_type_list
                # optional (non-required) fields are made nullable unless 'null' is already present
                avro_field = {
                    'name': avro_name(field_name),
                    'type': self.nullable(effective_field_type) if not field_name in required_fields and 'null' not in effective_field_type else effective_field_type
                }
                if field_name != avro_name(field_name):
                    # keep the original JSON property name when it was sanitized
                    avro_field['altnames'] = { "json": field_name }
                if const:
                    avro_field['const'] = const
                if default:
                    avro_field['default'] = default
                if description:
                    avro_field['doc'] = description
                if discriminator:
                    avro_field['discriminator'] = discriminator
                # NOTE(review): this appends the loop's last avro_field_type a second
                # time after it was already appended inside the loop above — looks
                # redundant or unintended; verify before relying on field_type_list here.
                field_type_list.append(avro_field_type)
                avro_field_ref = {
                    'name': avro_name(field_name),
                    'type': self.nullable(effective_field_ref_type) if not field_name in required_fields and 'null' not in effective_field_ref_type else effective_field_ref_type
                }
                if description:
                    avro_field_ref['doc'] = description
                field_ref_type_list.append(avro_field_ref)
                # add the field to the record
                avro_record['fields'].append(avro_field)
                field_refs.append(avro_field_ref)
        elif not 'additionalProperties' in json_object and not 'patternProperties' in json_object:
            if 'type' in json_object and (json_object['type'] == 'object' or 'object' in json_object['type']) and \
                    not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                # we don't have any fields, but we have an object type, so we create a map
                avro_record = self.create_map_type(generic_type())
            elif 'type' in json_object and (json_object['type'] == 'array' or 'array' in json_object['type']) and \
                    not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                # we don't have any fields, but we have an array type, so we create a record with an 'items' field
                avro_record = self.create_array_type(
                    self.json_type_to_avro_type(
                        json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                    if 'items' in json_object
                    else generic_type())
            else:
                return json_object['type'] if 'type' in json_object else generic_type()

        extension_types = []
        prop_docs = ''
        if 'patternProperties' in json_object and isinstance(json_object['patternProperties'], dict) and len(json_object['patternProperties']) > 0:
            # pattern properties are represented as a record with field names that are the patterns
            pattern_props = json_object['patternProperties']
            for pattern_name, props in pattern_props.items():
                deps = []
                prop_type = self.ensure_type(self.json_type_to_avro_type(
                    props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
                if self.is_standalone_avro_type(prop_type):
                    # named type: register globally and depend on it by reference
                    self.lift_dependencies_from_type(prop_type, deps)
                    self.set_avro_type_value(
                        prop_type, 'namespace', namespace)
                    self.register_type(avro_schema, prop_type)
                    prop_type_ref = self.get_qualified_name(prop_type)
                    dependencies.append(prop_type_ref)
                else:
                    dependencies.extend(deps)
                    if isinstance(prop_type, str) and not prop_type in primitive_types:
                        dependencies.append(prop_type)
                if self.is_empty_type(prop_type):
                    prop_type = generic_type()
                prop_docs += f"Name pattern '{pattern_name}': [{self.get_field_type_name({'type':prop_type})}]. "
                extension_types.append(prop_type)

        if 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], bool):
            if True == json_object['additionalProperties']:
                # open content model: permit any generic value as extension
                prop_type = generic_type()
                extension_types.append(prop_type)
        elif 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], dict) and len(json_object['additionalProperties']) > 0:
            # additional properties are represented as a map of string to the type of the value
            additional_props = json_object['additionalProperties']
            deps = []
            values_type = self.json_type_to_avro_type(
                additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
            if self.is_standalone_avro_type(values_type):
                self.lift_dependencies_from_type(values_type, deps)
                self.set_avro_type_value(
                    values_type, 'namespace', namespace)
                self.register_type(avro_schema, values_type)
                values_type_ref = self.get_qualified_name(values_type)
                dependencies.append(values_type_ref)
            else:
                dependencies.extend(deps)
                if isinstance(values_type, str) and not values_type in primitive_types:
                    dependencies.append(values_type)
            if self.is_empty_type(values_type):
                values_type = generic_type()
            prop_docs += f"Extra properties: [{self.get_field_type_name({'type':values_type})}]. "
            extension_types.append(values_type)
        self.merge_description_into_doc(json_object, avro_record)

        avro_alternate_record = None
        if extension_types:
            # Since Avro Schema does not allow fields with dynamic names
            # to appear alongside regular fields, we will union the types of all properties with the
            # type of the additionalProperties and document this in the record's description
            json_field_types = [field['type'] for field in field_refs]
            field_type_names = [
                [field['name'], self.get_field_type_name(field)] for field in field_refs]
            field_type_name_list: str = ', '.join(
                [f"'{field[0]}': [{field[1]}]" for field in field_type_names])
            json_field_types.extend(extension_types)
            json_field_types = self.flatten_union(json_field_types)
            if len(json_field_types) == 1:
                json_field_types = json_field_types[0]
            doc = f"Alternate map: {field_type_name_list}. " if field_type_names else ''
            doc += prop_docs
            avro_alternate_record = self.create_map_type(json_field_types)
            if not self.is_empty_type(avro_record):
                # link the map back to the record it mirrors
                avro_alternate_record['alternateof'] = self.get_qualified_name(avro_record)
                dependencies.append(
                    self.compose_namespace(namespace, record_name))
            avro_record['doc'] = doc if not 'doc' in avro_record else avro_record['doc'] + ', ' + doc

        if len(dependencies) > 0:
            # dedupe the list
            dependencies = list(set(dependencies))
            avro_record['dependencies'] = dependencies
    finally:
        # always leave the record stack scope, even on early return/raise
        record_stack.pop()
    if avro_alternate_record:
        if self.is_empty_type(avro_record):
            # there's no substantive content in the record,
            # so we will just return the alternate record, which
            # is a plain map
            return avro_alternate_record
        return [avro_record, avro_alternate_record]
    return avro_record
|
|
1876
|
+
|
|
1877
|
+
def postprocess_schema(self, avro_schema: list) -> None:
    """Post-process the Avro Schema for cases where we need a second pass.

    Every type registered in ``self.types_with_unmerged_types`` still carries an
    ``unmerged_types`` list; this pass merges those into the type itself and
    writes the merged result back into the schema graph in place.

    :param avro_schema: the full Avro schema (list of named types), mutated in place.
    :raises ValueError: if a registered type can no longer be found in the graph.
    """
    if len(self.types_with_unmerged_types) == 0:
        return
    # work on a snapshot: merging may register new unmerged types on self
    types_with_unmerged_types = copy.deepcopy(self.types_with_unmerged_types)
    self.types_with_unmerged_types = []
    for ref_type in types_with_unmerged_types:
        # find ref_type anywhere in the avro_schema graph, matching
        # on name and namespace.
        def find_fn(t):
            return 'name' in t and t['name'] == ref_type['name'] and 'namespace' in t and t['namespace'] == ref_type['namespace']
        node = find_schema_node(find_fn, avro_schema)
        if not node:
            raise ValueError(
                f"Couldn't find type {ref_type['namespace']}.{ref_type['name']} in the Avro Schema.")
        # guard before any dict access (the original called .get('name') first,
        # which would raise before this check could ever skip a non-dict node)
        if not isinstance(node, dict):
            continue
        # resolve the unmerged types
        local_name = node.get('name')
        unmerged_types = node.get('unmerged_types', [])
        if len(unmerged_types) == 0:
            # nothing to merge; just drop the (empty) marker if present
            if 'unmerged_types' in node:
                del node['unmerged_types']
            continue
        # the merge starts from a copy of the node without its merge marker
        base_type = copy.deepcopy(node)
        if 'unmerged_types' in base_type:
            del base_type['unmerged_types']
        mergeable_types = [base_type]
        deps: List[str] = []
        self.lift_dependencies_from_type(node, deps)
        for item in unmerged_types:
            # NOTE(review): items are expected to be str (qualified name) or
            # dict (inline type); anything else would reuse a stale
            # found_avro_type from a prior iteration — confirm upstream.
            if isinstance(item, str):
                # a reference by qualified name: resolve against the schema list
                found_avro_type = next(
                    (t for t in avro_schema if self.get_qualified_name(t) == item), None)
                if not found_avro_type:
                    continue
            elif isinstance(item, dict):
                found_avro_type = item
            self.lift_dependencies_from_type(found_avro_type, deps)
            if isinstance(found_avro_type, dict):
                candidate = found_avro_type
                if 'unmerged_types' in candidate:
                    del candidate['unmerged_types']
                mergeable_types.append(candidate)
        merge_result = self.merge_avro_schemas(
            mergeable_types, avro_schema, local_name, deps)
        if isinstance(merge_result, dict):
            merge_result['dependencies'] = deps
            if 'unmerged_types' in merge_result:
                del merge_result['unmerged_types']
        if isinstance(merge_result, list):
            # the merge yielded a union: wrap it in a record with a single
            # 'value' field. Keep the original name since references expect it.
            self.set_avro_type_value(
                node, 'fields', [{'name': 'value', 'type': merge_result}])
            if 'unmerged_types' in node:
                del node['unmerged_types']
            merge_result = copy.deepcopy(node)
        # write the merged type back over the original node in the graph
        set_schema_node(find_fn, merge_result, avro_schema)
|
|
1937
|
+
def process_definition_list(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema_list):
    """Walk a dict of named sub-schemas, dispatching each entry.

    Entries carrying any schema-defining keyword are handed to
    process_definition; dict/list entries without such keywords are treated
    as nested definition lists and recursed into; everything else is skipped.
    """
    schema_markers = ('type', 'allOf', 'oneOf', 'anyOf', 'properties',
                      'enum', '$ref', 'additionalProperties', 'patternProperties')
    for entry_name, entry in json_schema_list.items():
        if not isinstance(entry, (dict, list)):
            # neither a schema definition nor a list of them: ignore
            continue
        if any(marker in entry for marker in schema_markers):
            # a concrete schema definition
            self.process_definition(
                json_schema, namespace, base_uri, avro_schema, record_stack, entry_name, entry)
        else:
            # a nested definition list
            self.process_definition_list(
                json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, entry)
|
|
1952
|
+
def process_definition(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema, is_root: bool = False) -> Tuple[str, str] | None:
    """Process a single schema definition and register the resulting Avro type(s).

    Converts ``schema`` via json_schema_object_to_avro_record, then registers
    the first suitable result in ``avro_schema``.

    :param json_schema: the full JSON schema document (for $ref resolution).
    :param namespace: Avro namespace to place new types in.
    :param base_uri: base URI of the document, for reference resolution.
    :param avro_schema: the Avro schema list being built (mutated in place).
    :param record_stack: stack of record names currently being processed.
    :param schema_name: name to give the resulting type.
    :param schema: the JSON schema fragment to convert.
    :param is_root: True when this is the document's root definition.
    :return: (namespace, name) of the registered type, or None if nothing was registered.
    """
    avro_schema_item = None
    avro_schema_item_list = self.json_schema_object_to_avro_record(
        schema_name, schema, namespace, json_schema, base_uri, avro_schema, record_stack)
    if not isinstance(avro_schema_item_list, list) and not isinstance(avro_schema_item_list, dict):
        # skip if the record couldn't be resolved
        return None
    # the call above usually returns a single record, but we pretend it's normally a list to handle allOf/anyOf/oneOf cases
    if isinstance(avro_schema_item_list, list) and is_root and len(avro_schema_item_list) > 1:
        # if we have multiple root-level records, we will wrap them all in a single record
        root_avro_schema_item = self.create_wrapper_record(
            schema_name+'_wrapper', namespace, 'root', [], avro_schema_item_list)
        for avro_schema_item in avro_schema_item_list:
            # fold each wrapped item's dependencies into the wrapper
            self.merge_dependencies_into_parent(
                [], avro_schema_item, root_avro_schema_item)
        self.register_type(avro_schema, root_avro_schema_item)
        return root_avro_schema_item['namespace'], root_avro_schema_item['name']
    elif not isinstance(avro_schema_item_list, list):
        # is not a list, so we'll wrap it in a list
        avro_schema_item_list = [avro_schema_item_list]
    for avro_schema_item in avro_schema_item_list:
        # add the item to the schema if it's not already there
        if isinstance(avro_schema_item, str):
            # plain type references need no registration
            continue
        if isinstance(avro_schema_item, dict) and not 'name' in avro_schema_item:
            avro_schema_item['name'] = avro_name(schema_name)
        existing_type = next((t for t in avro_schema if t.get('name') == avro_schema_item['name'] and t.get(
            'namespace') == avro_schema_item.get('namespace')), None)
        if not existing_type:
            if (not self.is_empty_type(avro_schema_item) or 'unmerged_types' in avro_schema_item) and \
                    self.is_standalone_avro_type(avro_schema_item):
                # we only register record/enum as type. the other defs are mix-ins
                self.register_type(avro_schema, avro_schema_item)
                # NOTE(review): assumes the registered item always carries a
                # 'namespace' key — confirm register_type guarantees this.
                return avro_schema_item['namespace'], avro_schema_item['name']
            elif is_root:
                # at the root, we will wrap the type in a record to make it top-level
                deps: List[str] = []
                self.lift_dependencies_from_type(avro_schema_item, deps)
                avro_schema_wrapper = self.create_wrapper_record(schema_name, avro_schema_item.get(
                    'namespace', namespace), avro_schema_item['name'], deps, avro_schema_item)
                if len(deps) > 0:
                    avro_schema_wrapper['dependencies'] = deps
                avro_schema_item = avro_schema_wrapper
                self.register_type(avro_schema, avro_schema_item)
                return avro_schema_item['namespace'], avro_schema_item['name']
    # nothing registrable was found (e.g. all items were existing or mix-ins)
    return None
|
|
2000
|
+
def id_to_avro_namespace(self, id: str) -> str:
    """Convert a schema '$id' URL into an Avro namespace.

    The URL path (minus its file extension) is reversed segment by segment to
    form the namespace suffix; the hostname, reversed Java-package style, forms
    the prefix (e.g. ``https://example.com/schemas/person.json`` ->
    ``com.example.person.schemas``).

    :param id: the schema '$id' value (a URL or bare path).
    :return: the derived Avro namespace.
    """
    parsed_url = urlparse(id)
    # strip only the trailing file extension: rsplit with maxsplit=1 so that
    # dots earlier in the path (e.g. '/v1.2/person.json') are preserved —
    # the unbounded rsplit previously kept only the first dot-segment
    path = parsed_url.path.rsplit('.', 1)[0]
    path_segments = path.strip('/').replace('-', '_').split('/')
    reversed_path_segments = reversed(path_segments)
    namespace_suffix = self.compose_namespace(*reversed_path_segments)
    if parsed_url.hostname:
        namespace_prefix = self.compose_namespace(
            *reversed(parsed_url.hostname.split('.')))
        namespace = self.compose_namespace(namespace_prefix, namespace_suffix)
    else:
        # no hostname (e.g. a relative '$id'): the path suffix alone is the
        # namespace — previously this path raised UnboundLocalError
        namespace = namespace_suffix
    return namespace
|
|
2014
|
+
def jsons_to_avro(self, json_schema: dict | list, namespace: str, base_uri: str) -> list | dict | str:
    """Convert a JSON-schema document to an Avro schema.

    Processes the 'definitions'/'$defs' block (or a top-level list of
    definitions), then the root schema itself, runs the post-processing
    pass, and finally inlines/sorts dependencies.

    :param json_schema: the parsed JSON schema (dict) or list of schemas.
    :param namespace: default Avro namespace for generated types.
    :param base_uri: URI of the document, used for $ref and fragment resolution.
    :return: a list of Avro types, a single Avro type dict, or a type name.
    """
    avro_schema: List[dict] = []
    record_stack: List[str] = []

    parsed_url = urlparse(base_uri)
    schema_name = self.root_class_name

    if isinstance(json_schema, dict) and ('definitions' in json_schema or '$defs' in json_schema):
        # this is a swagger file or has a 'definitions' block
        json_schema_defs = json_schema.get(
            'definitions', json_schema.get('$defs', []))
        for def_schema_name, schema in json_schema_defs.items():
            if 'type' in schema or 'allOf' in schema or 'oneOf' in schema or 'anyOf' in schema or 'properties' in schema or 'enum' in schema or '$ref' in schema or 'additionalProperties' in schema or 'patternProperties' in schema:
                # this is a schema definition
                self.process_definition(
                    json_schema, namespace, base_uri, avro_schema, record_stack, def_schema_name, schema)
            else:
                # it's a schema definition list
                self.process_definition_list(
                    json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema.copy())
    elif isinstance(json_schema, list):
        # this is a schema definition list
        self.process_definition_list(
            json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema)

    root_namespace = None
    root_name = None
    # NOTE(review): operator precedence makes this
    # (isinstance(...) and 'type' in ...) or 'allOf' in ... — the later
    # membership tests run even when json_schema is a list; presumably the
    # intent was isinstance(...) and (any-of-keys) — confirm before changing.
    if isinstance(json_schema, dict) and 'type' in json_schema or 'allOf' in json_schema or 'oneOf' in json_schema or 'anyOf' in json_schema or 'properties' in json_schema:
        # this is a schema definition
        if isinstance(json_schema, dict) and '$ref' in json_schema:
            # if there is a $ref at the root level, resolve the reference and merge it with the current schema
            ref = json_schema['$ref']
            if ref:
                ref_schema, json_doc = self.resolve_reference(
                    json_schema, base_uri, json_schema)
                json_schema = self.merge_json_schemas(
                    [json_schema, ref_schema], intersect=False)
        root_info = self.process_definition(
            json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema, is_root=True)
        if root_info:
            root_namespace, root_name = root_info

    # postprocessing pass: resolve any types that still carry unmerged_types
    self.postprocess_schema(avro_schema)

    if isinstance(avro_schema, list) and len(avro_schema) > 1 and self.split_top_level_records:
        # split mode: each top-level record gets its dependencies inlined
        # so it can later be written to its own file
        new_avro_schema = []
        for item in avro_schema:
            if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                # we need to make a copy since the inlining operation shuffles types
                schema_copy = copy.deepcopy(avro_schema)
                # find the item with the same name and namespace in the copy
                found_item = next((t for t in schema_copy if t.get(
                    'name') == item['name'] and t.get('namespace') == item.get('namespace')), None)
                if found_item:
                    # inline all dependencies of the item
                    inline_dependencies_of(schema_copy, found_item)
                    new_avro_schema.append(found_item)
        avro_schema = new_avro_schema
    else:
        # sort the records by their dependencies
        if root_name and root_namespace and not ('definitions' in json_schema or '$defs' in json_schema):
            # inline all dependencies if this is a doc with only a root level definition
            root = find_schema_node(
                lambda t: 'name' in t and t['name'] == root_name and 'namespace' in t and t['namespace'] == root_namespace, avro_schema)
            inline_dependencies_of(avro_schema, root)
            return root
        else:
            avro_schema = sort_messages_by_dependencies(avro_schema)

    if parsed_url.fragment and isinstance(json_schema, dict):
        # if the fragment is present in the URL, it's a reference to a schema definition
        # so we will resolve that reference and return a type
        self.imported_types.clear()
        fragment_schema: List[dict] = []
        json_pointer = parsed_url.fragment
        schema_name = parsed_url.fragment.split('/')[-1]
        schema = jsonpointer.resolve_pointer(json_schema, json_pointer)
        avro_schema_item = self.json_schema_object_to_avro_record(
            schema_name, schema, namespace, json_schema, base_uri, fragment_schema, record_stack)
        if avro_schema_item:
            # we roll all the types into this record as the top level type
            inline_dependencies_of(avro_schema, avro_schema_item)
            return avro_schema_item

    return avro_schema
|
|
2102
|
+
def convert_jsons_to_avro(self, json_schema_file_path: str, avro_schema_path: str, namespace: str | None = None, utility_namespace: str | None = None) -> list | dict | str:
    """Convert JSON schema file to Avro schema file.

    Fetches and parses the input document, derives the namespace (from the
    argument, the file name, or the schema's '$id'), converts it, and writes
    the result to disk.

    :param json_schema_file_path: path or URL of the JSON schema file.
    :param avro_schema_path: output file path (or directory when
        split_top_level_records is set).
    :param namespace: Avro namespace override; derived when None/empty.
    :param utility_namespace: namespace for utility types; defaults to
        '<namespace>.utility'.
    :return: the converted Avro schema (also written to disk).
    """
    # turn the file path into a file URI if it's not a URI already
    parsed_url = urlparse(json_schema_file_path)
    if not parsed_url.hostname and not parsed_url.scheme == 'file':
        json_schema_file_path = 'file://' + json_schema_file_path
        parsed_url = urlparse(json_schema_file_path)
    content = self.fetch_content(parsed_url.geturl())
    json_schema = json.loads(content)

    if not namespace:
        # default namespace: the file's base name, with '-' mapped to '_'
        namespace = parsed_url.geturl().replace('\\', '/').replace('-',
                                                                   '_').split('/')[-1].split('.')[0]
    # get the $id if present — it overrides the file-name-derived namespace
    if '$id' in json_schema:
        namespace = self.id_to_avro_namespace(json_schema['$id'])
    self.root_namespace = namespace
    if utility_namespace:
        self.utility_namespace = utility_namespace
    else:
        self.utility_namespace = self.root_namespace + '.utility'

    # drop the file name from the parsed URL to get the base URI
    avro_schema = self.jsons_to_avro(
        json_schema, namespace, parsed_url.geturl())
    # NOTE(review): jsons_to_avro can return a dict; len(dict)==1 would make
    # avro_schema[0] a KeyError-prone key lookup — presumably only lists
    # reach this branch; confirm.
    if len(avro_schema) == 1:
        avro_schema = avro_schema[0]

    # create the directory for the Avro schema file if it doesn't exist
    dir = os.path.dirname(
        avro_schema_path) if not self.split_top_level_records else avro_schema_path
    if dir != '' and not os.path.exists(dir):
        os.makedirs(dir, exist_ok=True)
    if self.split_top_level_records:
        # if we are splitting top level records, we will create a file for each record
        for item in avro_schema:
            if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                schema_file_path = os.path.join(
                    dir, item['name'] + '.avsc')
                with open(schema_file_path, 'w') as avro_file:
                    json.dump(item, avro_file, indent=4)
    else:
        with open(avro_schema_path, 'w') as avro_file:
            json.dump(avro_schema, avro_file, indent=4)
    return avro_schema
|
|
2148
|
+
|
|
2149
|
+
def convert_jsons_to_avro(json_schema_file_path: str, avro_schema_path: str, namespace: str = '', utility_namespace='', root_class_name='', split_top_level_records=False) -> list | dict | str:
|
|
2150
|
+
"""Convert JSON schema file to Avro schema file."""
|
|
2151
|
+
|
|
2152
|
+
if not json_schema_file_path:
|
|
2153
|
+
raise ValueError('JSON schema file path is required')
|
|
2154
|
+
if not json_schema_file_path.startswith('http'):
|
|
2155
|
+
if not os.path.exists(json_schema_file_path):
|
|
2156
|
+
raise FileNotFoundError(f'JSON schema file {json_schema_file_path} not found')
|
|
2157
|
+
|
|
2158
|
+
try:
|
|
2159
|
+
converter = JsonToAvroConverter()
|
|
2160
|
+
converter.split_top_level_records = split_top_level_records
|
|
2161
|
+
if root_class_name:
|
|
2162
|
+
converter.root_class_name = root_class_name
|
|
2163
|
+
return converter.convert_jsons_to_avro(json_schema_file_path, avro_schema_path, namespace, utility_namespace)
|
|
2164
|
+
except Exception as e:
|
|
2165
|
+
print(
|
|
2166
|
+
f'Error converting JSON {json_schema_file_path} to Avro: {e.args[0]}')
|
|
2167
|
+
return []
|