structurize 2.16.2__py3-none-any.whl → 2.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -63
- avrotize/__main__.py +5 -5
- avrotize/_version.py +34 -34
- avrotize/asn1toavro.py +160 -160
- avrotize/avrotize.py +152 -152
- avrotize/avrotocpp.py +483 -483
- avrotize/avrotocsharp.py +992 -992
- avrotize/avrotocsv.py +121 -121
- avrotize/avrotodatapackage.py +173 -173
- avrotize/avrotodb.py +1383 -1383
- avrotize/avrotogo.py +476 -476
- avrotize/avrotographql.py +197 -197
- avrotize/avrotoiceberg.py +210 -210
- avrotize/avrotojava.py +1023 -1023
- avrotize/avrotojs.py +250 -250
- avrotize/avrotojsons.py +481 -481
- avrotize/avrotojstruct.py +345 -345
- avrotize/avrotokusto.py +363 -363
- avrotize/avrotomd.py +137 -137
- avrotize/avrotools.py +168 -168
- avrotize/avrotoparquet.py +208 -208
- avrotize/avrotoproto.py +358 -358
- avrotize/avrotopython.py +622 -622
- avrotize/avrotorust.py +435 -435
- avrotize/avrotots.py +598 -598
- avrotize/avrotoxsd.py +344 -344
- avrotize/commands.json +2493 -2433
- avrotize/common.py +828 -828
- avrotize/constants.py +4 -4
- avrotize/csvtoavro.py +131 -131
- avrotize/datapackagetoavro.py +76 -76
- avrotize/dependency_resolver.py +348 -348
- avrotize/jsonstoavro.py +1698 -1698
- avrotize/jsonstostructure.py +2642 -2642
- avrotize/jstructtoavro.py +878 -878
- avrotize/kstructtoavro.py +93 -93
- avrotize/kustotoavro.py +455 -455
- avrotize/parquettoavro.py +157 -157
- avrotize/proto2parser.py +497 -497
- avrotize/proto3parser.py +402 -402
- avrotize/prototoavro.py +382 -382
- avrotize/structuretocsharp.py +2005 -2005
- avrotize/structuretojsons.py +498 -498
- avrotize/structuretopython.py +772 -772
- avrotize/structuretots.py +653 -0
- avrotize/xsdtoavro.py +413 -413
- structurize-2.16.6.dist-info/METADATA +107 -0
- structurize-2.16.6.dist-info/RECORD +52 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/licenses/LICENSE +200 -200
- structurize-2.16.2.dist-info/METADATA +0 -805
- structurize-2.16.2.dist-info/RECORD +0 -51
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/WHEEL +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/top_level.txt +0 -0
avrotize/avrotojsons.py
CHANGED
|
@@ -1,481 +1,481 @@
|
|
|
1
|
-
import copy
|
|
2
|
-
import json
|
|
3
|
-
from typing import Dict, Any, Union, List
|
|
4
|
-
from avrotize.common import build_tree_hash_list, group_by_hash, is_generic_json_type, NodeHashReference
|
|
5
|
-
from functools import reduce
|
|
6
|
-
import jsonpath_ng
|
|
7
|
-
|
|
8
|
-
class AvroToJsonSchemaConverter:
|
|
9
|
-
|
|
10
|
-
def __init__(self, naming_mode: str = 'snake') -> None:
|
|
11
|
-
self.naming_mode = naming_mode
|
|
12
|
-
self.defined_types: Dict[str, Any] = {}
|
|
13
|
-
self.common_namespace = ''
|
|
14
|
-
|
|
15
|
-
def find_common_namespace(self, namespaces: List[str]) -> str:
|
|
16
|
-
"""
|
|
17
|
-
Find the common namespace prefix from a list of namespaces.
|
|
18
|
-
"""
|
|
19
|
-
if not namespaces:
|
|
20
|
-
return ''
|
|
21
|
-
|
|
22
|
-
def common_prefix(a, b):
|
|
23
|
-
prefix = ''
|
|
24
|
-
for a_char, b_char in zip(a.split('.'), b.split('.')):
|
|
25
|
-
if a_char == b_char:
|
|
26
|
-
prefix += a_char + '.'
|
|
27
|
-
else:
|
|
28
|
-
break
|
|
29
|
-
return prefix.rstrip('.')
|
|
30
|
-
|
|
31
|
-
return reduce(common_prefix, namespaces)
|
|
32
|
-
|
|
33
|
-
def update_common_namespace(self, namespace: str) -> None:
|
|
34
|
-
"""
|
|
35
|
-
Update the common namespace based on the provided namespace.
|
|
36
|
-
"""
|
|
37
|
-
if not self.common_namespace:
|
|
38
|
-
self.common_namespace = namespace
|
|
39
|
-
else:
|
|
40
|
-
self.common_namespace = self.find_common_namespace([self.common_namespace, namespace])
|
|
41
|
-
|
|
42
|
-
def get_definition_ref(self, name: str) -> str:
|
|
43
|
-
"""
|
|
44
|
-
Construct the reference string based on the namespace and name.
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
if '.' in name:
|
|
48
|
-
namespace, name = name.rsplit('.', 1)
|
|
49
|
-
else:
|
|
50
|
-
namespace = self.common_namespace
|
|
51
|
-
|
|
52
|
-
if not self.common_namespace:
|
|
53
|
-
return f"#/definitions/{name}"
|
|
54
|
-
|
|
55
|
-
# Remove the common namespace and replace '.' with '/'
|
|
56
|
-
namespace_suffix = namespace[len(self.common_namespace):].lstrip('.')
|
|
57
|
-
path = namespace_suffix.replace('.', '/') if namespace_suffix else ''
|
|
58
|
-
ref = f"#/definitions/{path}/{name}" if path else f"#/definitions/{name}"
|
|
59
|
-
return ref
|
|
60
|
-
|
|
61
|
-
def get_qualified_name(self, avro_type: Dict[str, Any]) -> str:
|
|
62
|
-
"""
|
|
63
|
-
Construct the qualified name based on the namespace and name.
|
|
64
|
-
"""
|
|
65
|
-
return avro_type['name'] if 'namespace' not in avro_type else f"{avro_type['namespace']}.{avro_type['name']}"
|
|
66
|
-
|
|
67
|
-
def avro_primitive_to_json_type(self, avro_type: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
|
|
68
|
-
"""
|
|
69
|
-
Map Avro primitive types to JSON types with appropriate format annotations.
|
|
70
|
-
Handles both standard Avro logical types and Avrotize schema extensions.
|
|
71
|
-
"""
|
|
72
|
-
json_type = {}
|
|
73
|
-
if isinstance(avro_type, dict):
|
|
74
|
-
# Check for logical type before unwrapping the base type
|
|
75
|
-
logical_type = avro_type.get('logicalType')
|
|
76
|
-
base_type = avro_type.get('type', avro_type)
|
|
77
|
-
|
|
78
|
-
# Handle logical types based on their base type
|
|
79
|
-
if logical_type and isinstance(base_type, str):
|
|
80
|
-
# Standard Avro logical types on int/long
|
|
81
|
-
if base_type == 'int' and logical_type == 'date':
|
|
82
|
-
# Standard Avro: int with date logicalType represents days since epoch
|
|
83
|
-
json_type['type'] = 'integer'
|
|
84
|
-
json_type['format'] = 'int32'
|
|
85
|
-
return json_type
|
|
86
|
-
elif base_type == 'int' and logical_type in ['time-millis']:
|
|
87
|
-
# Standard Avro: int with time-millis represents milliseconds since midnight
|
|
88
|
-
json_type['type'] = 'integer'
|
|
89
|
-
json_type['format'] = 'int32'
|
|
90
|
-
return json_type
|
|
91
|
-
elif base_type == 'long' and logical_type in ['time-micros']:
|
|
92
|
-
# Standard Avro: long with time-micros represents microseconds since midnight
|
|
93
|
-
json_type['type'] = 'integer'
|
|
94
|
-
json_type['format'] = 'int64'
|
|
95
|
-
return json_type
|
|
96
|
-
elif base_type == 'long' and logical_type in ['timestamp-millis', 'timestamp-micros']:
|
|
97
|
-
# Standard Avro: long with timestamp represents milliseconds/microseconds since epoch
|
|
98
|
-
json_type['type'] = 'integer'
|
|
99
|
-
json_type['format'] = 'int64'
|
|
100
|
-
return json_type
|
|
101
|
-
# Avrotize schema extensions: string-based logical types
|
|
102
|
-
elif base_type == 'string' and logical_type == 'date':
|
|
103
|
-
# Avrotize extension: string with date logicalType
|
|
104
|
-
json_type['type'] = 'string'
|
|
105
|
-
json_type['format'] = 'date'
|
|
106
|
-
return json_type
|
|
107
|
-
elif base_type == 'string' and logical_type in ['timestamp-millis', 'timestamp-micros', 'datetime']:
|
|
108
|
-
# Avrotize extension: string with datetime logicalType
|
|
109
|
-
json_type['type'] = 'string'
|
|
110
|
-
json_type['format'] = 'date-time'
|
|
111
|
-
return json_type
|
|
112
|
-
elif base_type == 'string' and logical_type in ['time-millis', 'time-micros', 'time']:
|
|
113
|
-
# Avrotize extension: string with time logicalType
|
|
114
|
-
json_type['type'] = 'string'
|
|
115
|
-
json_type['format'] = 'time'
|
|
116
|
-
return json_type
|
|
117
|
-
elif logical_type == 'decimal':
|
|
118
|
-
json_type['type'] = 'number'
|
|
119
|
-
return json_type
|
|
120
|
-
elif logical_type == 'uuid':
|
|
121
|
-
json_type['type'] = 'string'
|
|
122
|
-
json_type['format'] = 'uuid'
|
|
123
|
-
return json_type
|
|
124
|
-
|
|
125
|
-
# If base_type is still a dict, recurse
|
|
126
|
-
if isinstance(base_type, dict):
|
|
127
|
-
if 'logicalType' in base_type:
|
|
128
|
-
return self.avro_primitive_to_json_type(base_type)
|
|
129
|
-
else:
|
|
130
|
-
raise ValueError(f"Avro schema contains unexpected construct {avro_type}")
|
|
131
|
-
|
|
132
|
-
# No logical type or unhandled combination, process base type
|
|
133
|
-
return self.avro_primitive_to_json_type(base_type)
|
|
134
|
-
|
|
135
|
-
mapping = {
|
|
136
|
-
'null': {'type': 'null'},
|
|
137
|
-
'boolean': {'type': 'boolean'},
|
|
138
|
-
'int': {'type': 'integer', 'format': 'int32'},
|
|
139
|
-
'long': {'type': 'integer', 'format': 'int64'},
|
|
140
|
-
'float': {'type': 'number', 'format': 'float'},
|
|
141
|
-
'double': {'type': 'number', 'format': 'double'},
|
|
142
|
-
'bytes': {'type': 'string', 'contentEncoding': 'base64'},
|
|
143
|
-
'string': {'type': 'string'},
|
|
144
|
-
'fixed': {'type': 'string'} # Could specify length in a format or a separate attribute
|
|
145
|
-
}
|
|
146
|
-
type_ref = mapping.get(avro_type, '') # Defaulting to string type for any unknown types
|
|
147
|
-
if not type_ref:
|
|
148
|
-
raise ValueError(f"Avro schema contains unexpected type {avro_type}")
|
|
149
|
-
return type_ref
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def convert_name(self, name: str) -> str:
|
|
153
|
-
"""
|
|
154
|
-
Convert names according to the specified naming mode.
|
|
155
|
-
"""
|
|
156
|
-
if self.naming_mode == 'snake':
|
|
157
|
-
return self.to_snake_case(name)
|
|
158
|
-
elif self.naming_mode == 'camel':
|
|
159
|
-
return self.to_camel_case(name)
|
|
160
|
-
elif self.naming_mode == 'pascal':
|
|
161
|
-
return self.to_pascal_case(name)
|
|
162
|
-
return name
|
|
163
|
-
|
|
164
|
-
@staticmethod
|
|
165
|
-
def to_snake_case(name: str) -> str:
|
|
166
|
-
return ''.join(['_'+c.lower() if c.isupper() else c for c in name]).lstrip('_')
|
|
167
|
-
|
|
168
|
-
@staticmethod
|
|
169
|
-
def to_camel_case(name: str) -> str:
|
|
170
|
-
return ''.join(word.capitalize() if i else word for i, word in enumerate(name.split('_')))
|
|
171
|
-
|
|
172
|
-
@staticmethod
|
|
173
|
-
def to_pascal_case(name: str) -> str:
|
|
174
|
-
return ''.join(word.capitalize() for word in name.split('_'))
|
|
175
|
-
|
|
176
|
-
def is_nullable(self, avro_type: Union[str, Dict[str, Any]]) -> bool:
|
|
177
|
-
"""
|
|
178
|
-
Check if a given Avro type is nullable.
|
|
179
|
-
"""
|
|
180
|
-
if isinstance(avro_type, list):
|
|
181
|
-
return 'null' in avro_type
|
|
182
|
-
return avro_type == 'null'
|
|
183
|
-
|
|
184
|
-
def handle_type_union(self, types: List[Union[str, Dict[str, Any]]]) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
|
|
185
|
-
"""
|
|
186
|
-
Handle Avro type unions, returning a JSON schema that validates against any of the types.
|
|
187
|
-
"""
|
|
188
|
-
non_null_types = [t for t in types if t != 'null']
|
|
189
|
-
if len(non_null_types) == 1:
|
|
190
|
-
# Single non-null type
|
|
191
|
-
return self.parse_avro_schema(non_null_types[0])
|
|
192
|
-
else:
|
|
193
|
-
# Multiple non-null types
|
|
194
|
-
union_types = [self.convert_reference(t) if isinstance(t,str) and t in self.defined_types else self.avro_primitive_to_json_type(t)
|
|
195
|
-
if isinstance(t, str) else self.parse_avro_schema(t)
|
|
196
|
-
for t in non_null_types]
|
|
197
|
-
return {
|
|
198
|
-
'oneOf': union_types
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
def parse_avro_schema(self, avro_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str, is_root = False) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
|
|
202
|
-
"""
|
|
203
|
-
Parse an Avro schema structure and return the corresponding JSON schema.
|
|
204
|
-
"""
|
|
205
|
-
if isinstance(avro_schema, list):
|
|
206
|
-
# Type union
|
|
207
|
-
union = self.handle_type_union(avro_schema)
|
|
208
|
-
if is_root:
|
|
209
|
-
# all the definitions go into 'definitions'
|
|
210
|
-
return {
|
|
211
|
-
"$schema": "http://json-schema.org/draft-07/schema#"
|
|
212
|
-
}
|
|
213
|
-
if is_generic_json_type(union):
|
|
214
|
-
return { "type": "object" }
|
|
215
|
-
else:
|
|
216
|
-
return union
|
|
217
|
-
elif isinstance(avro_schema, dict):
|
|
218
|
-
if 'namespace' in avro_schema:
|
|
219
|
-
namespace = avro_schema['namespace']
|
|
220
|
-
self.update_common_namespace(namespace)
|
|
221
|
-
if avro_schema['type'] == 'record':
|
|
222
|
-
return self.convert_record(avro_schema, is_root)
|
|
223
|
-
elif avro_schema['type'] == 'enum':
|
|
224
|
-
return self.convert_enum(avro_schema, is_root)
|
|
225
|
-
elif avro_schema['type'] == 'fixed':
|
|
226
|
-
return self.convert_fixed(avro_schema, is_root)
|
|
227
|
-
elif avro_schema['type'] == 'array':
|
|
228
|
-
return self.convert_array(avro_schema)
|
|
229
|
-
elif avro_schema['type'] == 'map':
|
|
230
|
-
return self.convert_map(avro_schema)
|
|
231
|
-
elif avro_schema['type'] in self.defined_types:
|
|
232
|
-
# Type reference
|
|
233
|
-
return self.convert_reference(avro_schema)
|
|
234
|
-
else:
|
|
235
|
-
# Nested type or a direct type definition
|
|
236
|
-
return self.parse_avro_schema(avro_schema['type'])
|
|
237
|
-
elif isinstance(avro_schema, str):
|
|
238
|
-
# Primitive type or a reference to a defined type
|
|
239
|
-
if avro_schema in self.defined_types:
|
|
240
|
-
return self.convert_reference(avro_schema)
|
|
241
|
-
elif '.' in avro_schema:
|
|
242
|
-
raise ValueError(f"Unknown type reference {avro_schema}")
|
|
243
|
-
else:
|
|
244
|
-
return self.avro_primitive_to_json_type(avro_schema)
|
|
245
|
-
|
|
246
|
-
def convert_reference(self, avro_schema: Dict[str, Any] | str) -> Dict[str, Any]:
|
|
247
|
-
"""
|
|
248
|
-
Convert a reference to a defined type to a JSON schema object with a reference to the definition.
|
|
249
|
-
"""
|
|
250
|
-
key = avro_schema['type'] if isinstance(avro_schema, dict) else avro_schema
|
|
251
|
-
json_type = self.defined_types[key]
|
|
252
|
-
if 'enum' in json_type:
|
|
253
|
-
return copy.deepcopy(json_type)
|
|
254
|
-
else:
|
|
255
|
-
return {"$ref": self.get_definition_ref(key)}
|
|
256
|
-
|
|
257
|
-
def convert_record(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
|
|
258
|
-
"""
|
|
259
|
-
Convert an Avro record type to a JSON schema object, handling nested types and type definitions.
|
|
260
|
-
"""
|
|
261
|
-
record_name = self.convert_name(avro_schema['name'])
|
|
262
|
-
properties = {}
|
|
263
|
-
required = []
|
|
264
|
-
|
|
265
|
-
json_schema: Dict[str, Any] = {
|
|
266
|
-
"type": "object",
|
|
267
|
-
"title": record_name
|
|
268
|
-
}
|
|
269
|
-
if not is_root:
|
|
270
|
-
self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
|
|
271
|
-
|
|
272
|
-
for field in avro_schema['fields']:
|
|
273
|
-
field_name = self.convert_name(field['name'])
|
|
274
|
-
prop = self.parse_avro_schema(field['type'])
|
|
275
|
-
if 'doc' in field:
|
|
276
|
-
if isinstance(prop, dict):
|
|
277
|
-
prop['description'] = field['doc']
|
|
278
|
-
elif isinstance(prop, list) or isinstance(prop, str):
|
|
279
|
-
prop = {
|
|
280
|
-
'allOf': [
|
|
281
|
-
prop,
|
|
282
|
-
{'description': field['doc']}
|
|
283
|
-
]}
|
|
284
|
-
properties[field_name] = prop
|
|
285
|
-
if not self.is_nullable(field['type']):
|
|
286
|
-
required.append(field_name)
|
|
287
|
-
|
|
288
|
-
if 'doc' in avro_schema:
|
|
289
|
-
json_schema['description'] = avro_schema['doc']
|
|
290
|
-
if properties:
|
|
291
|
-
json_schema['properties'] = properties
|
|
292
|
-
|
|
293
|
-
if required:
|
|
294
|
-
json_schema['required'] = required
|
|
295
|
-
|
|
296
|
-
if not is_root:
|
|
297
|
-
return {"$ref": self.get_definition_ref(self.get_qualified_name(avro_schema))}
|
|
298
|
-
return json_schema
|
|
299
|
-
|
|
300
|
-
def convert_enum(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
|
|
301
|
-
"""
|
|
302
|
-
Convert an Avro enum type to a JSON schema enum, adding the definition to the schema.
|
|
303
|
-
"""
|
|
304
|
-
enum_name = self.convert_name(avro_schema['name'])
|
|
305
|
-
json_schema = {
|
|
306
|
-
"type": "string",
|
|
307
|
-
"enum": avro_schema['symbols'],
|
|
308
|
-
"title": enum_name
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
if 'doc' in avro_schema:
|
|
312
|
-
json_schema['description'] = avro_schema['doc']
|
|
313
|
-
|
|
314
|
-
# Add to defined types
|
|
315
|
-
if not is_root:
|
|
316
|
-
self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
|
|
317
|
-
return json_schema
|
|
318
|
-
|
|
319
|
-
def convert_fixed(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
|
|
320
|
-
"""
|
|
321
|
-
Convert an Avro fixed type to a JSON schema string with length constraints.
|
|
322
|
-
Fixed types are represented as strings with base16 (hex) encoding and
|
|
323
|
-
minLength and maxLength constraints based on the size.
|
|
324
|
-
"""
|
|
325
|
-
fixed_name = self.convert_name(avro_schema['name'])
|
|
326
|
-
size = avro_schema['size']
|
|
327
|
-
# Fixed types in JSON are represented as hex strings, so length is 2 * size
|
|
328
|
-
hex_length = size * 2
|
|
329
|
-
|
|
330
|
-
json_schema = {
|
|
331
|
-
"type": "string",
|
|
332
|
-
"contentEncoding": "base16",
|
|
333
|
-
"minLength": hex_length,
|
|
334
|
-
"maxLength": hex_length,
|
|
335
|
-
"title": fixed_name
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
if 'doc' in avro_schema:
|
|
339
|
-
json_schema['description'] = avro_schema['doc']
|
|
340
|
-
|
|
341
|
-
# Add to defined types
|
|
342
|
-
if not is_root:
|
|
343
|
-
self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
|
|
344
|
-
return {"$ref": self.get_definition_ref(self.get_qualified_name(avro_schema))}
|
|
345
|
-
return json_schema
|
|
346
|
-
|
|
347
|
-
def convert_array(self, avro_schema: Dict[str, Any]) -> Dict[str, Any]:
|
|
348
|
-
"""
|
|
349
|
-
Convert an Avro array type to a JSON schema array.
|
|
350
|
-
"""
|
|
351
|
-
return {
|
|
352
|
-
"type": "array",
|
|
353
|
-
"items": self.parse_avro_schema(avro_schema['items'])
|
|
354
|
-
}
|
|
355
|
-
|
|
356
|
-
def convert_map(self, avro_schema: Dict[str, Any]) -> Dict[str, Any]:
|
|
357
|
-
"""
|
|
358
|
-
Convert an Avro map type to a JSON schema object with additionalProperties.
|
|
359
|
-
"""
|
|
360
|
-
return {
|
|
361
|
-
"type": "object",
|
|
362
|
-
"additionalProperties": self.parse_avro_schema(avro_schema['values'])
|
|
363
|
-
}
|
|
364
|
-
|
|
365
|
-
def convert(self, avro_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
|
|
366
|
-
"""
|
|
367
|
-
Convert the root Avro schema to a JSON schema.
|
|
368
|
-
"""
|
|
369
|
-
json_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str = self.parse_avro_schema(avro_schema, is_root = True)
|
|
370
|
-
|
|
371
|
-
if self.defined_types and isinstance(json_schema, dict):
|
|
372
|
-
for name, definition in self.defined_types.items():
|
|
373
|
-
if isinstance(definition, dict) and 'enum' in definition:
|
|
374
|
-
# enums are inlined
|
|
375
|
-
continue
|
|
376
|
-
current_level = json_schema.setdefault('definitions', {})
|
|
377
|
-
if '.' in name:
|
|
378
|
-
definition_namespace, definition_name = name.rsplit('.',1)
|
|
379
|
-
if not self.common_namespace or (self.common_namespace and definition_namespace == self.common_namespace):
|
|
380
|
-
definition_namespace = ''
|
|
381
|
-
else:
|
|
382
|
-
definition_namespace = definition_namespace[len(self.common_namespace):].lstrip('.')
|
|
383
|
-
# Split the definition_namespace into path segments
|
|
384
|
-
path_segments = definition_namespace.split('.')
|
|
385
|
-
if definition_namespace and len(path_segments) > 0:
|
|
386
|
-
# Traverse through all but the last segment, creating nested dictionaries as needed
|
|
387
|
-
for segment in path_segments:
|
|
388
|
-
# If the segment does not exist, create a new dictionary at that level
|
|
389
|
-
if segment not in current_level:
|
|
390
|
-
current_level[segment] = {}
|
|
391
|
-
# Move deeper into the nested structure
|
|
392
|
-
current_level = current_level[segment]
|
|
393
|
-
else:
|
|
394
|
-
definition_name = name
|
|
395
|
-
current_level[definition_name] = copy.deepcopy(definition)
|
|
396
|
-
|
|
397
|
-
return json_schema
|
|
398
|
-
|
|
399
|
-
def compact_tree(json_schema):
|
|
400
|
-
shared_def_counter = 1
|
|
401
|
-
ignored_hashes = []
|
|
402
|
-
while True:
|
|
403
|
-
thl = build_tree_hash_list(json_schema)
|
|
404
|
-
ghl = group_by_hash(thl)
|
|
405
|
-
if len(ghl) == 0:
|
|
406
|
-
return
|
|
407
|
-
# sort ghl by the count in of the first item in each group
|
|
408
|
-
ghl = dict(sorted(ghl.items(), key=lambda item: -item[1][0].count))
|
|
409
|
-
repeat = True
|
|
410
|
-
while repeat:
|
|
411
|
-
repeat = False
|
|
412
|
-
first_group_key = next((key for key in ghl.keys() if key not in ignored_hashes), None)
|
|
413
|
-
if first_group_key is None:
|
|
414
|
-
return
|
|
415
|
-
ghl_top_item_entries = ghl[first_group_key]
|
|
416
|
-
# sort the items by the shortest .path value
|
|
417
|
-
ghl_top_item_entries = sorted(ghl_top_item_entries, key=lambda item: len(item.path.split('.')))
|
|
418
|
-
top_item_entry: NodeHashReference = ghl_top_item_entries[0]
|
|
419
|
-
top_item_path_segments = top_item_entry.path.split('.')
|
|
420
|
-
if top_item_path_segments[1] == 'definitions' and len(top_item_path_segments) == 3:
|
|
421
|
-
# the top item sits right under definitions, we will merge into that one
|
|
422
|
-
def_key = top_item_path_segments[2]
|
|
423
|
-
ghl_top_item_entries.remove(top_item_entry)
|
|
424
|
-
elif ((top_item_path_segments[-1] == 'options' and top_item_path_segments[-2] == 'properties' and len(top_item_path_segments) > 4) and 'oneOf' in top_item_entry.value):
|
|
425
|
-
# the first case is likely a union we created in j2a that we had to create a top-level item for. We will undo that here.
|
|
426
|
-
json_item = json_schema
|
|
427
|
-
def_key = ''
|
|
428
|
-
for seg in top_item_path_segments[1:-2]:
|
|
429
|
-
def_key += '/' + seg if def_key else seg
|
|
430
|
-
json_item = json_item[seg]
|
|
431
|
-
json_item.clear()
|
|
432
|
-
json_item.update(copy.deepcopy(top_item_entry.value))
|
|
433
|
-
ghl_top_item_entries.remove(top_item_entry)
|
|
434
|
-
elif top_item_path_segments[-2] == 'properties' or top_item_path_segments[-1] == 'properties':
|
|
435
|
-
# the top item is a property of an object, which means that we would create direct
|
|
436
|
-
# links into that object and therefore we will drop that hash
|
|
437
|
-
ignored_hashes.append(first_group_key)
|
|
438
|
-
repeat = True
|
|
439
|
-
continue
|
|
440
|
-
else:
|
|
441
|
-
# the second is indeed a proper type declaration, so we will use the first as the one all other occurrences refer to
|
|
442
|
-
json_item = json_schema
|
|
443
|
-
def_key = ''
|
|
444
|
-
for seg in top_item_path_segments[1:]:
|
|
445
|
-
def_key += '/' + seg if def_key else seg
|
|
446
|
-
ghl_top_item_entries.remove(top_item_entry)
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
for ghl_item in ghl_top_item_entries:
|
|
450
|
-
node = ghl_item.value
|
|
451
|
-
if isinstance(node,dict):
|
|
452
|
-
node.clear()
|
|
453
|
-
node.update({
|
|
454
|
-
'$ref': f"#/{def_key}"
|
|
455
|
-
})
|
|
456
|
-
break
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
def convert_avro_to_json_schema(avro_schema_file: str, json_schema_file: str, naming_mode: str = 'default') -> None:
|
|
461
|
-
"""
|
|
462
|
-
Convert an Avro schema file to a JSON schema file.
|
|
463
|
-
|
|
464
|
-
:param avro_schema_file: The path to the input Avro schema file.
|
|
465
|
-
:param json_schema_file: The path to the output JSON schema file.
|
|
466
|
-
:param naming_mode: The naming mode for converting names ('snake', 'camel', 'pascal').
|
|
467
|
-
"""
|
|
468
|
-
converter = AvroToJsonSchemaConverter(naming_mode)
|
|
469
|
-
|
|
470
|
-
# Read the Avro schema file
|
|
471
|
-
with open(avro_schema_file, 'r') as file:
|
|
472
|
-
avro_schema = json.load(file)
|
|
473
|
-
|
|
474
|
-
# Convert the Avro schema to JSON schema
|
|
475
|
-
json_schema = converter.convert(avro_schema)
|
|
476
|
-
|
|
477
|
-
compact_tree(json_schema)
|
|
478
|
-
# Write the JSON schema to the output file
|
|
479
|
-
with open(json_schema_file, 'w') as file:
|
|
480
|
-
json.dump(json_schema, file, indent=4)
|
|
481
|
-
|
|
1
|
+
import copy
|
|
2
|
+
import json
|
|
3
|
+
from typing import Dict, Any, Union, List
|
|
4
|
+
from avrotize.common import build_tree_hash_list, group_by_hash, is_generic_json_type, NodeHashReference
|
|
5
|
+
from functools import reduce
|
|
6
|
+
import jsonpath_ng
|
|
7
|
+
|
|
8
|
+
class AvroToJsonSchemaConverter:
|
|
9
|
+
|
|
10
|
+
def __init__(self, naming_mode: str = 'snake') -> None:
|
|
11
|
+
self.naming_mode = naming_mode
|
|
12
|
+
self.defined_types: Dict[str, Any] = {}
|
|
13
|
+
self.common_namespace = ''
|
|
14
|
+
|
|
15
|
+
def find_common_namespace(self, namespaces: List[str]) -> str:
|
|
16
|
+
"""
|
|
17
|
+
Find the common namespace prefix from a list of namespaces.
|
|
18
|
+
"""
|
|
19
|
+
if not namespaces:
|
|
20
|
+
return ''
|
|
21
|
+
|
|
22
|
+
def common_prefix(a, b):
|
|
23
|
+
prefix = ''
|
|
24
|
+
for a_char, b_char in zip(a.split('.'), b.split('.')):
|
|
25
|
+
if a_char == b_char:
|
|
26
|
+
prefix += a_char + '.'
|
|
27
|
+
else:
|
|
28
|
+
break
|
|
29
|
+
return prefix.rstrip('.')
|
|
30
|
+
|
|
31
|
+
return reduce(common_prefix, namespaces)
|
|
32
|
+
|
|
33
|
+
def update_common_namespace(self, namespace: str) -> None:
|
|
34
|
+
"""
|
|
35
|
+
Update the common namespace based on the provided namespace.
|
|
36
|
+
"""
|
|
37
|
+
if not self.common_namespace:
|
|
38
|
+
self.common_namespace = namespace
|
|
39
|
+
else:
|
|
40
|
+
self.common_namespace = self.find_common_namespace([self.common_namespace, namespace])
|
|
41
|
+
|
|
42
|
+
def get_definition_ref(self, name: str) -> str:
|
|
43
|
+
"""
|
|
44
|
+
Construct the reference string based on the namespace and name.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
if '.' in name:
|
|
48
|
+
namespace, name = name.rsplit('.', 1)
|
|
49
|
+
else:
|
|
50
|
+
namespace = self.common_namespace
|
|
51
|
+
|
|
52
|
+
if not self.common_namespace:
|
|
53
|
+
return f"#/definitions/{name}"
|
|
54
|
+
|
|
55
|
+
# Remove the common namespace and replace '.' with '/'
|
|
56
|
+
namespace_suffix = namespace[len(self.common_namespace):].lstrip('.')
|
|
57
|
+
path = namespace_suffix.replace('.', '/') if namespace_suffix else ''
|
|
58
|
+
ref = f"#/definitions/{path}/{name}" if path else f"#/definitions/{name}"
|
|
59
|
+
return ref
|
|
60
|
+
|
|
61
|
+
def get_qualified_name(self, avro_type: Dict[str, Any]) -> str:
|
|
62
|
+
"""
|
|
63
|
+
Construct the qualified name based on the namespace and name.
|
|
64
|
+
"""
|
|
65
|
+
return avro_type['name'] if 'namespace' not in avro_type else f"{avro_type['namespace']}.{avro_type['name']}"
|
|
66
|
+
|
|
67
|
+
def avro_primitive_to_json_type(self, avro_type: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
|
|
68
|
+
"""
|
|
69
|
+
Map Avro primitive types to JSON types with appropriate format annotations.
|
|
70
|
+
Handles both standard Avro logical types and Avrotize schema extensions.
|
|
71
|
+
"""
|
|
72
|
+
json_type = {}
|
|
73
|
+
if isinstance(avro_type, dict):
|
|
74
|
+
# Check for logical type before unwrapping the base type
|
|
75
|
+
logical_type = avro_type.get('logicalType')
|
|
76
|
+
base_type = avro_type.get('type', avro_type)
|
|
77
|
+
|
|
78
|
+
# Handle logical types based on their base type
|
|
79
|
+
if logical_type and isinstance(base_type, str):
|
|
80
|
+
# Standard Avro logical types on int/long
|
|
81
|
+
if base_type == 'int' and logical_type == 'date':
|
|
82
|
+
# Standard Avro: int with date logicalType represents days since epoch
|
|
83
|
+
json_type['type'] = 'integer'
|
|
84
|
+
json_type['format'] = 'int32'
|
|
85
|
+
return json_type
|
|
86
|
+
elif base_type == 'int' and logical_type in ['time-millis']:
|
|
87
|
+
# Standard Avro: int with time-millis represents milliseconds since midnight
|
|
88
|
+
json_type['type'] = 'integer'
|
|
89
|
+
json_type['format'] = 'int32'
|
|
90
|
+
return json_type
|
|
91
|
+
elif base_type == 'long' and logical_type in ['time-micros']:
|
|
92
|
+
# Standard Avro: long with time-micros represents microseconds since midnight
|
|
93
|
+
json_type['type'] = 'integer'
|
|
94
|
+
json_type['format'] = 'int64'
|
|
95
|
+
return json_type
|
|
96
|
+
elif base_type == 'long' and logical_type in ['timestamp-millis', 'timestamp-micros']:
|
|
97
|
+
# Standard Avro: long with timestamp represents milliseconds/microseconds since epoch
|
|
98
|
+
json_type['type'] = 'integer'
|
|
99
|
+
json_type['format'] = 'int64'
|
|
100
|
+
return json_type
|
|
101
|
+
# Avrotize schema extensions: string-based logical types
|
|
102
|
+
elif base_type == 'string' and logical_type == 'date':
|
|
103
|
+
# Avrotize extension: string with date logicalType
|
|
104
|
+
json_type['type'] = 'string'
|
|
105
|
+
json_type['format'] = 'date'
|
|
106
|
+
return json_type
|
|
107
|
+
elif base_type == 'string' and logical_type in ['timestamp-millis', 'timestamp-micros', 'datetime']:
|
|
108
|
+
# Avrotize extension: string with datetime logicalType
|
|
109
|
+
json_type['type'] = 'string'
|
|
110
|
+
json_type['format'] = 'date-time'
|
|
111
|
+
return json_type
|
|
112
|
+
elif base_type == 'string' and logical_type in ['time-millis', 'time-micros', 'time']:
|
|
113
|
+
# Avrotize extension: string with time logicalType
|
|
114
|
+
json_type['type'] = 'string'
|
|
115
|
+
json_type['format'] = 'time'
|
|
116
|
+
return json_type
|
|
117
|
+
elif logical_type == 'decimal':
|
|
118
|
+
json_type['type'] = 'number'
|
|
119
|
+
return json_type
|
|
120
|
+
elif logical_type == 'uuid':
|
|
121
|
+
json_type['type'] = 'string'
|
|
122
|
+
json_type['format'] = 'uuid'
|
|
123
|
+
return json_type
|
|
124
|
+
|
|
125
|
+
# If base_type is still a dict, recurse
|
|
126
|
+
if isinstance(base_type, dict):
|
|
127
|
+
if 'logicalType' in base_type:
|
|
128
|
+
return self.avro_primitive_to_json_type(base_type)
|
|
129
|
+
else:
|
|
130
|
+
raise ValueError(f"Avro schema contains unexpected construct {avro_type}")
|
|
131
|
+
|
|
132
|
+
# No logical type or unhandled combination, process base type
|
|
133
|
+
return self.avro_primitive_to_json_type(base_type)
|
|
134
|
+
|
|
135
|
+
mapping = {
|
|
136
|
+
'null': {'type': 'null'},
|
|
137
|
+
'boolean': {'type': 'boolean'},
|
|
138
|
+
'int': {'type': 'integer', 'format': 'int32'},
|
|
139
|
+
'long': {'type': 'integer', 'format': 'int64'},
|
|
140
|
+
'float': {'type': 'number', 'format': 'float'},
|
|
141
|
+
'double': {'type': 'number', 'format': 'double'},
|
|
142
|
+
'bytes': {'type': 'string', 'contentEncoding': 'base64'},
|
|
143
|
+
'string': {'type': 'string'},
|
|
144
|
+
'fixed': {'type': 'string'} # Could specify length in a format or a separate attribute
|
|
145
|
+
}
|
|
146
|
+
type_ref = mapping.get(avro_type, '') # Defaulting to string type for any unknown types
|
|
147
|
+
if not type_ref:
|
|
148
|
+
raise ValueError(f"Avro schema contains unexpected type {avro_type}")
|
|
149
|
+
return type_ref
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def convert_name(self, name: str) -> str:
|
|
153
|
+
"""
|
|
154
|
+
Convert names according to the specified naming mode.
|
|
155
|
+
"""
|
|
156
|
+
if self.naming_mode == 'snake':
|
|
157
|
+
return self.to_snake_case(name)
|
|
158
|
+
elif self.naming_mode == 'camel':
|
|
159
|
+
return self.to_camel_case(name)
|
|
160
|
+
elif self.naming_mode == 'pascal':
|
|
161
|
+
return self.to_pascal_case(name)
|
|
162
|
+
return name
|
|
163
|
+
|
|
164
|
+
@staticmethod
|
|
165
|
+
def to_snake_case(name: str) -> str:
|
|
166
|
+
return ''.join(['_'+c.lower() if c.isupper() else c for c in name]).lstrip('_')
|
|
167
|
+
|
|
168
|
+
@staticmethod
|
|
169
|
+
def to_camel_case(name: str) -> str:
|
|
170
|
+
return ''.join(word.capitalize() if i else word for i, word in enumerate(name.split('_')))
|
|
171
|
+
|
|
172
|
+
@staticmethod
|
|
173
|
+
def to_pascal_case(name: str) -> str:
|
|
174
|
+
return ''.join(word.capitalize() for word in name.split('_'))
|
|
175
|
+
|
|
176
|
+
def is_nullable(self, avro_type: Union[str, Dict[str, Any]]) -> bool:
    """
    Tell whether an Avro type admits null.

    A list (type union) is nullable when it contains 'null'; anything else
    is nullable only when it equals 'null'.
    """
    if not isinstance(avro_type, list):
        return avro_type == 'null'
    return 'null' in avro_type
|
|
183
|
+
|
|
184
|
+
def handle_type_union(self, types: List[Union[str, Dict[str, Any]]]) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
|
|
185
|
+
"""
|
|
186
|
+
Handle Avro type unions, returning a JSON schema that validates against any of the types.
|
|
187
|
+
"""
|
|
188
|
+
non_null_types = [t for t in types if t != 'null']
|
|
189
|
+
if len(non_null_types) == 1:
|
|
190
|
+
# Single non-null type
|
|
191
|
+
return self.parse_avro_schema(non_null_types[0])
|
|
192
|
+
else:
|
|
193
|
+
# Multiple non-null types
|
|
194
|
+
union_types = [self.convert_reference(t) if isinstance(t,str) and t in self.defined_types else self.avro_primitive_to_json_type(t)
|
|
195
|
+
if isinstance(t, str) else self.parse_avro_schema(t)
|
|
196
|
+
for t in non_null_types]
|
|
197
|
+
return {
|
|
198
|
+
'oneOf': union_types
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
def parse_avro_schema(self, avro_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str, is_root = False) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
|
|
202
|
+
"""
|
|
203
|
+
Parse an Avro schema structure and return the corresponding JSON schema.
|
|
204
|
+
"""
|
|
205
|
+
if isinstance(avro_schema, list):
|
|
206
|
+
# Type union
|
|
207
|
+
union = self.handle_type_union(avro_schema)
|
|
208
|
+
if is_root:
|
|
209
|
+
# all the definitions go into 'definitions'
|
|
210
|
+
return {
|
|
211
|
+
"$schema": "http://json-schema.org/draft-07/schema#"
|
|
212
|
+
}
|
|
213
|
+
if is_generic_json_type(union):
|
|
214
|
+
return { "type": "object" }
|
|
215
|
+
else:
|
|
216
|
+
return union
|
|
217
|
+
elif isinstance(avro_schema, dict):
|
|
218
|
+
if 'namespace' in avro_schema:
|
|
219
|
+
namespace = avro_schema['namespace']
|
|
220
|
+
self.update_common_namespace(namespace)
|
|
221
|
+
if avro_schema['type'] == 'record':
|
|
222
|
+
return self.convert_record(avro_schema, is_root)
|
|
223
|
+
elif avro_schema['type'] == 'enum':
|
|
224
|
+
return self.convert_enum(avro_schema, is_root)
|
|
225
|
+
elif avro_schema['type'] == 'fixed':
|
|
226
|
+
return self.convert_fixed(avro_schema, is_root)
|
|
227
|
+
elif avro_schema['type'] == 'array':
|
|
228
|
+
return self.convert_array(avro_schema)
|
|
229
|
+
elif avro_schema['type'] == 'map':
|
|
230
|
+
return self.convert_map(avro_schema)
|
|
231
|
+
elif avro_schema['type'] in self.defined_types:
|
|
232
|
+
# Type reference
|
|
233
|
+
return self.convert_reference(avro_schema)
|
|
234
|
+
else:
|
|
235
|
+
# Nested type or a direct type definition
|
|
236
|
+
return self.parse_avro_schema(avro_schema['type'])
|
|
237
|
+
elif isinstance(avro_schema, str):
|
|
238
|
+
# Primitive type or a reference to a defined type
|
|
239
|
+
if avro_schema in self.defined_types:
|
|
240
|
+
return self.convert_reference(avro_schema)
|
|
241
|
+
elif '.' in avro_schema:
|
|
242
|
+
raise ValueError(f"Unknown type reference {avro_schema}")
|
|
243
|
+
else:
|
|
244
|
+
return self.avro_primitive_to_json_type(avro_schema)
|
|
245
|
+
|
|
246
|
+
def convert_reference(self, avro_schema: Dict[str, Any] | str) -> Dict[str, Any]:
|
|
247
|
+
"""
|
|
248
|
+
Convert a reference to a defined type to a JSON schema object with a reference to the definition.
|
|
249
|
+
"""
|
|
250
|
+
key = avro_schema['type'] if isinstance(avro_schema, dict) else avro_schema
|
|
251
|
+
json_type = self.defined_types[key]
|
|
252
|
+
if 'enum' in json_type:
|
|
253
|
+
return copy.deepcopy(json_type)
|
|
254
|
+
else:
|
|
255
|
+
return {"$ref": self.get_definition_ref(key)}
|
|
256
|
+
|
|
257
|
+
def convert_record(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
|
|
258
|
+
"""
|
|
259
|
+
Convert an Avro record type to a JSON schema object, handling nested types and type definitions.
|
|
260
|
+
"""
|
|
261
|
+
record_name = self.convert_name(avro_schema['name'])
|
|
262
|
+
properties = {}
|
|
263
|
+
required = []
|
|
264
|
+
|
|
265
|
+
json_schema: Dict[str, Any] = {
|
|
266
|
+
"type": "object",
|
|
267
|
+
"title": record_name
|
|
268
|
+
}
|
|
269
|
+
if not is_root:
|
|
270
|
+
self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
|
|
271
|
+
|
|
272
|
+
for field in avro_schema['fields']:
|
|
273
|
+
field_name = self.convert_name(field['name'])
|
|
274
|
+
prop = self.parse_avro_schema(field['type'])
|
|
275
|
+
if 'doc' in field:
|
|
276
|
+
if isinstance(prop, dict):
|
|
277
|
+
prop['description'] = field['doc']
|
|
278
|
+
elif isinstance(prop, list) or isinstance(prop, str):
|
|
279
|
+
prop = {
|
|
280
|
+
'allOf': [
|
|
281
|
+
prop,
|
|
282
|
+
{'description': field['doc']}
|
|
283
|
+
]}
|
|
284
|
+
properties[field_name] = prop
|
|
285
|
+
if not self.is_nullable(field['type']):
|
|
286
|
+
required.append(field_name)
|
|
287
|
+
|
|
288
|
+
if 'doc' in avro_schema:
|
|
289
|
+
json_schema['description'] = avro_schema['doc']
|
|
290
|
+
if properties:
|
|
291
|
+
json_schema['properties'] = properties
|
|
292
|
+
|
|
293
|
+
if required:
|
|
294
|
+
json_schema['required'] = required
|
|
295
|
+
|
|
296
|
+
if not is_root:
|
|
297
|
+
return {"$ref": self.get_definition_ref(self.get_qualified_name(avro_schema))}
|
|
298
|
+
return json_schema
|
|
299
|
+
|
|
300
|
+
def convert_enum(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
|
|
301
|
+
"""
|
|
302
|
+
Convert an Avro enum type to a JSON schema enum, adding the definition to the schema.
|
|
303
|
+
"""
|
|
304
|
+
enum_name = self.convert_name(avro_schema['name'])
|
|
305
|
+
json_schema = {
|
|
306
|
+
"type": "string",
|
|
307
|
+
"enum": avro_schema['symbols'],
|
|
308
|
+
"title": enum_name
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
if 'doc' in avro_schema:
|
|
312
|
+
json_schema['description'] = avro_schema['doc']
|
|
313
|
+
|
|
314
|
+
# Add to defined types
|
|
315
|
+
if not is_root:
|
|
316
|
+
self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
|
|
317
|
+
return json_schema
|
|
318
|
+
|
|
319
|
+
def convert_fixed(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
|
|
320
|
+
"""
|
|
321
|
+
Convert an Avro fixed type to a JSON schema string with length constraints.
|
|
322
|
+
Fixed types are represented as strings with base16 (hex) encoding and
|
|
323
|
+
minLength and maxLength constraints based on the size.
|
|
324
|
+
"""
|
|
325
|
+
fixed_name = self.convert_name(avro_schema['name'])
|
|
326
|
+
size = avro_schema['size']
|
|
327
|
+
# Fixed types in JSON are represented as hex strings, so length is 2 * size
|
|
328
|
+
hex_length = size * 2
|
|
329
|
+
|
|
330
|
+
json_schema = {
|
|
331
|
+
"type": "string",
|
|
332
|
+
"contentEncoding": "base16",
|
|
333
|
+
"minLength": hex_length,
|
|
334
|
+
"maxLength": hex_length,
|
|
335
|
+
"title": fixed_name
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
if 'doc' in avro_schema:
|
|
339
|
+
json_schema['description'] = avro_schema['doc']
|
|
340
|
+
|
|
341
|
+
# Add to defined types
|
|
342
|
+
if not is_root:
|
|
343
|
+
self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
|
|
344
|
+
return {"$ref": self.get_definition_ref(self.get_qualified_name(avro_schema))}
|
|
345
|
+
return json_schema
|
|
346
|
+
|
|
347
|
+
def convert_array(self, avro_schema: Dict[str, Any]) -> Dict[str, Any]:
|
|
348
|
+
"""
|
|
349
|
+
Convert an Avro array type to a JSON schema array.
|
|
350
|
+
"""
|
|
351
|
+
return {
|
|
352
|
+
"type": "array",
|
|
353
|
+
"items": self.parse_avro_schema(avro_schema['items'])
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
def convert_map(self, avro_schema: Dict[str, Any]) -> Dict[str, Any]:
|
|
357
|
+
"""
|
|
358
|
+
Convert an Avro map type to a JSON schema object with additionalProperties.
|
|
359
|
+
"""
|
|
360
|
+
return {
|
|
361
|
+
"type": "object",
|
|
362
|
+
"additionalProperties": self.parse_avro_schema(avro_schema['values'])
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
def convert(self, avro_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
|
|
366
|
+
"""
|
|
367
|
+
Convert the root Avro schema to a JSON schema.
|
|
368
|
+
"""
|
|
369
|
+
json_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str = self.parse_avro_schema(avro_schema, is_root = True)
|
|
370
|
+
|
|
371
|
+
if self.defined_types and isinstance(json_schema, dict):
|
|
372
|
+
for name, definition in self.defined_types.items():
|
|
373
|
+
if isinstance(definition, dict) and 'enum' in definition:
|
|
374
|
+
# enums are inlined
|
|
375
|
+
continue
|
|
376
|
+
current_level = json_schema.setdefault('definitions', {})
|
|
377
|
+
if '.' in name:
|
|
378
|
+
definition_namespace, definition_name = name.rsplit('.',1)
|
|
379
|
+
if not self.common_namespace or (self.common_namespace and definition_namespace == self.common_namespace):
|
|
380
|
+
definition_namespace = ''
|
|
381
|
+
else:
|
|
382
|
+
definition_namespace = definition_namespace[len(self.common_namespace):].lstrip('.')
|
|
383
|
+
# Split the definition_namespace into path segments
|
|
384
|
+
path_segments = definition_namespace.split('.')
|
|
385
|
+
if definition_namespace and len(path_segments) > 0:
|
|
386
|
+
# Traverse through all but the last segment, creating nested dictionaries as needed
|
|
387
|
+
for segment in path_segments:
|
|
388
|
+
# If the segment does not exist, create a new dictionary at that level
|
|
389
|
+
if segment not in current_level:
|
|
390
|
+
current_level[segment] = {}
|
|
391
|
+
# Move deeper into the nested structure
|
|
392
|
+
current_level = current_level[segment]
|
|
393
|
+
else:
|
|
394
|
+
definition_name = name
|
|
395
|
+
current_level[definition_name] = copy.deepcopy(definition)
|
|
396
|
+
|
|
397
|
+
return json_schema
|
|
398
|
+
|
|
399
|
+
def compact_tree(json_schema):
    """
    Replace repeated sub-trees of ``json_schema`` with '$ref' pointers, in place.

    Each pass hashes the tree with ``build_tree_hash_list`` and groups the
    entries with ``group_by_hash``. Groups are ordered by the descending
    ``count`` of their first entry, and the first group whose hash is not in
    the ignore list is processed: its entry with the fewest path segments is
    taken as the canonical one, and every other dict-valued entry is emptied
    and turned into ``{'$ref': '#/<def_key>'}``.

    NOTE(review): presumably ``group_by_hash`` only returns groups of nodes
    that occur more than once and entry paths look like ``$.definitions.X``
    -- confirm against the helper implementations.

    :param json_schema: The JSON schema tree to compact; it is mutated.
    :return: None.
    """
    # NOTE(review): shared_def_counter is never read within this function
    shared_def_counter = 1
    # Hashes of groups that must not be turned into references
    ignored_hashes = []
    while True:
        thl = build_tree_hash_list(json_schema)
        ghl = group_by_hash(thl)
        if len(ghl) == 0:
            return
        # sort ghl by the count of the first item in each group, highest first
        ghl = dict(sorted(ghl.items(), key=lambda item: -item[1][0].count))
        repeat = True
        while repeat:
            repeat = False
            # Pick the first group that has not been ruled out; stop when none is left
            first_group_key = next((key for key in ghl.keys() if key not in ignored_hashes), None)
            if first_group_key is None:
                return
            ghl_top_item_entries = ghl[first_group_key]
            # sort the items by the shortest .path value (fewest '.'-separated segments)
            ghl_top_item_entries = sorted(ghl_top_item_entries, key=lambda item: len(item.path.split('.')))
            top_item_entry: NodeHashReference = ghl_top_item_entries[0]
            top_item_path_segments = top_item_entry.path.split('.')
            if top_item_path_segments[1] == 'definitions' and len(top_item_path_segments) == 3:
                # the top item sits right under definitions, we will merge into that one
                def_key = top_item_path_segments[2]
                ghl_top_item_entries.remove(top_item_entry)
            elif ((top_item_path_segments[-1] == 'options' and top_item_path_segments[-2] == 'properties' and len(top_item_path_segments) > 4) and 'oneOf' in top_item_entry.value):
                # the first case is likely a union we created in j2a that we had to create a top-level item for. We will undo that here.
                json_item = json_schema
                def_key = ''
                # Walk to the node two levels above the 'options' property,
                # building the '/'-separated reference path on the way
                for seg in top_item_path_segments[1:-2]:
                    def_key += '/' + seg if def_key else seg
                    json_item = json_item[seg]
                # Replace that ancestor's content with a copy of the union itself
                json_item.clear()
                json_item.update(copy.deepcopy(top_item_entry.value))
                ghl_top_item_entries.remove(top_item_entry)
            elif top_item_path_segments[-2] == 'properties' or top_item_path_segments[-1] == 'properties':
                # the top item is a property of an object, which means that we would create direct
                # links into that object and therefore we will drop that hash
                ignored_hashes.append(first_group_key)
                repeat = True
                continue
            else:
                # the second is indeed a proper type declaration, so we will use the first as the one all other occurrences refer to
                json_item = json_schema
                def_key = ''
                for seg in top_item_path_segments[1:]:
                    def_key += '/' + seg if def_key else seg
                ghl_top_item_entries.remove(top_item_entry)


            # Point every remaining dict-valued occurrence at the canonical entry
            for ghl_item in ghl_top_item_entries:
                node = ghl_item.value
                if isinstance(node,dict):
                    node.clear()
                    node.update({
                        '$ref': f"#/{def_key}"
                    })
            # Leave the inner loop; the outer loop then re-hashes the modified tree
            break
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def convert_avro_to_json_schema(avro_schema_file: str, json_schema_file: str, naming_mode: str = 'default') -> None:
    """
    Convert an Avro schema file to a JSON schema file.

    :param avro_schema_file: The path to the input Avro schema file.
    :param json_schema_file: The path to the output JSON schema file.
    :param naming_mode: The naming mode for converting names ('snake', 'camel', 'pascal');
        any other value, including the default, leaves names unchanged.
    """
    converter = AvroToJsonSchemaConverter(naming_mode)

    # Read the Avro schema file. The encoding is given explicitly so that
    # parsing does not depend on the platform's locale default.
    with open(avro_schema_file, 'r', encoding='utf-8') as file:
        avro_schema = json.load(file)

    # Convert the Avro schema to JSON schema
    json_schema = converter.convert(avro_schema)

    # Replace repeated sub-trees with references; json_schema is modified in place
    compact_tree(json_schema)
    # Write the JSON schema to the output file
    with open(json_schema_file, 'w', encoding='utf-8') as file:
        json.dump(json_schema, file, indent=4)
|
|
481
|
+
|