structurize 2.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +64 -0
- avrotize/__main__.py +6 -0
- avrotize/_version.py +34 -0
- avrotize/asn1toavro.py +160 -0
- avrotize/avrotize.py +152 -0
- avrotize/avrotocpp.py +483 -0
- avrotize/avrotocsharp.py +1075 -0
- avrotize/avrotocsv.py +121 -0
- avrotize/avrotodatapackage.py +173 -0
- avrotize/avrotodb.py +1383 -0
- avrotize/avrotogo.py +476 -0
- avrotize/avrotographql.py +197 -0
- avrotize/avrotoiceberg.py +210 -0
- avrotize/avrotojava.py +2156 -0
- avrotize/avrotojs.py +250 -0
- avrotize/avrotojsons.py +481 -0
- avrotize/avrotojstruct.py +345 -0
- avrotize/avrotokusto.py +364 -0
- avrotize/avrotomd.py +137 -0
- avrotize/avrotools.py +168 -0
- avrotize/avrotoparquet.py +208 -0
- avrotize/avrotoproto.py +359 -0
- avrotize/avrotopython.py +624 -0
- avrotize/avrotorust.py +435 -0
- avrotize/avrotots.py +598 -0
- avrotize/avrotoxsd.py +344 -0
- avrotize/cddltostructure.py +1841 -0
- avrotize/commands.json +3337 -0
- avrotize/common.py +834 -0
- avrotize/constants.py +72 -0
- avrotize/csvtoavro.py +132 -0
- avrotize/datapackagetoavro.py +76 -0
- avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
- avrotize/dependencies/typescript/node22/package.json +16 -0
- avrotize/dependency_resolver.py +348 -0
- avrotize/dependency_version.py +432 -0
- avrotize/jsonstoavro.py +2167 -0
- avrotize/jsonstostructure.py +2642 -0
- avrotize/jstructtoavro.py +878 -0
- avrotize/kstructtoavro.py +93 -0
- avrotize/kustotoavro.py +455 -0
- avrotize/parquettoavro.py +157 -0
- avrotize/proto2parser.py +498 -0
- avrotize/proto3parser.py +403 -0
- avrotize/prototoavro.py +382 -0
- avrotize/structuretocddl.py +597 -0
- avrotize/structuretocpp.py +697 -0
- avrotize/structuretocsharp.py +2295 -0
- avrotize/structuretocsv.py +365 -0
- avrotize/structuretodatapackage.py +659 -0
- avrotize/structuretodb.py +1125 -0
- avrotize/structuretogo.py +720 -0
- avrotize/structuretographql.py +502 -0
- avrotize/structuretoiceberg.py +355 -0
- avrotize/structuretojava.py +853 -0
- avrotize/structuretojsons.py +498 -0
- avrotize/structuretokusto.py +639 -0
- avrotize/structuretomd.py +322 -0
- avrotize/structuretoproto.py +764 -0
- avrotize/structuretopython.py +772 -0
- avrotize/structuretorust.py +714 -0
- avrotize/structuretots.py +653 -0
- avrotize/structuretoxsd.py +679 -0
- avrotize/xsdtoavro.py +413 -0
- structurize-2.19.0.dist-info/METADATA +107 -0
- structurize-2.19.0.dist-info/RECORD +70 -0
- structurize-2.19.0.dist-info/WHEEL +5 -0
- structurize-2.19.0.dist-info/entry_points.txt +2 -0
- structurize-2.19.0.dist-info/licenses/LICENSE +201 -0
- structurize-2.19.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,878 @@
|
|
|
1
|
+
"""
|
|
2
|
+
JSON Structure to Avro Schema Converter
|
|
3
|
+
|
|
4
|
+
Converts JSON Structure documents to Apache Avro schema format.
|
|
5
|
+
This is the reverse operation of avrotojstruct.py.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from typing import Any, Dict, List, Union, Optional
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JsonStructureToAvro:
|
|
13
|
+
"""
|
|
14
|
+
Convert JSON Structure documents to Avro schema format.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self) -> None:
|
|
18
|
+
"""Initialize the converter."""
|
|
19
|
+
self.structure_doc: Optional[Dict[str, Any]] = None
|
|
20
|
+
self.converted_types: Dict[str, Dict[str, Any]] = {}
|
|
21
|
+
|
|
22
|
+
def convert(self, structure_schema: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Convert a JSON Structure document to Avro schema.

    Two document shapes are accepted:
      * an inline type at the root ('type' key present), optionally
        accompanied by 'definitions' for referenced types;
      * a '$root' reference pointing into 'definitions'.

    Args:
        structure_schema: The JSON Structure document

    Returns:
        Avro schema (dict), or a list of schemas when more than one
        named type is produced.

    Raises:
        ValueError: if neither 'type' nor '$root' is present, if
            '$root' is used without definitions, or if the '$root'
            target is not found among the converted definitions.
    """
    # Reset per-call state so the converter instance is reusable.
    self.structure_doc = structure_schema
    self.converted_types.clear()

    # Check if this is an inline type (type at root) or uses $root
    root_ref = structure_schema.get('$root')
    has_inline_type = 'type' in structure_schema

    if has_inline_type:
        # Inline type at root - convert directly
        name = structure_schema.get('name', 'Root')
        namespace = None  # Root level doesn't have namespace

        # Also convert any definitions that might be referenced
        definitions = structure_schema.get('definitions', {})
        if definitions:
            for def_path, def_schema in self._flatten_definitions(definitions).items():
                self._convert_definition(def_path, def_schema)

        root_schema = self._convert_type_from_schema(structure_schema, namespace, name)

        # If there are referenced types, return all as a list
        if self.converted_types:
            # Filter out abstract types.
            # NOTE(review): _convert_definition returns abstract
            # placeholders without registering them in converted_types,
            # so this filter looks purely defensive — confirm.
            concrete_types = [schema for schema in self.converted_types.values()
                              if not (schema.get('type') == 'null' and 'Abstract type' in schema.get('doc', ''))]
            return [root_schema] + concrete_types if concrete_types else root_schema

        return root_schema

    if not root_ref:
        raise ValueError("JSON Structure document must have either 'type' or '$root' property")

    # Extract definitions
    definitions = structure_schema.get('definitions', {})
    if not definitions:
        raise ValueError("JSON Structure document with $root must have definitions")

    # Convert all definitions first
    for def_path, def_schema in self._flatten_definitions(definitions).items():
        self._convert_definition(def_path, def_schema)

    # Get the root schema: strip the JSON Pointer prefix to recover
    # the slash-separated definition path used as the registry key.
    root_path = root_ref.replace('#/definitions/', '')
    root_schema = self.converted_types.get(root_path)

    if not root_schema:
        raise ValueError(f"Root type {root_path} not found in converted types")

    # Return single schema or list depending on how many types were defined
    if len(self.converted_types) == 1:
        return root_schema
    else:
        # Return all schemas as a list (insertion order; the root is
        # not guaranteed to be first).
        return list(self.converted_types.values())
|
|
86
|
+
|
|
87
|
+
def _flatten_definitions(self, definitions: Dict[str, Any], prefix: str = '') -> Dict[str, Dict[str, Any]]:
|
|
88
|
+
"""
|
|
89
|
+
Flatten nested definitions into a flat dictionary with paths as keys.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
definitions: Nested definitions dictionary
|
|
93
|
+
prefix: Current path prefix
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
Flattened dictionary {path: definition_schema}
|
|
97
|
+
"""
|
|
98
|
+
flattened = {}
|
|
99
|
+
|
|
100
|
+
for key, value in definitions.items():
|
|
101
|
+
path = f"{prefix}/{key}" if prefix else key
|
|
102
|
+
|
|
103
|
+
if isinstance(value, dict):
|
|
104
|
+
# Check if this is a type definition (has 'type' or other schema properties)
|
|
105
|
+
if 'type' in value or 'oneOf' in value or 'allOf' in value:
|
|
106
|
+
flattened[path] = value
|
|
107
|
+
else:
|
|
108
|
+
# It's a namespace, recurse
|
|
109
|
+
flattened.update(self._flatten_definitions(value, path))
|
|
110
|
+
|
|
111
|
+
return flattened
|
|
112
|
+
|
|
113
|
+
def _resolve_base_schema(self, ref: str) -> Optional[Dict[str, Any]]:
|
|
114
|
+
"""
|
|
115
|
+
Resolve a $ref to its schema definition.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
ref: Reference string like "#/definitions/BaseEntity"
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
The resolved schema or None if not found
|
|
122
|
+
"""
|
|
123
|
+
if not ref.startswith('#/definitions/'):
|
|
124
|
+
return None
|
|
125
|
+
|
|
126
|
+
if not self.structure_doc:
|
|
127
|
+
return None
|
|
128
|
+
|
|
129
|
+
ref_path = ref.replace('#/definitions/', '')
|
|
130
|
+
definitions = self.structure_doc.get('definitions', {})
|
|
131
|
+
|
|
132
|
+
# Navigate through nested definitions
|
|
133
|
+
parts = ref_path.split('/')
|
|
134
|
+
current = definitions
|
|
135
|
+
for part in parts:
|
|
136
|
+
if isinstance(current, dict) and part in current:
|
|
137
|
+
current = current[part]
|
|
138
|
+
else:
|
|
139
|
+
return None
|
|
140
|
+
|
|
141
|
+
return current if isinstance(current, dict) else None
|
|
142
|
+
|
|
143
|
+
def _merge_base_properties(self, schema: Dict[str, Any]) -> Dict[str, Any]:
|
|
144
|
+
"""
|
|
145
|
+
Merge properties from base type(s) via $extends.
|
|
146
|
+
|
|
147
|
+
Args:
|
|
148
|
+
schema: Type schema that may have $extends
|
|
149
|
+
|
|
150
|
+
Returns:
|
|
151
|
+
Schema with merged properties
|
|
152
|
+
"""
|
|
153
|
+
extends_ref = schema.get('$extends')
|
|
154
|
+
if not extends_ref:
|
|
155
|
+
return schema
|
|
156
|
+
|
|
157
|
+
# Resolve the base type
|
|
158
|
+
base_schema = self._resolve_base_schema(extends_ref)
|
|
159
|
+
if not base_schema:
|
|
160
|
+
return schema
|
|
161
|
+
|
|
162
|
+
# Recursively merge base's base
|
|
163
|
+
base_schema = self._merge_base_properties(base_schema)
|
|
164
|
+
|
|
165
|
+
# Create merged schema
|
|
166
|
+
merged = dict(schema)
|
|
167
|
+
|
|
168
|
+
# Merge properties - child properties override base
|
|
169
|
+
base_properties = base_schema.get('properties', {})
|
|
170
|
+
child_properties = schema.get('properties', {})
|
|
171
|
+
|
|
172
|
+
if base_properties or child_properties:
|
|
173
|
+
merged['properties'] = {**base_properties, **child_properties}
|
|
174
|
+
|
|
175
|
+
# Merge required fields
|
|
176
|
+
base_required = base_schema.get('required', [])
|
|
177
|
+
child_required = schema.get('required', [])
|
|
178
|
+
|
|
179
|
+
if base_required or child_required:
|
|
180
|
+
# Combine and deduplicate
|
|
181
|
+
all_required = list(set(base_required + child_required))
|
|
182
|
+
merged['required'] = all_required
|
|
183
|
+
|
|
184
|
+
# Add note about inheritance in description (only if not already present)
|
|
185
|
+
if base_schema.get('abstract'):
|
|
186
|
+
base_name = extends_ref.split('/')[-1]
|
|
187
|
+
note = f"(extends abstract {base_name})"
|
|
188
|
+
if 'description' in merged and merged['description']:
|
|
189
|
+
# Only add if not already in description
|
|
190
|
+
if "extends abstract" not in merged['description'].lower():
|
|
191
|
+
merged['description'] = f"{merged['description']} {note}"
|
|
192
|
+
else:
|
|
193
|
+
merged['description'] = f"Extends abstract {base_name}"
|
|
194
|
+
|
|
195
|
+
return merged
|
|
196
|
+
|
|
197
|
+
def _build_doc_with_annotations(self, schema: Dict[str, Any], base_doc: Optional[str] = None) -> Optional[str]:
|
|
198
|
+
"""
|
|
199
|
+
Build documentation string including constraint annotations.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
schema: Property schema with possible annotations
|
|
203
|
+
base_doc: Base documentation from description field
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
Enhanced documentation string or None
|
|
207
|
+
"""
|
|
208
|
+
parts = []
|
|
209
|
+
|
|
210
|
+
if base_doc:
|
|
211
|
+
parts.append(base_doc)
|
|
212
|
+
|
|
213
|
+
# Add constraint annotations
|
|
214
|
+
annotations = []
|
|
215
|
+
|
|
216
|
+
if 'maxLength' in schema:
|
|
217
|
+
annotations.append(f"maxLength: {schema['maxLength']}")
|
|
218
|
+
|
|
219
|
+
if 'minLength' in schema:
|
|
220
|
+
annotations.append(f"minLength: {schema['minLength']}")
|
|
221
|
+
|
|
222
|
+
if 'precision' in schema:
|
|
223
|
+
annotations.append(f"precision: {schema['precision']}")
|
|
224
|
+
|
|
225
|
+
if 'scale' in schema:
|
|
226
|
+
annotations.append(f"scale: {schema['scale']}")
|
|
227
|
+
|
|
228
|
+
if 'pattern' in schema:
|
|
229
|
+
annotations.append(f"pattern: {schema['pattern']}")
|
|
230
|
+
|
|
231
|
+
if 'minimum' in schema:
|
|
232
|
+
annotations.append(f"minimum: {schema['minimum']}")
|
|
233
|
+
|
|
234
|
+
if 'maximum' in schema:
|
|
235
|
+
annotations.append(f"maximum: {schema['maximum']}")
|
|
236
|
+
|
|
237
|
+
if 'contentEncoding' in schema:
|
|
238
|
+
annotations.append(f"encoding: {schema['contentEncoding']}")
|
|
239
|
+
|
|
240
|
+
if 'contentMediaType' in schema:
|
|
241
|
+
annotations.append(f"mediaType: {schema['contentMediaType']}")
|
|
242
|
+
|
|
243
|
+
if 'contentCompression' in schema:
|
|
244
|
+
annotations.append(f"compression: {schema['contentCompression']}")
|
|
245
|
+
|
|
246
|
+
if annotations:
|
|
247
|
+
parts.append(f"[{', '.join(annotations)}]")
|
|
248
|
+
|
|
249
|
+
return ' '.join(parts) if parts else None
|
|
250
|
+
|
|
251
|
+
def _convert_definition(self, def_path: str, def_schema: Dict[str, Any]) -> Dict[str, Any]:
|
|
252
|
+
"""
|
|
253
|
+
Convert a single type definition from JSON Structure to Avro.
|
|
254
|
+
|
|
255
|
+
Args:
|
|
256
|
+
def_path: The definition path (used as type name)
|
|
257
|
+
def_schema: The JSON Structure type definition
|
|
258
|
+
|
|
259
|
+
Returns:
|
|
260
|
+
Avro schema for this type
|
|
261
|
+
"""
|
|
262
|
+
# Skip abstract types - they're not directly instantiable
|
|
263
|
+
if def_schema.get('abstract'):
|
|
264
|
+
# Store a placeholder but don't convert
|
|
265
|
+
return {'type': 'null', 'doc': f'Abstract type: {def_path}'}
|
|
266
|
+
|
|
267
|
+
# Merge base type properties if $extends is present
|
|
268
|
+
merged_schema = self._merge_base_properties(def_schema)
|
|
269
|
+
|
|
270
|
+
# Parse namespace and name from path
|
|
271
|
+
if '/' in def_path:
|
|
272
|
+
parts = def_path.split('/')
|
|
273
|
+
namespace = '.'.join(parts[:-1])
|
|
274
|
+
name = parts[-1]
|
|
275
|
+
else:
|
|
276
|
+
namespace = None
|
|
277
|
+
name = def_path
|
|
278
|
+
|
|
279
|
+
avro_schema = self._convert_type_from_schema(merged_schema, namespace, name)
|
|
280
|
+
self.converted_types[def_path] = avro_schema
|
|
281
|
+
return avro_schema
|
|
282
|
+
|
|
283
|
+
def _convert_type_from_schema(self, def_schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
    """
    Dispatch a type definition to the matching converter by its 'type'.

    Dispatch order matters: enum and fixed are detected before plain
    string/binary; 'oneOf' is checked before the remaining keyword
    types; anything unmatched falls through to the simple-type wrapper.

    Args:
        def_schema: The JSON Structure type definition
        namespace: The namespace for the type (may be None)
        name: The name of the resulting Avro type

    Returns:
        Avro schema for this type (always a named record/enum/fixed).
    """
    avro_schema: Dict[str, Any] = {}

    # Handle different JSON Structure types
    type_value = def_schema.get('type')

    if type_value == 'object':
        avro_schema = self._convert_object(def_schema, namespace, name)
    elif type_value == 'string' and 'enum' in def_schema:
        # String + enum facet -> Avro enum.
        avro_schema = self._convert_enum(def_schema, namespace, name)
    elif type_value == 'binary' and 'byteLength' in def_schema:
        # Fixed-length binary -> Avro fixed.
        avro_schema = self._convert_fixed(def_schema, namespace, name)
    elif 'oneOf' in def_schema:
        avro_schema = self._convert_union(def_schema, namespace, name)
    elif type_value == 'choice':
        avro_schema = self._convert_choice(def_schema, namespace, name)
    elif type_value == 'set':
        avro_schema = self._convert_set(def_schema, namespace, name)
    elif type_value == 'tuple':
        avro_schema = self._convert_tuple(def_schema, namespace, name)
    elif type_value == 'any':
        avro_schema = self._convert_any(def_schema, namespace, name)
    elif type_value == 'array':
        # Array as top-level type needs wrapping in a record, since an
        # Avro array cannot itself carry a name.  Missing 'items'
        # defaults to 'string'.
        avro_schema = {
            'type': 'record',
            'name': name,
            'fields': [{
                'name': 'items',
                'type': {
                    'type': 'array',
                    'items': self._convert_type_reference(def_schema.get('items', 'string'))
                }
            }]
        }
        if namespace:
            avro_schema['namespace'] = namespace
    elif type_value == 'map':
        # Map as top-level type needs wrapping in a record for the
        # same reason; missing 'values' defaults to 'string'.
        avro_schema = {
            'type': 'record',
            'name': name,
            'fields': [{
                'name': 'values',
                'type': {
                    'type': 'map',
                    'values': self._convert_type_reference(def_schema.get('values', 'string'))
                }
            }]
        }
        if namespace:
            avro_schema['namespace'] = namespace
    else:
        # It might be a simple type alias or logical type
        avro_schema = self._convert_simple_type(def_schema, namespace, name)

    return avro_schema
|
|
351
|
+
|
|
352
|
+
def _convert_object(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
    """Convert a JSON Structure object to an Avro record.

    Properties become record fields.  Properties without an explicit
    default and not listed in 'required' are made nullable with a
    null default.  $extends inheritance is flattened in first.

    Args:
        schema: JSON Structure object definition
        namespace: Avro namespace for the record, if any
        name: Avro record name

    Returns:
        Avro record schema dict.
    """
    # Merge base properties if $extends is present
    merged_schema = self._merge_base_properties(schema)

    avro_record: Dict[str, Any] = {
        'type': 'record',
        'name': name
    }

    if namespace:
        avro_record['namespace'] = namespace

    if 'description' in merged_schema:
        avro_record['doc'] = merged_schema['description']

    # Convert properties to fields
    properties = merged_schema.get('properties', {})
    required = merged_schema.get('required', [])

    fields = []
    for prop_name, prop_schema in properties.items():
        field = {
            'name': prop_name,
            'type': self._convert_type_reference(prop_schema)
        }

        # Build documentation with constraint annotations
        doc = self._build_doc_with_annotations(
            prop_schema,
            prop_schema.get('description')
        )
        if doc:
            field['doc'] = doc

        # Handle default values
        if 'default' in prop_schema:
            field['default'] = prop_schema['default']
        elif prop_name not in required:
            # Optional field - make it nullable with a null default.
            # The Avro spec requires a field default to match the
            # FIRST branch of a union, so 'null' must lead the union;
            # reorder it to the front if it is already present.
            if isinstance(field['type'], list):
                non_null = [t for t in field['type'] if t != 'null']
                field['type'] = ['null'] + non_null
            else:
                field['type'] = ['null', field['type']]
            field['default'] = None

        fields.append(field)

    avro_record['fields'] = fields
    return avro_record
|
|
403
|
+
|
|
404
|
+
def _convert_enum(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
|
|
405
|
+
"""Convert JSON Structure enum to Avro enum."""
|
|
406
|
+
avro_enum: Dict[str, Any] = {
|
|
407
|
+
'type': 'enum',
|
|
408
|
+
'name': name,
|
|
409
|
+
'symbols': schema['enum']
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
if namespace:
|
|
413
|
+
avro_enum['namespace'] = namespace
|
|
414
|
+
|
|
415
|
+
if 'description' in schema:
|
|
416
|
+
avro_enum['doc'] = schema['description']
|
|
417
|
+
|
|
418
|
+
if 'default' in schema:
|
|
419
|
+
avro_enum['default'] = schema['default']
|
|
420
|
+
|
|
421
|
+
return avro_enum
|
|
422
|
+
|
|
423
|
+
def _convert_fixed(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
|
|
424
|
+
"""Convert JSON Structure fixed-length binary to Avro fixed."""
|
|
425
|
+
avro_fixed: Dict[str, Any] = {
|
|
426
|
+
'type': 'fixed',
|
|
427
|
+
'name': name,
|
|
428
|
+
'size': schema['byteLength']
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
if namespace:
|
|
432
|
+
avro_fixed['namespace'] = namespace
|
|
433
|
+
|
|
434
|
+
if 'description' in schema:
|
|
435
|
+
avro_fixed['doc'] = schema['description']
|
|
436
|
+
|
|
437
|
+
return avro_fixed
|
|
438
|
+
|
|
439
|
+
def _convert_union(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
|
|
440
|
+
"""Convert JSON Structure oneOf to Avro union or record."""
|
|
441
|
+
# Check if this is a proper union with $extends (discriminated union)
|
|
442
|
+
one_of = schema.get('oneOf', [])
|
|
443
|
+
|
|
444
|
+
# For now, create a simple record that can hold the union
|
|
445
|
+
# TODO: Implement proper discriminated union mapping
|
|
446
|
+
avro_record: Dict[str, Any] = {
|
|
447
|
+
'type': 'record',
|
|
448
|
+
'name': name
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
if namespace:
|
|
452
|
+
avro_record['namespace'] = namespace
|
|
453
|
+
|
|
454
|
+
if 'description' in schema:
|
|
455
|
+
avro_record['doc'] = schema['description']
|
|
456
|
+
|
|
457
|
+
# Create a union field
|
|
458
|
+
union_types = [self._convert_type_reference(choice) for choice in one_of]
|
|
459
|
+
|
|
460
|
+
avro_record['fields'] = [{
|
|
461
|
+
'name': 'value',
|
|
462
|
+
'type': union_types
|
|
463
|
+
}]
|
|
464
|
+
|
|
465
|
+
return avro_record
|
|
466
|
+
|
|
467
|
+
def _convert_choice(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
|
|
468
|
+
"""Convert JSON Structure choice to Avro union with discriminator support.
|
|
469
|
+
|
|
470
|
+
For tagged unions (no selector): Creates an enum discriminator field + union field.
|
|
471
|
+
For inline unions (with selector): Ensures selector field exists in each choice type with default value.
|
|
472
|
+
"""
|
|
473
|
+
choices = schema.get('choices', {})
|
|
474
|
+
selector = schema.get('selector')
|
|
475
|
+
extends_ref = schema.get('$extends')
|
|
476
|
+
|
|
477
|
+
if extends_ref and selector:
|
|
478
|
+
# Inline union (Section 3.2.3.7.2) - selector field is part of the data
|
|
479
|
+
# Each choice type should include the selector field with its choice name as default
|
|
480
|
+
avro_record: Dict[str, Any] = {
|
|
481
|
+
'type': 'record',
|
|
482
|
+
'name': name
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
if namespace:
|
|
486
|
+
avro_record['namespace'] = namespace
|
|
487
|
+
|
|
488
|
+
if 'description' in schema:
|
|
489
|
+
avro_record['doc'] = schema['description']
|
|
490
|
+
else:
|
|
491
|
+
avro_record['doc'] = f'Inline union with selector field: {selector}'
|
|
492
|
+
|
|
493
|
+
# Build union of choice types
|
|
494
|
+
# Note: The choice types themselves should have the selector field with defaults
|
|
495
|
+
# This would require modifying the referenced types, which we'll handle
|
|
496
|
+
# by documenting the expectation
|
|
497
|
+
union_types = []
|
|
498
|
+
for choice_name, choice_schema in choices.items():
|
|
499
|
+
choice_type = self._convert_type_reference(choice_schema)
|
|
500
|
+
union_types.append(choice_type)
|
|
501
|
+
|
|
502
|
+
# Create wrapper record with union field
|
|
503
|
+
avro_record['fields'] = [{
|
|
504
|
+
'name': 'value',
|
|
505
|
+
'type': union_types,
|
|
506
|
+
'doc': f'Union of choice types. Each type includes "{selector}" field with its discriminator value.'
|
|
507
|
+
}]
|
|
508
|
+
|
|
509
|
+
return avro_record
|
|
510
|
+
else:
|
|
511
|
+
# Tagged union (Section 3.2.3.7.1) - discriminator is the choice key
|
|
512
|
+
# Create enum for type-safe discriminator + union field for value
|
|
513
|
+
|
|
514
|
+
# Build enum type for discriminator
|
|
515
|
+
enum_name = f'{name}Type'
|
|
516
|
+
choice_names = list(choices.keys())
|
|
517
|
+
|
|
518
|
+
discriminator_enum: Dict[str, Any] = {
|
|
519
|
+
'type': 'enum',
|
|
520
|
+
'name': enum_name,
|
|
521
|
+
'symbols': choice_names
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
if namespace:
|
|
525
|
+
discriminator_enum['namespace'] = namespace
|
|
526
|
+
|
|
527
|
+
# Build union of choice types
|
|
528
|
+
union_types = []
|
|
529
|
+
for choice_name, choice_schema in choices.items():
|
|
530
|
+
choice_type = self._convert_type_reference(choice_schema)
|
|
531
|
+
union_types.append(choice_type)
|
|
532
|
+
|
|
533
|
+
# Create wrapper record with discriminator + union fields
|
|
534
|
+
avro_record: Dict[str, Any] = {
|
|
535
|
+
'type': 'record',
|
|
536
|
+
'name': name
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
if namespace:
|
|
540
|
+
avro_record['namespace'] = namespace
|
|
541
|
+
|
|
542
|
+
if 'description' in schema:
|
|
543
|
+
avro_record['doc'] = schema['description']
|
|
544
|
+
else:
|
|
545
|
+
avro_record['doc'] = 'Tagged union with explicit discriminator'
|
|
546
|
+
|
|
547
|
+
avro_record['fields'] = [
|
|
548
|
+
{
|
|
549
|
+
'name': 'choiceType',
|
|
550
|
+
'type': discriminator_enum,
|
|
551
|
+
'doc': 'Discriminator indicating which type is present in the value field'
|
|
552
|
+
},
|
|
553
|
+
{
|
|
554
|
+
'name': 'value',
|
|
555
|
+
'type': union_types,
|
|
556
|
+
'doc': 'The actual value of the selected choice type'
|
|
557
|
+
}
|
|
558
|
+
]
|
|
559
|
+
|
|
560
|
+
return avro_record
|
|
561
|
+
|
|
562
|
+
def _convert_set(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
    """Convert a JSON Structure set to a wrapper record holding an array.

    Avro has no native set type; elements are carried in an array
    field and the uniqueness constraint is only recorded in the doc.
    """
    wrapper: Dict[str, Any] = {'type': 'record', 'name': name}
    if namespace:
        wrapper['namespace'] = namespace

    set_note = 'Set - unique unordered elements'
    wrapper['doc'] = (f"{schema['description']} ({set_note})"
                      if 'description' in schema else set_note)

    # Missing 'items' defaults to 'string'.
    wrapper['fields'] = [{
        'name': 'items',
        'type': {
            'type': 'array',
            'items': self._convert_type_reference(schema.get('items', 'string')),
        },
    }]

    return wrapper
|
|
588
|
+
|
|
589
|
+
def _convert_tuple(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
|
|
590
|
+
"""Convert JSON Structure tuple to Avro record with ordered fields."""
|
|
591
|
+
avro_record: Dict[str, Any] = {
|
|
592
|
+
'type': 'record',
|
|
593
|
+
'name': name
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
if namespace:
|
|
597
|
+
avro_record['namespace'] = namespace
|
|
598
|
+
|
|
599
|
+
if 'description' in schema:
|
|
600
|
+
avro_record['doc'] = schema['description']
|
|
601
|
+
|
|
602
|
+
# Tuples have a fixed set of items with specific types
|
|
603
|
+
tuple_items = schema.get('tuple', [])
|
|
604
|
+
fields = []
|
|
605
|
+
|
|
606
|
+
for idx, item_schema in enumerate(tuple_items):
|
|
607
|
+
fields.append({
|
|
608
|
+
'name': f'item{idx}',
|
|
609
|
+
'type': self._convert_type_reference(item_schema)
|
|
610
|
+
})
|
|
611
|
+
|
|
612
|
+
avro_record['fields'] = fields
|
|
613
|
+
return avro_record
|
|
614
|
+
|
|
615
|
+
def _convert_any(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
|
|
616
|
+
"""Convert JSON Structure 'any' type to Avro union of all basic types."""
|
|
617
|
+
avro_record: Dict[str, Any] = {
|
|
618
|
+
'type': 'record',
|
|
619
|
+
'name': name
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
if namespace:
|
|
623
|
+
avro_record['namespace'] = namespace
|
|
624
|
+
|
|
625
|
+
if 'description' in schema:
|
|
626
|
+
avro_record['doc'] = schema['description'] + ' (Any type)'
|
|
627
|
+
else:
|
|
628
|
+
avro_record['doc'] = 'Any type'
|
|
629
|
+
|
|
630
|
+
# In Avro, 'any' can be represented as a union of all basic types
|
|
631
|
+
# or as a string containing JSON
|
|
632
|
+
avro_record['fields'] = [{
|
|
633
|
+
'name': 'value',
|
|
634
|
+
'type': ['null', 'boolean', 'int', 'long', 'float', 'double', 'string', 'bytes']
|
|
635
|
+
}]
|
|
636
|
+
|
|
637
|
+
return avro_record
|
|
638
|
+
|
|
639
|
+
def _convert_simple_type(self, schema: Dict[str, Any], namespace: Optional[str], name: str) -> Dict[str, Any]:
    """Wrap a scalar (optionally logical-typed) definition in a record
    with a single 'value' field, since top-level Avro schemas emitted
    here are named records."""
    record: Dict[str, Any] = {'type': 'record', 'name': name}
    if namespace:
        record['namespace'] = namespace
    if 'description' in schema:
        record['doc'] = schema['description']

    # A logical type annotation takes precedence over the raw type.
    logical_type = schema.get('logicalType')
    if logical_type:
        value_type = self._map_logical_type(logical_type, schema.get('type'))
    else:
        value_type = self._convert_type_reference(schema)

    record['fields'] = [{'name': 'value', 'type': value_type}]
    return record
|
|
668
|
+
|
|
669
|
+
def _convert_type_reference(self, schema: Union[Dict[str, Any], str]) -> Union[str, Dict[str, Any], List]:
|
|
670
|
+
"""
|
|
671
|
+
Convert a type reference or inline type definition.
|
|
672
|
+
|
|
673
|
+
Args:
|
|
674
|
+
schema: Type schema or reference
|
|
675
|
+
|
|
676
|
+
Returns:
|
|
677
|
+
Avro type (string, dict, or list for union)
|
|
678
|
+
"""
|
|
679
|
+
if isinstance(schema, str):
|
|
680
|
+
return self._map_primitive_type(schema)
|
|
681
|
+
|
|
682
|
+
if not isinstance(schema, dict):
|
|
683
|
+
raise ValueError(f"Invalid type schema: {schema}")
|
|
684
|
+
|
|
685
|
+
# Handle $ref
|
|
686
|
+
if '$ref' in schema:
|
|
687
|
+
ref = schema['$ref']
|
|
688
|
+
if ref.startswith('#/definitions/'):
|
|
689
|
+
ref_path = ref.replace('#/definitions/', '')
|
|
690
|
+
# Convert path format back to Avro namespace.name format
|
|
691
|
+
return ref_path.replace('/', '.')
|
|
692
|
+
raise ValueError(f"Unsupported reference format: {ref}")
|
|
693
|
+
|
|
694
|
+
# Handle inline types
|
|
695
|
+
type_value = schema.get('type')
|
|
696
|
+
|
|
697
|
+
# Handle union types (type is an array like ["string", "null"])
|
|
698
|
+
if isinstance(type_value, list):
|
|
699
|
+
return [self._map_primitive_type(t) if isinstance(t, str) else self._convert_type_reference(t) for t in type_value]
|
|
700
|
+
|
|
701
|
+
# Handle choice types
|
|
702
|
+
if type_value == 'choice':
|
|
703
|
+
# For nested choice types, we need to convert them fully
|
|
704
|
+
# Generate a unique name based on the choices
|
|
705
|
+
choices = schema.get('choices', {})
|
|
706
|
+
choice_name = f"Choice_{'_'.join(choices.keys())}" if choices else "Choice"
|
|
707
|
+
return self._convert_choice(schema, None, choice_name)
|
|
708
|
+
|
|
709
|
+
# Handle set types
|
|
710
|
+
if type_value == 'set':
|
|
711
|
+
# Sets are represented as arrays in Avro
|
|
712
|
+
return {
|
|
713
|
+
'type': 'array',
|
|
714
|
+
'items': self._convert_type_reference(schema.get('items', 'string'))
|
|
715
|
+
}
|
|
716
|
+
|
|
717
|
+
# Handle tuple types
|
|
718
|
+
if type_value == 'tuple':
|
|
719
|
+
# Tuples need to be records - generate unique name
|
|
720
|
+
tuple_name = f"Tuple_{len(schema.get('tuple', []))}_items"
|
|
721
|
+
return self._convert_tuple(schema, None, tuple_name)
|
|
722
|
+
|
|
723
|
+
# Handle any types
|
|
724
|
+
if type_value == 'any':
|
|
725
|
+
# Return union of all basic types
|
|
726
|
+
return ['null', 'boolean', 'int', 'long', 'float', 'double', 'string', 'bytes']
|
|
727
|
+
|
|
728
|
+
if type_value == 'array':
|
|
729
|
+
return {
|
|
730
|
+
'type': 'array',
|
|
731
|
+
'items': self._convert_type_reference(schema['items'])
|
|
732
|
+
}
|
|
733
|
+
|
|
734
|
+
if type_value == 'map':
|
|
735
|
+
return {
|
|
736
|
+
'type': 'map',
|
|
737
|
+
'values': self._convert_type_reference(schema['values'])
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
# Handle logical types
|
|
741
|
+
logical_type = schema.get('logicalType')
|
|
742
|
+
if logical_type:
|
|
743
|
+
return self._map_logical_type(logical_type, type_value)
|
|
744
|
+
|
|
745
|
+
# Primitive type
|
|
746
|
+
if type_value:
|
|
747
|
+
return self._map_primitive_type(type_value)
|
|
748
|
+
|
|
749
|
+
raise ValueError(f"Cannot convert type schema: {schema}")
|
|
750
|
+
|
|
751
|
+
def _map_primitive_type(self, struct_type: str) -> Union[str, Dict[str, Any]]:
|
|
752
|
+
"""Map JSON Structure primitive type to Avro primitive type.
|
|
753
|
+
|
|
754
|
+
For temporal types, returns Avrotize Schema format with string base type
|
|
755
|
+
and logical type annotation (RFC 3339 format).
|
|
756
|
+
"""
|
|
757
|
+
# Simple types without logical type annotation
|
|
758
|
+
simple_type_mapping = {
|
|
759
|
+
'null': 'null',
|
|
760
|
+
'boolean': 'boolean',
|
|
761
|
+
# Integer types
|
|
762
|
+
'int8': 'int',
|
|
763
|
+
'int16': 'int',
|
|
764
|
+
'int32': 'int',
|
|
765
|
+
'int64': 'long',
|
|
766
|
+
'uint8': 'int',
|
|
767
|
+
'uint16': 'int',
|
|
768
|
+
'uint32': 'long',
|
|
769
|
+
'uint64': 'long',
|
|
770
|
+
'int128': 'string', # Too large for Avro numeric types
|
|
771
|
+
'uint128': 'string',
|
|
772
|
+
# Floating point types
|
|
773
|
+
'float8': 'float',
|
|
774
|
+
'float16': 'float',
|
|
775
|
+
'float32': 'float',
|
|
776
|
+
'float': 'float',
|
|
777
|
+
'float64': 'double',
|
|
778
|
+
'double': 'double',
|
|
779
|
+
'number': 'double', # Generic number → double
|
|
780
|
+
# String and binary types
|
|
781
|
+
'string': 'string',
|
|
782
|
+
'binary': 'bytes',
|
|
783
|
+
'bytes': 'bytes',
|
|
784
|
+
# Other types
|
|
785
|
+
'uri': 'string',
|
|
786
|
+
'jsonpointer': 'string',
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
# Temporal types with Avrotize Schema string-based logical types (RFC 3339 format)
|
|
790
|
+
temporal_type_mapping = {
|
|
791
|
+
'date': {'type': 'string', 'logicalType': 'date'}, # RFC 3339 full-date
|
|
792
|
+
'datetime': {'type': 'string', 'logicalType': 'timestamp-millis'}, # RFC 3339 date-time
|
|
793
|
+
'time': {'type': 'string', 'logicalType': 'time-millis'}, # RFC 3339 partial-time
|
|
794
|
+
'duration': {'type': 'string', 'logicalType': 'duration'}, # RFC 3339 duration
|
|
795
|
+
'timestamp': {'type': 'string', 'logicalType': 'timestamp-millis'}, # RFC 3339 date-time
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
# Special types with logical type annotation
|
|
799
|
+
special_type_mapping = {
|
|
800
|
+
'uuid': {'type': 'string', 'logicalType': 'uuid'},
|
|
801
|
+
'decimal': {'type': 'string', 'logicalType': 'decimal'}, # Avrotize extension: decimal on string
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
# Check in order: temporal, special, simple
|
|
805
|
+
if struct_type in temporal_type_mapping:
|
|
806
|
+
return temporal_type_mapping[struct_type]
|
|
807
|
+
if struct_type in special_type_mapping:
|
|
808
|
+
return special_type_mapping[struct_type]
|
|
809
|
+
if struct_type in simple_type_mapping:
|
|
810
|
+
return simple_type_mapping[struct_type]
|
|
811
|
+
|
|
812
|
+
# Fallback to the type as-is
|
|
813
|
+
return struct_type
|
|
814
|
+
|
|
815
|
+
def _map_logical_type(self, logical_type: str, base_type: Optional[str]) -> Dict[str, Any]:
|
|
816
|
+
"""Map JSON Structure logical type to Avro/Avrotize logical type.
|
|
817
|
+
|
|
818
|
+
Uses Avrotize Schema extensions for string-based temporal types (RFC 3339 format).
|
|
819
|
+
"""
|
|
820
|
+
# Avrotize Schema: temporal types on string (RFC 3339 format)
|
|
821
|
+
logical_mapping = {
|
|
822
|
+
# Timestamps
|
|
823
|
+
'timestampMicros': {'type': 'string', 'logicalType': 'timestamp-micros'},
|
|
824
|
+
'timestampMillis': {'type': 'string', 'logicalType': 'timestamp-millis'},
|
|
825
|
+
'timestamp-micros': {'type': 'string', 'logicalType': 'timestamp-micros'},
|
|
826
|
+
'timestamp-millis': {'type': 'string', 'logicalType': 'timestamp-millis'},
|
|
827
|
+
# Local timestamps (no timezone)
|
|
828
|
+
'localTimestampMicros': {'type': 'string', 'logicalType': 'local-timestamp-micros'},
|
|
829
|
+
'localTimestampMillis': {'type': 'string', 'logicalType': 'local-timestamp-millis'},
|
|
830
|
+
'local-timestamp-micros': {'type': 'string', 'logicalType': 'local-timestamp-micros'},
|
|
831
|
+
'local-timestamp-millis': {'type': 'string', 'logicalType': 'local-timestamp-millis'},
|
|
832
|
+
# Date and time
|
|
833
|
+
'date': {'type': 'string', 'logicalType': 'date'},
|
|
834
|
+
'time-millis': {'type': 'string', 'logicalType': 'time-millis'},
|
|
835
|
+
'time-micros': {'type': 'string', 'logicalType': 'time-micros'},
|
|
836
|
+
'timeMillis': {'type': 'string', 'logicalType': 'time-millis'},
|
|
837
|
+
'timeMicros': {'type': 'string', 'logicalType': 'time-micros'},
|
|
838
|
+
# Duration
|
|
839
|
+
'duration': {'type': 'string', 'logicalType': 'duration'},
|
|
840
|
+
# UUID
|
|
841
|
+
'uuid': {'type': 'string', 'logicalType': 'uuid'},
|
|
842
|
+
# Decimal (Avrotize extension: on string)
|
|
843
|
+
'decimal': {'type': 'string', 'logicalType': 'decimal'},
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
if logical_type in logical_mapping:
|
|
847
|
+
return logical_mapping[logical_type]
|
|
848
|
+
|
|
849
|
+
# Fallback to base type
|
|
850
|
+
if base_type:
|
|
851
|
+
mapped = self._map_primitive_type(base_type)
|
|
852
|
+
if isinstance(mapped, dict):
|
|
853
|
+
return mapped
|
|
854
|
+
return {'type': mapped}
|
|
855
|
+
|
|
856
|
+
return {'type': 'string'}
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def convert_json_structure_to_avro(
    structure_file: str,
    avro_file: str
) -> None:
    """
    Convert a JSON Structure file to an Avro schema file.

    Args:
        structure_file: Path to input JSON Structure file
        avro_file: Path to output Avro schema file
    """
    converter = JsonStructureToAvro()

    # Parse the input document.
    with open(structure_file, 'r', encoding='utf-8') as src:
        structure_schema = json.loads(src.read())

    result = converter.convert(structure_schema)

    # Serialize the converted schema with 2-space indentation.
    with open(avro_file, 'w', encoding='utf-8') as dst:
        dst.write(json.dumps(result, indent=2))
|