structurize 2.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -0
- avrotize/__main__.py +6 -0
- avrotize/_version.py +34 -0
- avrotize/asn1toavro.py +160 -0
- avrotize/avrotize.py +152 -0
- avrotize/avrotocpp.py +483 -0
- avrotize/avrotocsharp.py +992 -0
- avrotize/avrotocsv.py +121 -0
- avrotize/avrotodatapackage.py +173 -0
- avrotize/avrotodb.py +1383 -0
- avrotize/avrotogo.py +476 -0
- avrotize/avrotographql.py +197 -0
- avrotize/avrotoiceberg.py +210 -0
- avrotize/avrotojava.py +1023 -0
- avrotize/avrotojs.py +250 -0
- avrotize/avrotojsons.py +481 -0
- avrotize/avrotojstruct.py +345 -0
- avrotize/avrotokusto.py +364 -0
- avrotize/avrotomd.py +137 -0
- avrotize/avrotools.py +168 -0
- avrotize/avrotoparquet.py +208 -0
- avrotize/avrotoproto.py +359 -0
- avrotize/avrotopython.py +622 -0
- avrotize/avrotorust.py +435 -0
- avrotize/avrotots.py +598 -0
- avrotize/avrotoxsd.py +344 -0
- avrotize/commands.json +2433 -0
- avrotize/common.py +829 -0
- avrotize/constants.py +5 -0
- avrotize/csvtoavro.py +132 -0
- avrotize/datapackagetoavro.py +76 -0
- avrotize/dependency_resolver.py +348 -0
- avrotize/jsonstoavro.py +1698 -0
- avrotize/jsonstostructure.py +2642 -0
- avrotize/jstructtoavro.py +878 -0
- avrotize/kstructtoavro.py +93 -0
- avrotize/kustotoavro.py +455 -0
- avrotize/parquettoavro.py +157 -0
- avrotize/proto2parser.py +498 -0
- avrotize/proto3parser.py +403 -0
- avrotize/prototoavro.py +382 -0
- avrotize/structuretocsharp.py +2005 -0
- avrotize/structuretojsons.py +498 -0
- avrotize/structuretopython.py +772 -0
- avrotize/xsdtoavro.py +413 -0
- structurize-2.16.2.dist-info/METADATA +805 -0
- structurize-2.16.2.dist-info/RECORD +51 -0
- structurize-2.16.2.dist-info/WHEEL +5 -0
- structurize-2.16.2.dist-info/entry_points.txt +2 -0
- structurize-2.16.2.dist-info/licenses/LICENSE +201 -0
- structurize-2.16.2.dist-info/top_level.txt +1 -0
avrotize/common.py
ADDED
|
@@ -0,0 +1,829 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Common utility functions for Avrotize.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements, line-too-long
|
|
6
|
+
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
from typing import Dict, Union, Any, List
|
|
13
|
+
from jsoncomparison import NO_DIFF, Compare
|
|
14
|
+
import jinja2
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def avro_name(name):
    """Normalize *name* into a valid Avro identifier.

    Non-alphanumeric characters become underscores; an underscore is
    prepended when the result would be empty or would not start with a
    letter or underscore. Integer input is accepted and prefixed.
    """
    if isinstance(name, int):
        name = '_' + str(name)
    candidate = re.sub(r'[^a-zA-Z0-9_]', '_', name)
    # Shield a leading digit with an underscore (identifiers cannot start with one).
    if re.match(r'^[0-9]', candidate):
        candidate = '_' + candidate
    # Final guard: the value must be non-empty and start with a letter or underscore.
    if not (candidate and re.match(r'^[a-zA-Z_]', candidate)):
        candidate = '_' + candidate
    return candidate
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def avro_name_with_altname(name):
    """
    Normalize *name* into an Avro identifier and report the original spelling.

    Args:
        name (str): The original name to convert.

    Returns:
        tuple: ``(normalized_name, original)`` where ``original`` is the input
        spelling when normalization changed it, otherwise ``None``.
    """
    if isinstance(name, int):
        name = str(name)

    normalized = avro_name(name)
    # Only surface an alternate name when normalization actually altered the input.
    alternate = name if normalized != name else None
    return normalized, alternate
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def avro_namespace(name):
    """Normalize a dotted namespace into a valid Avro namespace.

    Dots are preserved; every other non-alphanumeric character becomes an
    underscore, and a leading digit is shielded with an underscore.
    """
    result = re.sub(r'[^a-zA-Z0-9_\.]', '_', name)
    return '_' + result if re.match(r'^[0-9]', result) else result
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def generic_type() -> list[str | dict]:
    """
    Constructs a generic Avro type union: primitives plus arrays and maps
    nested two levels deep (the innermost level holds primitives only).

    Returns:
        list[str | dict]: A list of simple types, arrays, and maps.
    """
    primitives: list[str | dict] = [
        "null", "boolean", "int", "long", "float", "double", "bytes", "string"]
    # One level down: primitives plus array/map of primitives.
    inner_union = primitives + [
        {"type": "array", "items": primitives},
        {"type": "map", "values": primitives},
    ]
    # Top level: primitives plus array/map of the inner union.
    return primitives + [
        {"type": "array", "items": inner_union},
        {"type": "map", "values": inner_union},
    ]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def is_generic_avro_type(avro_type: list) -> bool:
    """
    Determine whether *avro_type* is exactly the generic Avro type union.

    Args:
        avro_type (Union[str, Dict[str, Any]]): The Avro type to check.

    Returns:
        bool: True if the Avro type matches the union produced by
        ``generic_type()``, False otherwise.
    """
    if isinstance(avro_type, (str, dict)):
        # A lone primitive or a single schema object is never the generic union.
        return False
    return Compare().check(avro_type, generic_type()) == NO_DIFF
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def is_generic_json_type(json_type: Dict[str, Any] | List[Dict[str, Any] | str] | str) -> bool:
    """
    Determine whether *json_type* is exactly the generic JSON type schema.

    Args:
        json_type: The JSON type to check.

    Returns:
        bool: True if the JSON type matches the schema produced by
        ``generic_type_json()``, False otherwise.
    """
    if isinstance(json_type, (str, list)):
        # The generic JSON schema is a single 'oneOf' dict; strings and lists never match.
        return False
    return Compare().check(json_type, generic_type_json()) == NO_DIFF
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def generic_type_json() -> dict:
    """
    Returns a dictionary representing a generic JSON schema for various types.

    The schema includes support for boolean, integer, number, string, array,
    and object types with int32/int64/float/double/byte formats. Arrays and
    objects nest two levels deep; the innermost arrays and objects hold
    primitives only.

    The literal was previously written out by hand (~100 lines with the same
    seven-alternative list repeated six times); it is now assembled from
    helpers producing an identical structure.

    Returns:
        dict: A dictionary representing the generic JSON schema.
    """
    def primitives() -> list:
        """A fresh list of the seven primitive type alternatives."""
        return [
            {"type": "boolean"},
            {"type": "integer", "format": "int32"},
            {"type": "integer", "format": "int64"},
            {"type": "number", "format": "float"},
            {"type": "number", "format": "double"},
            {"type": "string", "format": "byte"},
            {"type": "string"},
        ]

    def mid_level() -> dict:
        """Primitives plus one further level of array/object of primitives."""
        return {
            "oneOf": primitives() + [
                {"type": "array", "items": {"oneOf": primitives()}},
                {"type": "object", "additionalProperties": {"oneOf": primitives()}},
            ]
        }

    # Top level: primitives, an array of the mid-level union, and an object
    # whose values are the mid-level union — exactly the original literal.
    return {
        "oneOf": primitives() + [
            {"type": "array", "items": mid_level()},
            {"type": "object", "additionalProperties": mid_level()},
        ]
    }
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def find_schema_node(test, avro_schema, recursion_stack=None):
    """
    Depth-first search for the first schema node satisfying *test*.

    Args:
        test (Callable): Predicate applied to every dict node.
        avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to search.
        recursion_stack (list, optional): Internal stack used to detect cycles
            and bound the depth. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: The first matching node, or None.

    Raises:
        ValueError: On a cyclical reference or recursion deeper than 50 levels.
    """
    if recursion_stack is None:
        recursion_stack = []
    # Identity check against every ancestor rejects self-referential schemas.
    if any(avro_schema is seen for seen in recursion_stack):
        raise ValueError('Cyclical reference detected in schema')
    if len(recursion_stack) > 50:
        raise ValueError('Maximum recursion depth 50 exceeded in schema')
    recursion_stack.append(avro_schema)
    try:
        if isinstance(avro_schema, dict):
            if test(avro_schema):
                return avro_schema
            children = avro_schema.values()
        elif isinstance(avro_schema, list):
            children = avro_schema
        else:
            children = ()
        for child in children:
            if isinstance(child, (dict, list)):
                found = find_schema_node(test, child, recursion_stack)
                if found:
                    return found
        return None
    finally:
        # Always unwind the stack so sibling branches start from a clean state.
        recursion_stack.pop()
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def set_schema_node(test, replacement, avro_schema):
    """
    Replace, in place, schema nodes satisfying *test* with *replacement*.

    The search does not descend into a node once it has been replaced, but
    sibling subtrees are still visited.

    Args:
        test (Callable): Predicate applied to dict nodes.
        replacement (Dict[str, Any]): Content written into a matching node.
        avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to rewrite.

    Returns:
        None
    """
    if isinstance(avro_schema, dict):
        if test(avro_schema):
            # Mutate the matched dict in place so existing references observe the change.
            avro_schema.clear()
            avro_schema.update(replacement)
            return
        for child in avro_schema.values():
            if isinstance(child, (dict, list)):
                set_schema_node(test, replacement, child)
    elif isinstance(avro_schema, list):
        for entry in avro_schema:
            set_schema_node(test, replacement, entry)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class NodeHash:
    """A content digest paired with the size of the JSON text it was computed from."""

    def __init__(self: 'NodeHash', hash_value: bytes, count: int):
        # Digest of the serialized JSON node.
        self.hash_value: bytes = hash_value
        # Length of the serialized representation (used as a size measure).
        self.count: int = count
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
class NodeHashReference:
    """Associates a hashed JSON node with its value and its JSON Path location."""

    def __init__(self, hash_and_count: NodeHash, value, path):
        # Copy digest and size out of the NodeHash so the reference is self-contained.
        self.hash_value: bytes = hash_and_count.hash_value
        self.count: int = hash_and_count.count
        # The node itself and where it lives in the document.
        self.value: Any = value
        self.path: str = path
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def get_tree_hash(json_obj: Union[dict, list]) -> NodeHash:
    """
    Compute a SHA-256 digest for a JSON value.

    Containers are serialized with sorted keys so logically identical dicts
    hash the same; scalars are serialized as-is (sorting is a no-op there).

    Args:
        json_obj (Union[dict, list]): The JSON value to hash.

    Returns:
        NodeHash: The digest plus the length of the serialized text.
    """
    sort_keys = isinstance(json_obj, (dict, list))
    serialized = json.dumps(json_obj, sort_keys=sort_keys).encode('utf-8')
    return NodeHash(hashlib.sha256(serialized).digest(), len(serialized))
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def build_tree_hash_list(json_obj: Union[dict, list], path: str = '') -> Dict[str, NodeHashReference]:
    """
    Build a flat dictionary of hashes for a JSON object.
    The keys are JSON Path expressions, and the values are the hashes.

    Args:
        json_obj (Union[dict, list]): The JSON object to hash.
        path (str): The current JSON Path expression. Defaults to ''.

    Returns:
        Dict[str, NodeHashReference]: A dictionary of JSON Path expressions and hashes.
    """

    def has_nested_structure(obj: Union[dict, list]) -> bool:
        """
        Check if the object (list or dict) contains any nested lists or dicts.
        """
        if isinstance(obj, dict):
            return any(isinstance(value, (dict, list)) for value in obj.values())
        elif isinstance(obj, list):
            return any(isinstance(item, (dict, list)) for item in obj)
        return False

    tree_hash: Dict[str, NodeHashReference] = {}
    if isinstance(json_obj, dict):
        for key, value in json_obj.items():
            # Root-level keys are addressed as '$.key'; deeper keys extend the parent path.
            new_path = f'{path}.{key}' if path else f'$.{key}'
            if isinstance(value, dict) and has_nested_structure(value):
                # Recurse first so nested paths are recorded before the parent entry.
                inner_hashes = build_tree_hash_list(value, new_path)
                for inner_path, hash_reference in inner_hashes.items():
                    tree_hash[inner_path] = hash_reference
            # Every dict entry gets its own hash; NodeHash.count carries the
            # serialized size so callers can filter trivial values later.
            # NOTE(review): indentation reconstructed from a flattened diff —
            # confirm these two lines sit at loop level, not inside the if.
            hash_value = get_tree_hash(value)
            tree_hash[new_path] = NodeHashReference(hash_value, value, new_path)
    elif isinstance(json_obj, list):
        for index, item in enumerate(json_obj):
            new_path = f"{path}[{index}]"
            if isinstance(item, (dict, list)) and has_nested_structure(item):
                # List items themselves are not hashed, only their nested
                # contents — presumably intentional; verify against callers.
                inner_hashes = build_tree_hash_list(item, new_path)
                for inner_path, hash_reference in inner_hashes.items():
                    tree_hash[inner_path] = hash_reference
    return tree_hash
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def group_by_hash(tree_hash_list: Dict[str, NodeHashReference]) -> Dict[bytes, list]:
    """
    Group node references by hash value, keeping only duplicated hashes.

    Args:
        tree_hash_list (Dict[str, NodeHashReference]): JSON Path -> hash reference.

    Returns:
        Dict[bytes, list]: hash value -> references sharing that hash; groups
        with a single member are dropped, since only duplicated subtrees are
        interesting for deduplication.
    """
    hash_groups = defaultdict(list)
    for reference in tree_hash_list.values():
        hash_groups[reference.hash_value].append(reference)

    # Collect singleton keys first, then delete — avoids mutating while iterating.
    singletons = [digest for digest, refs in hash_groups.items() if len(refs) == 1]
    for digest in singletons:
        del hash_groups[digest]
    return hash_groups
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def pascal(string):
    """
    Convert a string to PascalCase from snake_case, camelCase, or PascalCase.
    Dot- and '::'-separated segments are converted individually and the
    separators preserved. A leading underscore is preserved in the output,
    while interior underscores are removed.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in PascalCase.
    """
    if '::' in string:
        head, *tail = string.split('::')
        return head + '::' + '::'.join(pascal(part) for part in tail)
    if '.' in string:
        return '.'.join(pascal(part) for part in string.split('.'))
    if not string:
        return string
    leading_underscore = string[0] == '_'
    if '_' in string:
        # snake_case
        parts = string.split('_')
    elif string[0].isupper():
        # PascalCase
        parts = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        parts = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    converted = ''.join(part.capitalize() for part in parts)
    return '_' + converted if leading_underscore else converted
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def camel(string):
    """
    Convert a string to camelCase from snake_case, camelCase, or PascalCase.
    Dot- and '::'-separated segments are converted individually and the
    separators preserved. Underscores at the beginning of the string are
    preserved in the output, but interior underscores are removed.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in camelCase.
    """
    if '::' in string:
        strings = string.split('::')
        return strings[0] + '::' + '::'.join(camel(s) for s in strings[1:])
    if '.' in string:
        strings = string.split('.')
        return '.'.join(camel(s) for s in strings)
    if not string:
        return string
    # Remember a leading underscore so it can be restored after joining.
    # Fixes the documented contract: pascal() and snake() preserve it, but
    # this function previously dropped it ('_private' -> 'private').
    startswith_under = string[0] == '_'
    if '_' in string:
        # snake_case; drop empty segments produced by leading/doubled underscores
        # so the first real word (not '') is lower-cased.
        words = [w for w in re.split(r'_', string) if w]
    elif string[0].isupper():
        # PascalCase
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    if not words:
        # Input consisted solely of underscores; return it unchanged.
        return string
    result = words[0].lower() + ''.join(word.capitalize() for word in words[1:])
    return '_' + result if startswith_under else result
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def snake(string):
    """
    Convert a string to snake_case from snake_case, camelCase, or PascalCase.
    Dot- and '::'-separated segments are converted individually and the
    separators preserved. A leading underscore survives the round trip
    because empty split segments are kept.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in snake_case.
    """
    if '::' in string:
        head, *tail = string.split('::')
        return head + '::' + '::'.join(snake(part) for part in tail)
    if '.' in string:
        return '.'.join(snake(part) for part in string.split('.'))
    if not string:
        return string
    if '_' in string:
        # already snake_case
        parts = string.split('_')
    elif string[0].isupper():
        # PascalCase
        parts = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        parts = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    return '_'.join(part.lower() for part in parts)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def fullname(avro_schema: dict | str, parent_namespace: str = '') -> str:
    """
    Construct the fully qualified name of an Avro schema.

    Args:
        avro_schema (dict | str): A schema dict or a type-reference string.
        parent_namespace (str): Namespace inherited from the enclosing schema.

    Returns:
        str: The dotted full name of the Avro schema.
    """
    if isinstance(avro_schema, str):
        # A bare reference: qualify with the parent namespace unless already dotted.
        if '.' not in avro_schema and parent_namespace:
            return f'{parent_namespace}.{avro_schema}'
        return avro_schema
    name = avro_schema.get("name", "")
    namespace = avro_schema.get("namespace", parent_namespace)
    return f'{namespace}.{name}' if namespace else name
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def altname(schema_obj: dict, purpose: str):
    """
    Retrieve the purpose-specific alternative name from a schema object.

    Args:
        schema_obj (dict): The schema object (record or field).
        purpose (str): The purpose key to look up (e.g., 'sql').

    Returns:
        str: The alternative name if present, otherwise the object's 'name'.
    """
    alternates = schema_obj.get("altnames", {})
    return alternates[purpose] if purpose in alternates else schema_obj["name"]
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def process_template(file_path: str, **kvargs) -> str:
    """
    Render a file as a Jinja2 template with the given keyword arguments.

    Args:
        file_path (str): Path of the template, relative to this module's directory.
        **kvargs: Values made available to the template.

    Returns:
        str: The rendered template text.
    """
    # Templates are resolved relative to the package directory.
    loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__))
    environment = jinja2.Environment(loader=loader)
    # Expose the local case-conversion helpers as Jinja2 filters.
    environment.filters['pascal'] = pascal
    environment.filters['camel'] = camel
    return environment.get_template(file_path).render(**kvargs)
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
def render_template(template: str, output: str, **kvargs):
    """
    Render a template and write the result to a file.

    Args:
        template (str): The template path, relative to this module.
        output (str): The output file path.
        **kvargs: The keyword arguments to pass to the template.

    Returns:
        None
    """
    out = process_template(template, **kvargs)
    # Ensure the target directory exists. os.path.dirname() returns '' for a
    # bare filename, and os.makedirs('') raises FileNotFoundError, so only
    # create directories when there is a directory component.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output, 'w', encoding='utf-8') as f:
        f.write(out)
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def get_longest_namespace_prefix(schema):
|
|
590
|
+
""" Get the longest common prefix for the namespace of all types in the schema. """
|
|
591
|
+
namespaces = set(collect_namespaces(schema))
|
|
592
|
+
longest_common_prefix = ''
|
|
593
|
+
# find longest common prefix of the namespaces (not with os.path!!!)
|
|
594
|
+
for ns in namespaces:
|
|
595
|
+
if not longest_common_prefix:
|
|
596
|
+
longest_common_prefix = ns
|
|
597
|
+
else:
|
|
598
|
+
for i in range(min(len(longest_common_prefix), len(ns))):
|
|
599
|
+
if longest_common_prefix[i] != ns[i]:
|
|
600
|
+
longest_common_prefix = longest_common_prefix[:i]
|
|
601
|
+
break
|
|
602
|
+
return longest_common_prefix.strip('.')
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def collect_namespaces(schema: Any, parent_namespace: str = '') -> List[str]:
    """ Performs a deep search of the schema to collect all namespaces """
    found: List[str] = []
    if isinstance(schema, dict):
        namespace = str(schema.get('namespace', parent_namespace))
        if namespace:
            found.append(namespace)
        fields = schema.get('fields')
        if isinstance(fields, list):
            for field in fields:
                if isinstance(field, dict) and isinstance(field.get('type'), dict):
                    # Inline record/enum types on the field carry their own namespace.
                    found.extend(collect_namespaces(field['type'], namespace))
                found.extend(collect_namespaces(field, namespace))
        # Array item and map value schemas can also declare namespaces.
        for key in ('items', 'values'):
            if isinstance(schema.get(key), dict):
                found.extend(collect_namespaces(schema[key], namespace))
    elif isinstance(schema, list):
        for entry in schema:
            found.extend(collect_namespaces(entry, parent_namespace))
    return found
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def build_flat_type_dict(avro_schema) -> Dict[str, Dict]:
    """Builds a flat dictionary of all named types in the main schema."""
    type_dict: Dict[str, Dict] = {}

    def register(schema, namespace):
        """Record named types, descending into fields, array items, and map values."""
        if isinstance(schema, dict):
            kind = schema.get('type')
            type_name = schema.get('name')
            # An explicit namespace on the node overrides the inherited one.
            namespace = schema.get('namespace', namespace)
            if kind in ('record', 'enum', 'fixed') and type_name:
                qualified = f'{namespace}.{type_name}' if namespace else type_name
                type_dict[qualified] = schema
            if kind == 'record':
                for field in schema.get('fields', []):
                    register(field.get('type'), namespace)
            elif kind == 'array':
                register(schema.get('items'), namespace)
            elif kind == 'map':
                register(schema.get('values'), namespace)
        elif isinstance(schema, list):
            for entry in schema:
                register(entry, namespace)

    if isinstance(avro_schema, dict):
        register(avro_schema, avro_schema.get('namespace', ''))
    elif isinstance(avro_schema, list):
        for schema in avro_schema:
            register(schema, schema.get('namespace', ''))
    return type_dict
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def evict_tracked_references(avro_schema, parent_namespace, tracker):
    """ Evicts all tracked references in the Avro schema. """
    if isinstance(avro_schema, list):
        return [evict_tracked_references(entry, parent_namespace, tracker)
                for entry in avro_schema]
    if not isinstance(avro_schema, dict):
        # Plain reference strings and primitives pass through untouched.
        return avro_schema
    schema_type = avro_schema.get('type')
    if schema_type in ('record', 'enum', 'fixed'):
        namespace = avro_schema.get('namespace', parent_namespace)
        qualified_name = (
            namespace + '.' if namespace else '') + avro_schema['name']
        if qualified_name in tracker:
            # Already tracked: collapse the declaration to a reference string.
            return qualified_name
        for field in avro_schema.get('fields', []):
            field['type'] = evict_tracked_references(
                field['type'], namespace, tracker)
        return avro_schema
    # Descend into array items and map values.
    if schema_type == 'array' and 'items' in avro_schema:
        avro_schema['items'] = evict_tracked_references(
            avro_schema['items'], parent_namespace, tracker)
    elif schema_type == 'map' and 'values' in avro_schema:
        avro_schema['values'] = evict_tracked_references(
            avro_schema['values'], parent_namespace, tracker)
    return avro_schema
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
def inline_avro_references(avro_schema, type_dict, current_namespace, tracker=None, defined_types=None):
    """ Inlines the first reference to a type in the Avro schema.

    Args:
        avro_schema: Schema node (dict, list, or type-reference string) to process.
        type_dict: Flat mapping of qualified type name -> schema definition
            (see build_flat_type_dict).
        current_namespace (str): Namespace inherited from the enclosing schema.
        tracker (set, optional): Qualified names already inlined; later
            occurrences are left as reference strings.
        defined_types (set, optional): Qualified names already defined inline
            in this schema; never re-inlined.

    Returns:
        The processed schema node (possibly mutated in place).
    """
    if tracker is None:
        tracker = set()
    if defined_types is None:
        defined_types = set()

    if isinstance(avro_schema, dict):
        # Register the type if it's a record, enum, or fixed and is inlined in the same schema
        if 'type' in avro_schema and avro_schema['type'] in ['record', 'enum', 'fixed']:
            namespace = avro_schema.get('namespace', current_namespace)
            qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
            defined_types.add(qualified_name)

        # Process record types
        if 'type' in avro_schema and avro_schema['type'] == 'record' and 'fields' in avro_schema:
            namespace = avro_schema.get('namespace', current_namespace)
            qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
            if qualified_name in tracker:
                # Second encounter of the same record: emit a reference string instead.
                return qualified_name
            tracker.add(qualified_name)
            for field in avro_schema['fields']:
                field['type'] = inline_avro_references(
                    field['type'], type_dict, namespace, tracker, defined_types)

        # Handling array types
        elif 'type' in avro_schema and avro_schema['type'] == 'array' and 'items' in avro_schema:
            avro_schema['items'] = inline_avro_references(
                avro_schema['items'], type_dict, current_namespace, tracker, defined_types)

        # Handling map types
        elif 'type' in avro_schema and avro_schema['type'] == 'map' and 'values' in avro_schema:
            avro_schema['values'] = inline_avro_references(
                avro_schema['values'], type_dict, current_namespace, tracker, defined_types)

        # Inline other types, except enum and fixed
        elif 'type' in avro_schema and avro_schema['type'] not in ['enum', 'fixed']:
            avro_schema['type'] = inline_avro_references(
                avro_schema['type'], type_dict, current_namespace, tracker, defined_types)

    elif isinstance(avro_schema, list):
        # Union: process each alternative independently.
        return [inline_avro_references(item, type_dict, current_namespace, tracker, defined_types) for item in avro_schema]

    elif avro_schema in type_dict and avro_schema not in tracker and avro_schema not in defined_types:
        # Inline the referenced schema if not already tracked and not defined in the current schema
        # NOTE: shallow copy — nested structures remain shared with type_dict.
        inlined_schema = type_dict[avro_schema].copy()
        if isinstance(inlined_schema, dict) and not inlined_schema.get('namespace', None):
            # Derive the namespace from the dotted reference when the copy lacks one.
            inlined_schema['namespace'] = '.'.join(avro_schema.split('.')[:-1])
        inlined_schema = inline_avro_references(
            inlined_schema, type_dict, inlined_schema['namespace'], tracker, defined_types)
        tracker.add(avro_schema)
        return inlined_schema

    return avro_schema
|
|
743
|
+
|
|
744
|
+
def strip_first_doc(schema) -> bool:
    """Remove the first 'doc' entry found anywhere in *schema*, depth-first.

    Walks nested dicts and lists, deletes the first 'doc' key encountered
    (mutating the schema in place), and stops immediately.

    Args:
        schema: An Avro schema fragment (dict, list, or scalar).

    Returns:
        bool: True if a 'doc' entry was removed, False otherwise.
    """
    if isinstance(schema, dict):
        if 'doc' in schema:
            del schema['doc']
            return True
        # any() short-circuits, so the search stops at the first removal.
        return any(strip_first_doc(value) for value in schema.values())
    if isinstance(schema, list):
        return any(strip_first_doc(element) for element in schema)
    # Scalars (strings, numbers, None) carry no doc field.
    return False
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
def is_type_with_alternate(avro_schema: List[Dict[str, Any]]) -> bool:
    """
    Check if the Avro schema union contains a type with a trailing alternate type.

    Alternate types are maps that mimic the structure of the original type, but
    allow for additional fields. Alternate types are labeled with an 'alternateof'
    attribute extension that points to the original type.

    Args:
        avro_schema (List[Dict[str, Any]]): The Avro schema to check. Non-list
            schemas (e.g. plain type-name strings or single dicts) are accepted
            and simply reported as not having an alternate.

    Returns:
        bool: True if the Avro schema contains a type with an alternate name, False otherwise.
    """
    # BUGFIX: guard BEFORE copying. The previous order called .copy() first,
    # which raised AttributeError for plain type-name strings like 'string'.
    if not isinstance(avro_schema, list):
        return False
    # Work on a copy so removing 'null' does not mutate the caller's union.
    candidates = avro_schema.copy()
    if 'null' in candidates:
        candidates.remove('null')
    # An alternate pairing is exactly one original plus one alternate.
    if len(candidates) != 2:
        return False
    has_original = any(isinstance(t, dict) and 'alternateof' not in t for t in candidates)
    has_alternate = any(isinstance(t, dict) and 'alternateof' in t for t in candidates)
    return has_original and has_alternate
|
|
785
|
+
|
|
786
|
+
def strip_alternate_type(avro_schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Strips the alternate type from the Avro schema union.

    Mutates the union in place: when both an original type and an
    'alternateof'-tagged alternate are present, the alternate entry is removed.

    Args:
        avro_schema (List[Dict[str, Any]]): The Avro schema to strip.

    Returns:
        List[Dict[str, Any]]: The same Avro schema list, without the alternate type.
    """
    primary = None
    alternate = None
    # Capture the first dict of each flavor, mirroring a first-match scan.
    for entry in avro_schema:
        if not isinstance(entry, dict):
            continue
        if 'alternateof' in entry:
            if alternate is None:
                alternate = entry
        elif primary is None:
            primary = entry
    if primary and alternate:
        avro_schema.remove(alternate)
    return avro_schema
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
def get_typing_args_from_string(type_str: str) -> List[str]:
    """Return the generic argument strings of a type expression.

    E.g. 'Dict[str, List[int]]' yields ['str', 'List[int]']. Commas inside
    nested brackets are not treated as separators. Returns an empty list
    when the expression has no bracketed arguments.
    """
    # Capture the outer type name and everything inside its brackets.
    match = re.match(r'([\w\.]+)\[(.+)\]', type_str)
    if match is None:
        return []
    inner = match.group(2)
    results: List[str] = []
    buffer = ''
    nesting = 0
    for ch in inner:
        # A comma only separates arguments at the top nesting level.
        if ch == ',' and nesting == 0:
            results.append(buffer.strip())
            buffer = ''
            continue
        if ch == '[':
            nesting += 1
        elif ch == ']':
            nesting -= 1
        buffer += ch
    if buffer:
        results.append(buffer.strip())
    return results
|