structurize 2.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. avrotize/__init__.py +64 -0
  2. avrotize/__main__.py +6 -0
  3. avrotize/_version.py +34 -0
  4. avrotize/asn1toavro.py +160 -0
  5. avrotize/avrotize.py +152 -0
  6. avrotize/avrotocpp.py +483 -0
  7. avrotize/avrotocsharp.py +1075 -0
  8. avrotize/avrotocsv.py +121 -0
  9. avrotize/avrotodatapackage.py +173 -0
  10. avrotize/avrotodb.py +1383 -0
  11. avrotize/avrotogo.py +476 -0
  12. avrotize/avrotographql.py +197 -0
  13. avrotize/avrotoiceberg.py +210 -0
  14. avrotize/avrotojava.py +2156 -0
  15. avrotize/avrotojs.py +250 -0
  16. avrotize/avrotojsons.py +481 -0
  17. avrotize/avrotojstruct.py +345 -0
  18. avrotize/avrotokusto.py +364 -0
  19. avrotize/avrotomd.py +137 -0
  20. avrotize/avrotools.py +168 -0
  21. avrotize/avrotoparquet.py +208 -0
  22. avrotize/avrotoproto.py +359 -0
  23. avrotize/avrotopython.py +624 -0
  24. avrotize/avrotorust.py +435 -0
  25. avrotize/avrotots.py +598 -0
  26. avrotize/avrotoxsd.py +344 -0
  27. avrotize/cddltostructure.py +1841 -0
  28. avrotize/commands.json +3337 -0
  29. avrotize/common.py +834 -0
  30. avrotize/constants.py +72 -0
  31. avrotize/csvtoavro.py +132 -0
  32. avrotize/datapackagetoavro.py +76 -0
  33. avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
  34. avrotize/dependencies/typescript/node22/package.json +16 -0
  35. avrotize/dependency_resolver.py +348 -0
  36. avrotize/dependency_version.py +432 -0
  37. avrotize/jsonstoavro.py +2167 -0
  38. avrotize/jsonstostructure.py +2642 -0
  39. avrotize/jstructtoavro.py +878 -0
  40. avrotize/kstructtoavro.py +93 -0
  41. avrotize/kustotoavro.py +455 -0
  42. avrotize/parquettoavro.py +157 -0
  43. avrotize/proto2parser.py +498 -0
  44. avrotize/proto3parser.py +403 -0
  45. avrotize/prototoavro.py +382 -0
  46. avrotize/structuretocddl.py +597 -0
  47. avrotize/structuretocpp.py +697 -0
  48. avrotize/structuretocsharp.py +2295 -0
  49. avrotize/structuretocsv.py +365 -0
  50. avrotize/structuretodatapackage.py +659 -0
  51. avrotize/structuretodb.py +1125 -0
  52. avrotize/structuretogo.py +720 -0
  53. avrotize/structuretographql.py +502 -0
  54. avrotize/structuretoiceberg.py +355 -0
  55. avrotize/structuretojava.py +853 -0
  56. avrotize/structuretojsons.py +498 -0
  57. avrotize/structuretokusto.py +639 -0
  58. avrotize/structuretomd.py +322 -0
  59. avrotize/structuretoproto.py +764 -0
  60. avrotize/structuretopython.py +772 -0
  61. avrotize/structuretorust.py +714 -0
  62. avrotize/structuretots.py +653 -0
  63. avrotize/structuretoxsd.py +679 -0
  64. avrotize/xsdtoavro.py +413 -0
  65. structurize-2.19.0.dist-info/METADATA +107 -0
  66. structurize-2.19.0.dist-info/RECORD +70 -0
  67. structurize-2.19.0.dist-info/WHEEL +5 -0
  68. structurize-2.19.0.dist-info/entry_points.txt +2 -0
  69. structurize-2.19.0.dist-info/licenses/LICENSE +201 -0
  70. structurize-2.19.0.dist-info/top_level.txt +1 -0
avrotize/common.py ADDED
@@ -0,0 +1,834 @@
1
+ """
2
+ Common utility functions for Avrotize.
3
+ """
4
+
5
+ # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements, line-too-long
6
+
7
+ from collections import defaultdict
8
+ import copy
9
+ import os
10
+ import re
11
+ import hashlib
12
+ import json
13
+ from typing import Dict, Union, Any, List
14
+ from jsoncomparison import NO_DIFF, Compare
15
+ import jinja2
16
+
17
+
18
def avro_name(name):
    """
    Turn an arbitrary name into a valid Avro identifier.

    Args:
        name (str | int): The name to convert. Integers are rendered as
            their decimal text with a leading underscore.

    Returns:
        str: The name with every character outside ``[a-zA-Z0-9_]`` replaced
            by ``_``; an underscore is prepended when the result is empty or
            starts with a digit.
    """
    text = '_' + str(name) if isinstance(name, int) else name
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', text)
    # After sanitizing only ASCII letters, digits and underscores remain, so
    # the only invalid shapes left are "empty" and "starts with a digit".
    if sanitized == '' or sanitized[0] in '0123456789':
        sanitized = '_' + sanitized
    return sanitized
30
+
31
+
32
def avro_name_with_altname(name):
    """
    Normalize a name for Avro and report the original when it had to change.

    Args:
        name (str | int): The original name; integers are converted to text first.

    Returns:
        tuple: ``(normalized_name, original_name)`` when normalization altered
            the name, otherwise ``(normalized_name, None)``.
    """
    original_name = str(name) if isinstance(name, int) else name
    normalized_name = avro_name(original_name)
    alternate = None if normalized_name == original_name else original_name
    return normalized_name, alternate
53
+
54
+
55
def avro_namespace(name):
    """
    Convert a name into an Avro namespace string.

    Every character other than ASCII letters, digits, underscores and dots is
    replaced by an underscore; an underscore is prepended when the result
    starts with a digit.
    """
    sanitized = re.sub(r'[^a-zA-Z0-9_\.]', '_', name)
    # Only ASCII characters survive the substitution, so a plain membership
    # test is enough to detect a leading digit.
    if sanitized[:1] and sanitized[0] in '0123456789':
        return '_' + sanitized
    return sanitized
61
+
62
+
63
+ def generic_type() -> list[str | dict]:
64
+ """
65
+ Constructs a generic Avro type for simple types, arrays, and maps.
66
+
67
+ Returns:
68
+ list[str | dict]: A list of simple types, arrays, and maps.
69
+ """
70
+ simple_type_union: list[str | dict] = [
71
+ "null", "boolean", "int", "long", "float", "double", "bytes", "string"]
72
+ l2 = simple_type_union.copy()
73
+ l2.extend([
74
+ {
75
+ "type": "array",
76
+ "items": simple_type_union
77
+ },
78
+ {
79
+ "type": "map",
80
+ "values": simple_type_union
81
+ }])
82
+ l1 = simple_type_union.copy()
83
+ l1.extend([
84
+ {
85
+ "type": "array",
86
+ "items": l2
87
+ },
88
+ {
89
+ "type": "map",
90
+ "values": l2
91
+ }])
92
+ return l1
93
+
94
+
95
def is_generic_avro_type(avro_type: list) -> bool:
    """
    Tell whether an Avro type is the generic union produced by generic_type().

    Args:
        avro_type (Union[str, Dict[str, Any], list]): The Avro type to check.
            Strings and dicts are never generic.

    Returns:
        bool: True when comparing the type against generic_type() reports no
            difference, False otherwise.
    """
    if isinstance(avro_type, (str, dict)):
        return False
    reference = generic_type()
    difference = Compare().check(avro_type, reference)
    return difference == NO_DIFF
109
+
110
+
111
+ def is_generic_json_type(json_type: Dict[str, Any] | List[Dict[str, Any] | str] | str) -> bool:
112
+ """
113
+ Check if the given JSON type is a generic type.
114
+
115
+ Args:
116
+ json_type (Union[Dict[str, Any], str, List[Union[str, Dict[str, Any]]]]): The JSON type to check.
117
+
118
+ Returns:
119
+ bool: True if the JSON type is a generic type, False otherwise.
120
+ """
121
+ if isinstance(json_type, str) or isinstance(json_type, list):
122
+ return False
123
+ compare_type = generic_type_json()
124
+ return Compare().check(json_type, compare_type) == NO_DIFF
125
+
126
+
127
def generic_type_json() -> dict:
    """
    Return the generic JSON schema accepting scalars and two levels of
    arrays and objects.

    The schema is a ``oneOf`` over boolean, int32/int64 integers, float/double
    numbers, byte strings and plain strings, plus an array and an object whose
    elements may again be those scalars or an array/object of scalars.

    Returns:
        dict: A freshly built dictionary representing the generic JSON schema.
    """

    def scalars() -> list:
        # A new list of new dicts on every call so that no part of the
        # returned schema is shared with another part.
        return [
            {"type": "boolean"},
            {"type": "integer", "format": "int32"},
            {"type": "integer", "format": "int64"},
            {"type": "number", "format": "float"},
            {"type": "number", "format": "double"},
            {"type": "string", "format": "byte"},
            {"type": "string"},
        ]

    def array_of(alternatives: list) -> dict:
        return {"type": "array", "items": {"oneOf": alternatives}}

    def object_of(alternatives: list) -> dict:
        return {"type": "object", "additionalProperties": {"oneOf": alternatives}}

    def scalars_or_containers() -> list:
        # Scalars, or an array/object of scalars (the innermost level).
        return scalars() + [array_of(scalars()), object_of(scalars())]

    return {
        "oneOf": scalars() + [
            array_of(scalars_or_containers()),
            object_of(scalars_or_containers()),
        ]
    }
232
+
233
+
234
def find_schema_node(test, avro_schema, recursion_stack=None):
    """
    Depth-first search for the first dict node accepted by ``test``.

    Args:
        test (Callable): Predicate called with each dict node.
        avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to search.
        recursion_stack (List, optional): Nodes currently being visited; used
            to detect cycles and limit depth. Defaults to None (a new stack).

    Returns:
        Union[Dict[str, Any], None]: The matching node, or None when nothing matches.

    Raises:
        ValueError: If the node is already on the recursion stack (cycle) or
            the stack holds more than 50 entries.
    """
    stack = [] if recursion_stack is None else recursion_stack
    if any(avro_schema is visiting for visiting in stack):
        raise ValueError('Cyclical reference detected in schema')
    if len(stack) > 50:
        raise ValueError('Maximum recursion depth 50 exceeded in schema')

    stack.append(avro_schema)
    try:
        if isinstance(avro_schema, dict):
            if test(avro_schema):
                return avro_schema
            children = avro_schema.values()
        elif isinstance(avro_schema, list):
            children = avro_schema
        else:
            children = ()
        # Only containers can hold further nodes; scalars are skipped.
        for child in children:
            if not isinstance(child, (dict, list)):
                continue
            found = find_schema_node(test, child, stack)
            if found:
                return found
        return None
    finally:
        # Always unwind so sibling branches see a correct stack.
        stack.pop()
273
+
274
+
275
def set_schema_node(test, replacement, avro_schema):
    """
    Overwrite, in place, the contents of schema nodes accepted by ``test``.

    A matching dict is emptied and refilled from ``replacement``; the search
    does not descend into a node once it has been replaced, but it continues
    through the remaining siblings and branches.

    Args:
        test (Callable): Predicate called with each dict node.
        replacement (Dict[str, Any]): The replacement schema content.
        avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to search.

    Returns:
        None
    """
    if isinstance(avro_schema, list):
        for element in avro_schema:
            set_schema_node(test, replacement, element)
        return
    if not isinstance(avro_schema, dict):
        return
    if test(avro_schema):
        # Keep the same dict object so existing references see the new content.
        avro_schema.clear()
        avro_schema.update(replacement)
        return
    for child in avro_schema.values():
        if isinstance(child, (dict, list)):
            set_schema_node(test, replacement, child)
299
+
300
+
301
class NodeHash:
    """A hash value and count for a JSON object.

    Attributes are set directly from the constructor arguments:
    ``hash_value`` (bytes) and ``count`` (int).
    """

    def __init__(self: 'NodeHash', hash_value: bytes, count: int):
        self.hash_value: bytes = hash_value
        self.count: int = count

    def __repr__(self) -> str:
        # Readable representation for logs and debugger output.
        return f"NodeHash(hash_value={self.hash_value!r}, count={self.count!r})"
306
+
307
+
308
class NodeHashReference:
    """A reference to a JSON object with a hash value and count.

    Copies ``hash_value`` and ``count`` from the given NodeHash and stores the
    referenced ``value`` together with the ``path`` it was found at.
    """

    def __init__(self, hash_and_count: 'NodeHash', value, path):
        self.hash_value: bytes = hash_and_count.hash_value
        self.count: int = hash_and_count.count
        self.value: Any = value
        self.path: str = path

    def __repr__(self) -> str:
        # The referenced value is omitted on purpose: it can be an arbitrarily
        # large JSON subtree.
        return (f"NodeHashReference(path={self.path!r}, count={self.count!r}, "
                f"hash_value={self.hash_value!r})")
315
+
316
+
317
def get_tree_hash(json_obj: Union[dict, list]) -> NodeHash:
    """
    Hash the JSON serialization of a value with SHA-256.

    Args:
        json_obj (Union[dict, list]): The JSON object to hash. Dicts and lists
            are serialized with sorted keys; any other value is serialized
            with the default settings.

    Returns:
        NodeHash: The SHA-256 digest of the UTF-8 encoded JSON text and the
            length of that encoded text in bytes.
    """
    is_container = isinstance(json_obj, (dict, list))
    encoded = json.dumps(json_obj, sort_keys=is_container).encode('utf-8')
    digest = hashlib.sha256(encoded).digest()
    return NodeHash(digest, len(encoded))
333
+
334
+
335
+ def build_tree_hash_list(json_obj: Union[dict, list], path: str = '') -> Dict[str, NodeHashReference]:
336
+ """
337
+ Recursively build a flat dictionary of hash references for nested parts of a JSON object.
338
+ Keys are JSON Path style expressions ('$.key', '<path>.key', '<path>[index]'); values are NodeHashReference objects.
339
+
340
+ Args:
341
+ json_obj (Union[dict, list]): The JSON object to walk; any other type yields an empty result.
342
+ path (str): The current JSON Path expression. Defaults to ''; with an empty path, dict keys start at '$.'.
343
+
344
+ Returns:
345
+ Dict[str, NodeHashReference]: A dictionary of JSON Path expressions and hash references. NOTE(review): indentation is lost in this listing; the entry for a dict key appears to be recorded only for dict values that contain a nested dict or list - confirm against the original file.
346
+ """
347
+
348
+ def has_nested_structure(obj: Union[dict, list]) -> bool:
349
+ """
350
+ Check if the object (list or dict) directly contains any list or dict; returns False for any other type.
351
+ """
352
+ if isinstance(obj, dict):
353
+ return any(isinstance(value, (dict, list)) for value in obj.values())
354
+ elif isinstance(obj, list):
355
+ return any(isinstance(item, (dict, list)) for item in obj)
356
+ return False
357
+
358
+ tree_hash = {}
359
+ if isinstance(json_obj, dict):
360
+ for key, value in json_obj.items():
361
+ new_path = f'{path}.{key}' if path else f'$.{key}'
362
+ if isinstance(value, dict) and has_nested_structure(value):
363
+ inner_hashes = build_tree_hash_list(value, new_path)
364
+ for inner_path, hash_reference in inner_hashes.items():
365
+ tree_hash[inner_path] = hash_reference
366
+ hash_value = get_tree_hash(value)
367
+ tree_hash[new_path] = NodeHashReference(hash_value, value, new_path)
368
+ elif isinstance(json_obj, list):
369
+ for index, item in enumerate(json_obj):
370
+ new_path = f"{path}[{index}]"
371
+ if isinstance(item, (dict, list)) and has_nested_structure(item):
372
+ inner_hashes = build_tree_hash_list(item, new_path)
373
+ for inner_path, hash_reference in inner_hashes.items():
374
+ tree_hash[inner_path] = hash_reference
375
+ return tree_hash
376
+
377
+
378
def group_by_hash(tree_hash_list: Dict[str, NodeHashReference]) -> Dict[bytes, list]:
    """
    Collect hash references that share the same hash value.

    Args:
        tree_hash_list (Dict[str, NodeHashReference]): A dictionary of JSON Path expressions and hash references.

    Returns:
        Dict[bytes, list]: A defaultdict mapping each hash value that occurs
            more than once to the list of references carrying it; hash values
            seen only once are removed.
    """
    grouped = defaultdict(list)
    for reference in tree_hash_list.values():
        grouped[reference.hash_value].append(reference)

    # Collect the singleton keys first so the mapping is not resized while
    # it is being iterated.
    singletons = [digest for digest, members in grouped.items() if len(members) == 1]
    for digest in singletons:
        del grouped[digest]
    return grouped
397
+
398
+
399
def pascal(string):
    """
    Convert a string to PascalCase from snake_case, camelCase, or PascalCase.

    Dots and double colons act as separators that are kept in the output; the
    part before the first ``::`` is left untouched, every other part is
    converted on its own. A leading underscore is kept (as a single
    underscore); all other underscores are removed.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in PascalCase.
    """
    if '::' in string:
        head, *tail = string.split('::')
        return head + '::' + '::'.join(pascal(part) for part in tail)
    if '.' in string:
        return '.'.join(map(pascal, string.split('.')))
    if not string:
        return string

    if '_' in string:
        # snake_case: every underscore is a word boundary
        words = string.split('_')
    elif string[0].isupper():
        # PascalCase: a word starts at each ASCII capital
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase: a lower-case run first, then capitalized words
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)

    joined = ''.join(word.capitalize() for word in words)
    return '_' + joined if string[0] == '_' else joined
435
+
436
+
437
def camel(string):
    """
    Convert a string to camelCase from snake_case, camelCase, or PascalCase.

    Dots and double colons act as separators that are kept in the output; the
    part before the first ``::`` is left untouched, every other part is
    converted on its own. Underscores are treated as word boundaries and
    removed. Note that, unlike pascal(), a leading underscore is not kept:
    ``"_foo"`` becomes ``"Foo"``.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in camelCase, or an empty string when no word could
            be recognised in a non-empty input (e.g. ``"$"``).
    """
    if '::' in string:
        strings = string.split('::')
        return strings[0] + '::' + '::'.join(camel(s) for s in strings[1:])
    if '.' in string:
        strings = string.split('.')
        return '.'.join(camel(s) for s in strings)
    if not string:
        return string
    if '_' in string:
        # snake_case
        words = re.split(r'_', string)
    elif string[0].isupper():
        # PascalCase
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    if not words:
        # The patterns only recognise ASCII letters and digits; input such as
        # "$" or "É" yields no words. Return the same empty result pascal()
        # produces instead of failing on words[0].
        return ''
    return words[0].lower() + ''.join(word.capitalize() for word in words[1:])
471
+
472
+
473
def snake(string):
    """
    Convert a string to snake_case from snake_case, camelCase, or PascalCase.

    Dots and double colons act as separators that are kept in the output; the
    part before the first ``::`` is left untouched, every other part is
    converted on its own. Existing underscores are kept as word separators.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in snake_case.
    """
    if '::' in string:
        head, *tail = string.split('::')
        return head + '::' + '::'.join(snake(part) for part in tail)
    if '.' in string:
        return '.'.join(map(snake, string.split('.')))
    if not string:
        return string

    if '_' in string:
        # snake_case: split on the underscores, they are re-inserted below
        words = string.split('_')
    elif string[0].isupper():
        # PascalCase: a word starts at each ASCII capital
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase: a lower-case run first, then capitalized words
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    return '_'.join(map(str.lower, words))
506
+
507
+
508
+ def fullname(avro_schema: dict| str, parent_namespace: str = '') -> str:
509
+ """
510
+ Constructs the full name of the Avro schema.
511
+
512
+ Args:
513
+ avro_schema (dict): The Avro schema.
514
+
515
+ Returns:
516
+ str: The full name of the Avro schema.
517
+ """
518
+ if isinstance(avro_schema, str):
519
+ if not '.' in avro_schema and parent_namespace:
520
+ return parent_namespace + '.' + avro_schema
521
+ return avro_schema
522
+ name = avro_schema.get("name", "")
523
+ namespace = avro_schema.get("namespace", parent_namespace)
524
+ return namespace + "." + name if namespace else name
525
+
526
+
527
def altname(schema_obj: dict, purpose: str):
    """
    Look up the alternative name registered for a purpose on a schema object.

    Args:
        schema_obj (dict): The schema object (record or field).
        purpose (str): The purpose for the alternative name (e.g., 'sql').

    Returns:
        str: ``schema_obj["altnames"][purpose]`` when present, otherwise
            ``schema_obj["name"]``.
    """
    if "altnames" in schema_obj:
        alternates = schema_obj["altnames"]
        if purpose in alternates:
            return alternates[purpose]
    return schema_obj["name"]
542
+
543
+
544
def process_template(file_path: str, **kvargs) -> str:
    """
    Render a Jinja2 template file with the given keyword arguments.

    Templates are looked up relative to the directory containing this module,
    and the ``pascal`` and ``camel`` filters are available inside templates.

    Args:
        file_path (str): The template path, relative to this module's directory.
        **kvargs: The values made available to the template.

    Returns:
        str: The rendered template as a string.
    """
    module_dir = os.path.dirname(__file__)
    environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(searchpath=module_dir))
    environment.filters.update({'pascal': pascal, 'camel': camel})
    return environment.get_template(file_path).render(**kvargs)
569
+
570
+
571
def render_template(template: str, output: str, **kvargs):
    """
    Render a template and write it to a file

    Args:
        template (str): The template to render.
        output (str): The output file path. Missing parent directories are
            created; a bare file name is written to the current directory.
        **kvargs: The keyword arguments to pass to the template.

    Returns:
        None
    """
    out = process_template(template, **kvargs)
    # make sure the directory exists; dirname is '' for a bare file name and
    # os.makedirs('') raises FileNotFoundError, so only create when non-empty
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output, 'w', encoding='utf-8') as f:
        f.write(out)
588
+
589
+
590
def get_longest_namespace_prefix(schema):
    """
    Get the longest common namespace prefix of all types in the schema.

    The comparison is done per dot-separated segment, so the result is always
    made of whole namespace segments (``com.foo1`` and ``com.foo2`` share
    ``com``, not ``com.foo``).

    Args:
        schema: The schema passed on to collect_namespaces().

    Returns:
        str: The common prefix without leading or trailing dots, or '' when
            the schema has no namespaces or they share no leading segment.
    """
    namespaces = set(collect_namespaces(schema))
    if not namespaces:
        return ''
    # find longest common prefix of the namespaces (not with os.path!!!)
    common_segments = []
    # zip stops at the shortest namespace, so a namespace that is itself a
    # prefix of the others correctly bounds the result.
    for segments in zip(*(ns.split('.') for ns in namespaces)):
        first = segments[0]
        if any(segment != first for segment in segments):
            break
        common_segments.append(first)
    return '.'.join(common_segments).strip('.')
604
+
605
+
606
+ def collect_namespaces(schema: Any, parent_namespace: str = '') -> List[str]:
607
+ """ Recursively walks a schema (dict or list) through 'fields', 'items' and 'values' and returns every non-empty namespace found, duplicates included; a dict without its own 'namespace' uses parent_namespace, and any other input yields an empty list. """
608
+ namespaces = []
609
+ if isinstance(schema, dict):
610
+ namespace = str(schema.get('namespace', parent_namespace))
611
+ if namespace:
612
+ namespaces.append(namespace)
613
+ if 'fields' in schema and isinstance(schema['fields'], list):
614
+ for field in schema['fields']:
615
+ if isinstance(field, dict) and 'type' in field and isinstance(field['type'], dict):
616
+ namespaces.extend(collect_namespaces(
617
+ field['type'], namespace))
618
+ namespaces.extend(collect_namespaces(field, namespace))
619
+ if 'items' in schema and isinstance(schema['items'], dict):
620
+ namespaces.extend(collect_namespaces(schema['items'], namespace))
621
+ if 'values' in schema and isinstance(schema['values'], dict):
622
+ namespaces.extend(collect_namespaces(schema['values'], namespace))
623
+ elif isinstance(schema, list):
624
+ for item in schema:
625
+ namespaces.extend(collect_namespaces(item, parent_namespace))
626
+ return namespaces
627
+
628
+
629
+ def build_flat_type_dict(avro_schema) -> Dict[str, Dict]:
630
+ """Builds a flat dictionary mapping qualified names ('namespace.name', or just the name when there is no namespace) to the record, enum and fixed definitions found in the schema, following record fields, array items, map values and lists (unions)."""
631
+ type_dict = {}
632
+
633
+ def add_to_dict(schema, namespace):
634
+ if isinstance(schema, dict):
635
+ schema_type = schema.get('type')
636
+ name = schema.get('name')
637
+ namespace = schema.get('namespace', namespace)
638
+ if schema_type in ['record', 'enum', 'fixed'] and name:
639
+ qualified_name = f"{namespace}.{name}" if namespace else name
640
+ type_dict[qualified_name] = schema
641
+ if schema_type == 'record':
642
+ for field in schema.get('fields', []):
643
+ field_type = field.get('type')
644
+ add_to_dict(field_type, namespace)
645
+ elif schema_type == 'array':
646
+ add_to_dict(schema.get('items'), namespace)
647
+ elif schema_type == 'map':
648
+ add_to_dict(schema.get('values'), namespace)
649
+ elif isinstance(schema, list):
650
+ for item in schema:
651
+ add_to_dict(item, namespace)
652
+
653
+ if isinstance(avro_schema, dict):
654
+ add_to_dict(avro_schema, avro_schema.get('namespace', ''))
655
+ elif isinstance(avro_schema, list):
656
+ for schema in avro_schema:
657
+ schema_namespace = schema.get('namespace', '')
658
+ add_to_dict(schema, schema_namespace)
659
+ return type_dict
660
+
661
+
662
+ def evict_tracked_references(avro_schema, parent_namespace, tracker):
663
+ """ Replaces record, enum and fixed definitions whose qualified name is in tracker with that name (a reference string), recursing through record fields, array items, map values and lists; dict nodes are updated in place, lists are rebuilt, and the possibly replaced node is returned. """
664
+ if isinstance(avro_schema, dict):
665
+ if 'type' in avro_schema and (avro_schema['type'] == 'record' or avro_schema['type'] == 'enum' or avro_schema['type'] == 'fixed'):
666
+ namespace = avro_schema.get('namespace', parent_namespace)
667
+ qualified_name = (
668
+ namespace + '.' if namespace else '') + avro_schema['name']
669
+ if not qualified_name in tracker:
670
+ if 'fields' in avro_schema:
671
+ for field in avro_schema['fields']:
672
+ field['type'] = evict_tracked_references(
673
+ field['type'], namespace, tracker)
674
+ return avro_schema
675
+ else:
676
+ return qualified_name
677
+ # Handling array types
678
+ elif 'type' in avro_schema and avro_schema['type'] == 'array' and 'items' in avro_schema:
679
+ avro_schema['items'] = evict_tracked_references(
680
+ avro_schema['items'], parent_namespace, tracker)
681
+ # Handling map types
682
+ elif 'type' in avro_schema and avro_schema['type'] == 'map' and 'values' in avro_schema:
683
+ avro_schema['values'] = evict_tracked_references(
684
+ avro_schema['values'], parent_namespace, tracker)
685
+ elif isinstance(avro_schema, list):
686
+ return [evict_tracked_references(item, parent_namespace, tracker) for item in avro_schema]
687
+ return avro_schema
688
+
689
+
690
+ def inline_avro_references(avro_schema, type_dict, current_namespace, tracker=None, defined_types=None):
691
+ """ Inlines the first reference to each type in the Avro schema: a name found in type_dict that is in neither tracker nor defined_types is replaced by a deep copy of its definition (itself processed recursively), while a named type already in defined_types is returned as its qualified name; tracker and defined_types default to new empty sets. """
692
+ if tracker is None:
693
+ tracker = set()
694
+ if defined_types is None:
695
+ defined_types = set()
696
+
697
+ if isinstance(avro_schema, dict):
698
+ # Register the type if it's a record, enum, or fixed and is inlined in the same schema
699
+ if 'type' in avro_schema and avro_schema['type'] in ['record', 'enum', 'fixed']:
700
+ namespace = avro_schema.get('namespace', current_namespace)
701
+ qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
702
+ # If this named type is already defined, return just the reference string
703
+ if qualified_name in defined_types:
704
+ return qualified_name
705
+ defined_types.add(qualified_name)
706
+
707
+ # Process record types
708
+ if 'type' in avro_schema and avro_schema['type'] == 'record' and 'fields' in avro_schema:
709
+ namespace = avro_schema.get('namespace', current_namespace)
710
+ qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
711
+ if qualified_name in tracker:
712
+ return qualified_name
713
+ tracker.add(qualified_name)
714
+ for field in avro_schema['fields']:
715
+ field['type'] = inline_avro_references(
716
+ field['type'], type_dict, namespace, tracker, defined_types)
717
+
718
+ # Handling array types
719
+ elif 'type' in avro_schema and avro_schema['type'] == 'array' and 'items' in avro_schema:
720
+ avro_schema['items'] = inline_avro_references(
721
+ avro_schema['items'], type_dict, current_namespace, tracker, defined_types)
722
+
723
+ # Handling map types
724
+ elif 'type' in avro_schema and avro_schema['type'] == 'map' and 'values' in avro_schema:
725
+ avro_schema['values'] = inline_avro_references(
726
+ avro_schema['values'], type_dict, current_namespace, tracker, defined_types)
727
+
728
+ # Inline other types, except enum and fixed
729
+ elif 'type' in avro_schema and avro_schema['type'] not in ['enum', 'fixed']:
730
+ avro_schema['type'] = inline_avro_references(
731
+ avro_schema['type'], type_dict, current_namespace, tracker, defined_types)
732
+
733
+ elif isinstance(avro_schema, list):
734
+ return [inline_avro_references(item, type_dict, current_namespace, tracker, defined_types) for item in avro_schema]
735
+
736
+ elif avro_schema in type_dict and avro_schema not in tracker and avro_schema not in defined_types:
737
+ # Inline the referenced schema if not already tracked and not defined in the current schema
738
+ # Use deepcopy to avoid mutating the original type_dict entries when modifying nested structures
739
+ inlined_schema = copy.deepcopy(type_dict[avro_schema])
740
+ if isinstance(inlined_schema, dict) and not inlined_schema.get('namespace', None):
741
+ inlined_schema['namespace'] = '.'.join(avro_schema.split('.')[:-1])
742
+ inlined_schema = inline_avro_references(
743
+ inlined_schema, type_dict, inlined_schema['namespace'], tracker, defined_types)
744
+ tracker.add(avro_schema)
745
+ return inlined_schema
746
+
747
+ return avro_schema
748
+
749
def strip_first_doc(schema) -> bool:
    """
    Remove the first ``doc`` entry found by a depth-first walk of the schema.

    Args:
        schema: A dict, list, or any other value; only dicts and lists are
            searched. The schema is modified in place.

    Returns:
        bool: True when a ``doc`` key was deleted, False when none was found.
    """
    if isinstance(schema, dict):
        if "doc" in schema:
            del schema["doc"]
            return True
        children = schema.values()
    elif isinstance(schema, list):
        children = schema
    else:
        return False
    # any() stops at the first child that reports a removal, so at most one
    # "doc" entry is deleted per call.
    return any(strip_first_doc(child) for child in children)
763
+
764
+
765
def is_type_with_alternate(avro_schema: List[Dict[str, Any]]) -> bool:
    """
    Check if the Avro schema union contains a type with a trailing alternate type.
    Alternate types are maps that mimic the structure of the original type, but
    allow for additional fields. Alternate types are labeled with an 'alternateof'
    attribute extension that points to the original type.

    Args:
        avro_schema (List[Dict[str, Any]]): The Avro schema to check. Anything
            that is not a list (e.g. a type name or a single schema dict) is
            never a type with an alternate. The argument is not modified.

    Returns:
        bool: True if, ignoring one 'null' entry, the union has exactly two
            members and contains both a dict without 'alternateof' and a dict
            with 'alternateof'; False otherwise.
    """
    # Guard before copying: strings have no .copy() and are valid Avro types.
    if not isinstance(avro_schema, list):
        return False
    members = list(avro_schema)
    if 'null' in members:
        members.remove('null')
    if len(members) != 2:
        return False
    has_original = any(
        isinstance(t, dict) and 'alternateof' not in t for t in members)
    has_alternate = any(
        isinstance(t, dict) and 'alternateof' in t for t in members)
    return has_original and has_alternate
790
+
791
def strip_alternate_type(avro_schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Remove the alternate type from an Avro schema union, in place.

    The first dict carrying an 'alternateof' attribute is removed, but only
    when the union also contains a (non-empty) dict without that attribute.

    Args:
        avro_schema (List[Dict[str, Any]]): The Avro schema union to strip.

    Returns:
        List[Dict[str, Any]]: The same list object, without the alternate type.
    """
    original_type = None
    alternate_type = None
    # Remember the first dict of each kind, in union order.
    for candidate in avro_schema:
        if not isinstance(candidate, dict):
            continue
        if 'alternateof' in candidate:
            if alternate_type is None:
                alternate_type = candidate
        elif original_type is None:
            original_type = candidate
    if original_type and alternate_type:
        avro_schema.remove(alternate_type)
    return avro_schema
806
+
807
+
808
def get_typing_args_from_string(type_str: str) -> List[str]:
    """
    Extract the top-level generic arguments from a type expression string.

    Args:
        type_str (str): A type expression such as ``Dict[str, List[int]]``.

    Returns:
        List[str]: The whitespace-stripped arguments between the outermost
            brackets, split on commas that are not inside nested brackets;
            an empty list when the string has no ``name[...]`` form.
    """
    # Captures the main type and everything between its outermost brackets.
    matched = re.match(r'([\w\.]+)\[(.+)\]', type_str)
    if matched is None:
        return []

    inner = matched.group(2)
    pieces = []
    nesting = 0
    start = 0
    for position, char in enumerate(inner):
        if char == '[':
            nesting += 1
        elif char == ']':
            nesting -= 1
        elif char == ',' and nesting == 0:
            # A comma outside any nested brackets ends the current argument.
            pieces.append(inner[start:position].strip())
            start = position + 1
    remainder = inner[start:]
    if remainder:
        pieces.append(remainder.strip())
    return pieces