structurize 2.16.2__py3-none-any.whl → 2.16.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. avrotize/__init__.py +63 -63
  2. avrotize/__main__.py +5 -5
  3. avrotize/_version.py +34 -34
  4. avrotize/asn1toavro.py +160 -160
  5. avrotize/avrotize.py +152 -152
  6. avrotize/avrotocpp.py +483 -483
  7. avrotize/avrotocsharp.py +992 -992
  8. avrotize/avrotocsv.py +121 -121
  9. avrotize/avrotodatapackage.py +173 -173
  10. avrotize/avrotodb.py +1383 -1383
  11. avrotize/avrotogo.py +476 -476
  12. avrotize/avrotographql.py +197 -197
  13. avrotize/avrotoiceberg.py +210 -210
  14. avrotize/avrotojava.py +1023 -1023
  15. avrotize/avrotojs.py +250 -250
  16. avrotize/avrotojsons.py +481 -481
  17. avrotize/avrotojstruct.py +345 -345
  18. avrotize/avrotokusto.py +363 -363
  19. avrotize/avrotomd.py +137 -137
  20. avrotize/avrotools.py +168 -168
  21. avrotize/avrotoparquet.py +208 -208
  22. avrotize/avrotoproto.py +358 -358
  23. avrotize/avrotopython.py +622 -622
  24. avrotize/avrotorust.py +435 -435
  25. avrotize/avrotots.py +598 -598
  26. avrotize/avrotoxsd.py +344 -344
  27. avrotize/commands.json +2493 -2433
  28. avrotize/common.py +828 -828
  29. avrotize/constants.py +4 -4
  30. avrotize/csvtoavro.py +131 -131
  31. avrotize/datapackagetoavro.py +76 -76
  32. avrotize/dependency_resolver.py +348 -348
  33. avrotize/jsonstoavro.py +1698 -1698
  34. avrotize/jsonstostructure.py +2642 -2642
  35. avrotize/jstructtoavro.py +878 -878
  36. avrotize/kstructtoavro.py +93 -93
  37. avrotize/kustotoavro.py +455 -455
  38. avrotize/parquettoavro.py +157 -157
  39. avrotize/proto2parser.py +497 -497
  40. avrotize/proto3parser.py +402 -402
  41. avrotize/prototoavro.py +382 -382
  42. avrotize/structuretocsharp.py +2005 -2005
  43. avrotize/structuretojsons.py +498 -498
  44. avrotize/structuretopython.py +772 -772
  45. avrotize/structuretots.py +653 -0
  46. avrotize/xsdtoavro.py +413 -413
  47. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/METADATA +848 -805
  48. structurize-2.16.5.dist-info/RECORD +52 -0
  49. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/licenses/LICENSE +200 -200
  50. structurize-2.16.2.dist-info/RECORD +0 -51
  51. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/WHEEL +0 -0
  52. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/entry_points.txt +0 -0
  53. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/top_level.txt +0 -0
avrotize/common.py CHANGED
@@ -1,829 +1,829 @@
1
- """
2
- Common utility functions for Avrotize.
3
- """
4
-
5
- # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements, line-too-long
6
-
7
- from collections import defaultdict
8
- import os
9
- import re
10
- import hashlib
11
- import json
12
- from typing import Dict, Union, Any, List
13
- from jsoncomparison import NO_DIFF, Compare
14
- import jinja2
15
-
16
-
17
- def avro_name(name):
18
- """Convert a name into an Avro name."""
19
- if isinstance(name, int):
20
- name = '_'+str(name)
21
- val = re.sub(r'[^a-zA-Z0-9_]', '_', name)
22
- # Ensure the name starts with a letter or underscore (required for valid identifiers)
23
- if re.match(r'^[0-9]', val):
24
- val = '_' + val
25
- # Additional check to ensure we always have a valid identifier
26
- if not val or not re.match(r'^[a-zA-Z_]', val):
27
- val = '_' + val
28
- return val
29
-
30
-
31
- def avro_name_with_altname(name):
32
- """
33
- Convert a name into an Avro name and return both the normalized name and alternate name info.
34
-
35
- Args:
36
- name (str): The original name to convert
37
-
38
- Returns:
39
- tuple: (normalized_name, original_name_if_different_or_None)
40
- """
41
- if isinstance(name, int):
42
- name = str(name)
43
-
44
- original_name = name
45
- normalized_name = avro_name(name)
46
-
47
- # If the normalized name is different from the original, return the original as alt name
48
- if normalized_name != original_name:
49
- return normalized_name, original_name
50
- else:
51
- return normalized_name, None
52
-
53
-
54
- def avro_namespace(name):
55
- """Convert a name into an Avro name."""
56
- val = re.sub(r'[^a-zA-Z0-9_\.]', '_', name)
57
- if re.match(r'^[0-9]', val):
58
- val = '_' + val
59
- return val
60
-
61
-
62
- def generic_type() -> list[str | dict]:
63
- """
64
- Constructs a generic Avro type for simple types, arrays, and maps.
65
-
66
- Returns:
67
- list[str | dict]: A list of simple types, arrays, and maps.
68
- """
69
- simple_type_union: list[str | dict] = [
70
- "null", "boolean", "int", "long", "float", "double", "bytes", "string"]
71
- l2 = simple_type_union.copy()
72
- l2.extend([
73
- {
74
- "type": "array",
75
- "items": simple_type_union
76
- },
77
- {
78
- "type": "map",
79
- "values": simple_type_union
80
- }])
81
- l1 = simple_type_union.copy()
82
- l1.extend([
83
- {
84
- "type": "array",
85
- "items": l2
86
- },
87
- {
88
- "type": "map",
89
- "values": l2
90
- }])
91
- return l1
92
-
93
-
94
- def is_generic_avro_type(avro_type: list) -> bool:
95
- """
96
- Check if the given Avro type is a generic type.
97
-
98
- Args:
99
- avro_type (Union[str, Dict[str, Any]]): The Avro type to check.
100
-
101
- Returns:
102
- bool: True if the Avro type is a generic type, False otherwise.
103
- """
104
- if isinstance(avro_type, str) or isinstance(avro_type, dict):
105
- return False
106
- compare_type = generic_type()
107
- return Compare().check(avro_type, compare_type) == NO_DIFF
108
-
109
-
110
- def is_generic_json_type(json_type: Dict[str, Any] | List[Dict[str, Any] | str] | str) -> bool:
111
- """
112
- Check if the given JSON type is a generic type.
113
-
114
- Args:
115
- json_type (Union[Dict[str, Any], str, List[Union[str, Dict[str, Any]]]]): The JSON type to check.
116
-
117
- Returns:
118
- bool: True if the JSON type is a generic type, False otherwise.
119
- """
120
- if isinstance(json_type, str) or isinstance(json_type, list):
121
- return False
122
- compare_type = generic_type_json()
123
- return Compare().check(json_type, compare_type) == NO_DIFF
124
-
125
-
126
- def generic_type_json() -> dict:
127
- """
128
- Returns a dictionary representing a generic JSON schema for various types.
129
-
130
- The schema includes support for boolean, integer, number, string, array, and object types.
131
- Each type can have different formats such as int32, int64, float, double, and byte.
132
-
133
- Returns:
134
- dict: A dictionary representing the generic JSON schema.
135
- """
136
- return {
137
- "oneOf": [
138
- {"type": "boolean"},
139
- {"type": "integer", "format": "int32"},
140
- {"type": "integer", "format": "int64"},
141
- {"type": "number", "format": "float"},
142
- {"type": "number", "format": "double"},
143
- {"type": "string", "format": "byte"},
144
- {"type": "string"},
145
- {
146
- "type": "array",
147
- "items": {
148
- "oneOf": [
149
- {"type": "boolean"},
150
- {"type": "integer", "format": "int32"},
151
- {"type": "integer", "format": "int64"},
152
- {"type": "number", "format": "float"},
153
- {"type": "number", "format": "double"},
154
- {"type": "string", "format": "byte"},
155
- {"type": "string"},
156
- {
157
- "type": "array",
158
- "items": {
159
- "oneOf": [
160
- {"type": "boolean"},
161
- {"type": "integer", "format": "int32"},
162
- {"type": "integer", "format": "int64"},
163
- {"type": "number", "format": "float"},
164
- {"type": "number", "format": "double"},
165
- {"type": "string", "format": "byte"},
166
- {"type": "string"}
167
- ]
168
- }
169
- },
170
- {
171
- "type": "object",
172
- "additionalProperties": {
173
- "oneOf": [
174
- {"type": "boolean"},
175
- {"type": "integer", "format": "int32"},
176
- {"type": "integer", "format": "int64"},
177
- {"type": "number", "format": "float"},
178
- {"type": "number", "format": "double"},
179
- {"type": "string", "format": "byte"},
180
- {"type": "string"}
181
- ]
182
- }
183
- }
184
- ]
185
- }
186
- },
187
- {
188
- "type": "object",
189
- "additionalProperties": {
190
- "oneOf": [
191
- {"type": "boolean"},
192
- {"type": "integer", "format": "int32"},
193
- {"type": "integer", "format": "int64"},
194
- {"type": "number", "format": "float"},
195
- {"type": "number", "format": "double"},
196
- {"type": "string", "format": "byte"},
197
- {"type": "string"},
198
- {
199
- "type": "array",
200
- "items": {
201
- "oneOf": [
202
- {"type": "boolean"},
203
- {"type": "integer", "format": "int32"},
204
- {"type": "integer", "format": "int64"},
205
- {"type": "number", "format": "float"},
206
- {"type": "number", "format": "double"},
207
- {"type": "string", "format": "byte"},
208
- {"type": "string"}
209
- ]
210
- }
211
- },
212
- {
213
- "type": "object",
214
- "additionalProperties": {
215
- "oneOf": [
216
- {"type": "boolean"},
217
- {"type": "integer", "format": "int32"},
218
- {"type": "integer", "format": "int64"},
219
- {"type": "number", "format": "float"},
220
- {"type": "number", "format": "double"},
221
- {"type": "string", "format": "byte"},
222
- {"type": "string"}
223
- ]
224
- }
225
- }
226
- ]
227
- }
228
- }
229
- ]
230
- }
231
-
232
-
233
- def find_schema_node(test, avro_schema, recursion_stack=None):
234
- """
235
- Find the first schema node in the avro_schema matching the test
236
-
237
- Args:
238
- test (Callable): The test function.
239
- avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to search.
240
- recursion_stack (List[Union[Dict[str, Any], List[Dict[str, Any]]], optional): The recursion stack. Defaults to None.
241
-
242
- Returns:
243
- Union[Dict[str, Any], None]: The schema node if found, otherwise None.
244
- """
245
- if recursion_stack is None:
246
- recursion_stack = []
247
- for recursion_item in recursion_stack:
248
- if avro_schema is recursion_item:
249
- raise ValueError('Cyclical reference detected in schema')
250
- if len(recursion_stack) > 50:
251
- raise ValueError('Maximum recursion depth 50 exceeded in schema')
252
- try:
253
- recursion_stack.append(avro_schema)
254
- if isinstance(avro_schema, dict):
255
- test_node = test(avro_schema)
256
- if test_node:
257
- return avro_schema
258
- for _, v in avro_schema.items():
259
- if isinstance(v, (dict, list)):
260
- node = find_schema_node(test, v, recursion_stack)
261
- if node:
262
- return node
263
- elif isinstance(avro_schema, list):
264
- for item in avro_schema:
265
- if isinstance(item, (dict, list)):
266
- node = find_schema_node(test, item, recursion_stack)
267
- if node:
268
- return node
269
- return None
270
- finally:
271
- recursion_stack.pop()
272
-
273
-
274
- def set_schema_node(test, replacement, avro_schema):
275
- """
276
- Set the first schema node in the avro_schema matching the test to the replacement
277
-
278
- Args:
279
- test (Callable): The test function.
280
- replacement (Dict[str, Any]): The replacement schema.
281
- avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to search.
282
-
283
- Returns:
284
- None
285
- """
286
- if isinstance(avro_schema, dict):
287
- test_node = test(avro_schema)
288
- if test_node:
289
- avro_schema.clear()
290
- avro_schema.update(replacement)
291
- return
292
- for k, v in avro_schema.items():
293
- if isinstance(v, (dict, list)):
294
- set_schema_node(test, replacement, v)
295
- elif isinstance(avro_schema, list):
296
- for item in avro_schema:
297
- set_schema_node(test, replacement, item)
298
-
299
-
300
- class NodeHash:
301
- """ A hash value and count for a JSON object. """
302
- def __init__(self: 'NodeHash', hash_value: bytes, count: int):
303
- self.hash_value: bytes = hash_value
304
- self.count: int = count
305
-
306
-
307
- class NodeHashReference:
308
- """ A reference to a JSON object with a hash value and count."""
309
- def __init__(self, hash_and_count: NodeHash, value, path):
310
- self.hash_value: bytes = hash_and_count.hash_value
311
- self.count: int = hash_and_count.count
312
- self.value: Any = value
313
- self.path: str = path
314
-
315
-
316
- def get_tree_hash(json_obj: Union[dict, list]) -> NodeHash:
317
- """
318
- Generate a hash from a JSON object (dict or list).
319
-
320
- Args:
321
- json_obj (Union[dict, list]): The JSON object to hash.
322
-
323
- Returns:
324
- NodeHash: The hash value and count.
325
- """
326
- if isinstance(json_obj, dict) or isinstance(json_obj, list):
327
- s = json.dumps(json_obj, sort_keys=True).encode('utf-8')
328
- return NodeHash(hashlib.sha256(s).digest(), len(s))
329
- else:
330
- s = json.dumps(json_obj).encode('utf-8')
331
- return NodeHash(hashlib.sha256(s).digest(), len(s))
332
-
333
-
334
- def build_tree_hash_list(json_obj: Union[dict, list], path: str = '') -> Dict[str, NodeHashReference]:
335
- """
336
- Build a flat dictionary of hashes for a JSON object.
337
- The keys are JSON Path expressions, and the values are the hashes.
338
-
339
- Args:
340
- json_obj (Union[dict, list]): The JSON object to hash.
341
- path (str): The current JSON Path expression. Defaults to ''.
342
-
343
- Returns:
344
- Dict[str, NodeHashReference]: A dictionary of JSON Path expressions and hashes.
345
- """
346
-
347
- def has_nested_structure(obj: Union[dict, list]) -> bool:
348
- """
349
- Check if the object (list or dict) contains any nested lists or dicts.
350
- """
351
- if isinstance(obj, dict):
352
- return any(isinstance(value, (dict, list)) for value in obj.values())
353
- elif isinstance(obj, list):
354
- return any(isinstance(item, (dict, list)) for item in obj)
355
- return False
356
-
357
- tree_hash = {}
358
- if isinstance(json_obj, dict):
359
- for key, value in json_obj.items():
360
- new_path = f'{path}.{key}' if path else f'$.{key}'
361
- if isinstance(value, dict) and has_nested_structure(value):
362
- inner_hashes = build_tree_hash_list(value, new_path)
363
- for inner_path, hash_reference in inner_hashes.items():
364
- tree_hash[inner_path] = hash_reference
365
- hash_value = get_tree_hash(value)
366
- tree_hash[new_path] = NodeHashReference(hash_value, value, new_path)
367
- elif isinstance(json_obj, list):
368
- for index, item in enumerate(json_obj):
369
- new_path = f"{path}[{index}]"
370
- if isinstance(item, (dict, list)) and has_nested_structure(item):
371
- inner_hashes = build_tree_hash_list(item, new_path)
372
- for inner_path, hash_reference in inner_hashes.items():
373
- tree_hash[inner_path] = hash_reference
374
- return tree_hash
375
-
376
-
377
- def group_by_hash(tree_hash_list: Dict[str, NodeHashReference]) -> Dict[bytes, list]:
378
- """
379
- Group JSON Path expressions by their hash values.
380
-
381
- Args:
382
- tree_hash_list (Dict[str, NodeHashReference]): A dictionary of JSON Path expressions and hashes.
383
-
384
- Returns:
385
- Dict[bytes, list]: A dictionary of hash values and lists of JSON Path expressions.
386
- """
387
- hash_groups = defaultdict(list)
388
- for _, hash_reference in tree_hash_list.items():
389
- hash_groups[hash_reference.hash_value].append(hash_reference)
390
-
391
- # Filter out unique hashes to only return groups with more than one path
392
- for k in list(hash_groups.keys()):
393
- if len(hash_groups[k]) == 1:
394
- del hash_groups[k]
395
- return hash_groups
396
-
397
-
398
- def pascal(string):
399
- """
400
- Convert a string to PascalCase from snake_case, camelCase, or PascalCase.
401
- The string can contain dots or double colons, which are preserved in the output.
402
- Underscores at the beginning of the string are preserved in the output, but
403
- underscores in the middle of the string are removed.
404
-
405
- Args:
406
- string (str): The string to convert.
407
-
408
- Returns:
409
- str: The string in PascalCase.
410
- """
411
- if '::' in string:
412
- strings = string.split('::')
413
- return strings[0] + '::' + '::'.join(pascal(s) for s in strings[1:])
414
- if '.' in string:
415
- strings = string.split('.')
416
- return '.'.join(pascal(s) for s in strings)
417
- if not string or len(string) == 0:
418
- return string
419
- words = []
420
- startswith_under = string[0] == '_'
421
- if '_' in string:
422
- # snake_case
423
- words = re.split(r'_', string)
424
- elif string[0].isupper():
425
- # PascalCase
426
- words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
427
- else:
428
- # camelCase
429
- words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
430
- result = ''.join(word.capitalize() for word in words)
431
- if startswith_under:
432
- result = '_' + result
433
- return result
434
-
435
-
436
- def camel(string):
437
- """
438
- Convert a string to camelCase from snake_case, camelCase, or PascalCase.
439
- The string can contain dots or double colons, which are preserved in the output.
440
- Underscores at the beginning of the string are preserved in the output, but
441
- underscores in the middle of the string are removed.
442
-
443
- Args:
444
- string (str): The string to convert.
445
-
446
- Returns:
447
- str: The string in camelCase.
448
- """
449
- if '::' in string:
450
- strings = string.split('::')
451
- return strings[0] + '::' + '::'.join(camel(s) for s in strings[1:])
452
- if '.' in string:
453
- strings = string.split('.')
454
- return '.'.join(camel(s) for s in strings)
455
- if not string or len(string) == 0:
456
- return string
457
- words = []
458
- if '_' in string:
459
- # snake_case
460
- words = re.split(r'_', string)
461
- elif string[0].isupper():
462
- # PascalCase
463
- words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
464
- else:
465
- # camelCase
466
- words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
467
- result = words[0].lower() + ''.join(word.capitalize()
468
- for word in words[1:])
469
- return result
470
-
471
-
472
- def snake(string):
473
- """
474
- Convert a string to snake_case from snake_case, camelCase, or PascalCase.
475
- The string can contain dots or double colons, which are preserved in the output.
476
- Underscores at the beginning of the string are preserved in the output, but
477
- underscores in the middle of the string are removed.
478
-
479
- Args:
480
- string (str): The string to convert.
481
-
482
- Returns:
483
- str: The string in snake_case.
484
- """
485
- if '::' in string:
486
- strings = string.split('::')
487
- return strings[0] + '::' + '::'.join(snake(s) for s in strings[1:])
488
- if '.' in string:
489
- strings = string.split('.')
490
- return '.'.join(snake(s) for s in strings)
491
- if not string or len(string) == 0:
492
- return string
493
- words = []
494
- if '_' in string:
495
- # snake_case
496
- words = re.split(r'_', string)
497
- elif string[0].isupper():
498
- # PascalCase
499
- words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
500
- else:
501
- # camelCase
502
- words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
503
- result = '_'.join(word.lower() for word in words)
504
- return result
505
-
506
-
507
- def fullname(avro_schema: dict| str, parent_namespace: str = '') -> str:
508
- """
509
- Constructs the full name of the Avro schema.
510
-
511
- Args:
512
- avro_schema (dict): The Avro schema.
513
-
514
- Returns:
515
- str: The full name of the Avro schema.
516
- """
517
- if isinstance(avro_schema, str):
518
- if not '.' in avro_schema and parent_namespace:
519
- return parent_namespace + '.' + avro_schema
520
- return avro_schema
521
- name = avro_schema.get("name", "")
522
- namespace = avro_schema.get("namespace", parent_namespace)
523
- return namespace + "." + name if namespace else name
524
-
525
-
526
- def altname(schema_obj: dict, purpose: str):
527
- """
528
- Retrieves the alternative name for a given purpose from the schema object.
529
-
530
- Args:
531
- schema_obj (dict): The schema object (record or field).
532
- default_name (str): The default name.
533
- purpose (str): The purpose for the alternative name (e.g., 'sql').
534
-
535
- Returns:
536
- str: The alternative name if present, otherwise the default name.
537
- """
538
- if "altnames" in schema_obj and purpose in schema_obj["altnames"]:
539
- return schema_obj["altnames"][purpose]
540
- return schema_obj["name"]
541
-
542
-
543
- def process_template(file_path: str, **kvargs) -> str:
544
- """
545
- Process a file as a Jinja2 template with the given object as input.
546
-
547
- Args:
548
- file_path (str): The path to the file.
549
- obj (Any): The object to use as input for the template.
550
-
551
- Returns:
552
- str: The processed template as a string.
553
- """
554
- # Load the template environment
555
- file_dir = os.path.dirname(__file__)
556
- template_loader = jinja2.FileSystemLoader(searchpath=file_dir)
557
- template_env = jinja2.Environment(loader=template_loader)
558
- template_env.filters['pascal'] = pascal
559
- template_env.filters['camel'] = camel
560
-
561
- # Load the template from the file
562
- template = template_env.get_template(file_path)
563
-
564
- # Render the template with the object as input
565
- output = template.render(**kvargs)
566
-
567
- return output
568
-
569
-
570
- def render_template(template: str, output: str, **kvargs):
571
- """
572
- Render a template and write it to a file
573
-
574
- Args:
575
- template (str): The template to render.
576
- output (str): The output file path.
577
- **kvargs: The keyword arguments to pass to the template.
578
-
579
- Returns:
580
- None
581
- """
582
- out = process_template(template, **kvargs)
583
- # make sure the directory exists
584
- os.makedirs(os.path.dirname(output), exist_ok=True)
585
- with open(output, 'w', encoding='utf-8') as f:
586
- f.write(out)
587
-
588
-
589
- def get_longest_namespace_prefix(schema):
590
- """ Get the longest common prefix for the namespace of all types in the schema. """
591
- namespaces = set(collect_namespaces(schema))
592
- longest_common_prefix = ''
593
- # find longest common prefix of the namespaces (not with os.path!!!)
594
- for ns in namespaces:
595
- if not longest_common_prefix:
596
- longest_common_prefix = ns
597
- else:
598
- for i in range(min(len(longest_common_prefix), len(ns))):
599
- if longest_common_prefix[i] != ns[i]:
600
- longest_common_prefix = longest_common_prefix[:i]
601
- break
602
- return longest_common_prefix.strip('.')
603
-
604
-
605
- def collect_namespaces(schema: Any, parent_namespace: str = '') -> List[str]:
606
- """ Performs a deep search of the schema to collect all namespaces """
607
- namespaces = []
608
- if isinstance(schema, dict):
609
- namespace = str(schema.get('namespace', parent_namespace))
610
- if namespace:
611
- namespaces.append(namespace)
612
- if 'fields' in schema and isinstance(schema['fields'], list):
613
- for field in schema['fields']:
614
- if isinstance(field, dict) and 'type' in field and isinstance(field['type'], dict):
615
- namespaces.extend(collect_namespaces(
616
- field['type'], namespace))
617
- namespaces.extend(collect_namespaces(field, namespace))
618
- if 'items' in schema and isinstance(schema['items'], dict):
619
- namespaces.extend(collect_namespaces(schema['items'], namespace))
620
- if 'values' in schema and isinstance(schema['values'], dict):
621
- namespaces.extend(collect_namespaces(schema['values'], namespace))
622
- elif isinstance(schema, list):
623
- for item in schema:
624
- namespaces.extend(collect_namespaces(item, parent_namespace))
625
- return namespaces
626
-
627
-
628
- def build_flat_type_dict(avro_schema) -> Dict[str, Dict]:
629
- """Builds a flat dictionary of all named types in the main schema."""
630
- type_dict = {}
631
-
632
- def add_to_dict(schema, namespace):
633
- if isinstance(schema, dict):
634
- schema_type = schema.get('type')
635
- name = schema.get('name')
636
- namespace = schema.get('namespace', namespace)
637
- if schema_type in ['record', 'enum', 'fixed'] and name:
638
- qualified_name = f"{namespace}.{name}" if namespace else name
639
- type_dict[qualified_name] = schema
640
- if schema_type == 'record':
641
- for field in schema.get('fields', []):
642
- field_type = field.get('type')
643
- add_to_dict(field_type, namespace)
644
- elif schema_type == 'array':
645
- add_to_dict(schema.get('items'), namespace)
646
- elif schema_type == 'map':
647
- add_to_dict(schema.get('values'), namespace)
648
- elif isinstance(schema, list):
649
- for item in schema:
650
- add_to_dict(item, namespace)
651
-
652
- if isinstance(avro_schema, dict):
653
- add_to_dict(avro_schema, avro_schema.get('namespace', ''))
654
- elif isinstance(avro_schema, list):
655
- for schema in avro_schema:
656
- schema_namespace = schema.get('namespace', '')
657
- add_to_dict(schema, schema_namespace)
658
- return type_dict
659
-
660
-
661
- def evict_tracked_references(avro_schema, parent_namespace, tracker):
662
- """ Evicts all tracked references in the Avro schema. """
663
- if isinstance(avro_schema, dict):
664
- if 'type' in avro_schema and (avro_schema['type'] == 'record' or avro_schema['type'] == 'enum' or avro_schema['type'] == 'fixed'):
665
- namespace = avro_schema.get('namespace', parent_namespace)
666
- qualified_name = (
667
- namespace + '.' if namespace else '') + avro_schema['name']
668
- if not qualified_name in tracker:
669
- if 'fields' in avro_schema:
670
- for field in avro_schema['fields']:
671
- field['type'] = evict_tracked_references(
672
- field['type'], namespace, tracker)
673
- return avro_schema
674
- else:
675
- return qualified_name
676
- # Handling array types
677
- elif 'type' in avro_schema and avro_schema['type'] == 'array' and 'items' in avro_schema:
678
- avro_schema['items'] = evict_tracked_references(
679
- avro_schema['items'], parent_namespace, tracker)
680
- # Handling map types
681
- elif 'type' in avro_schema and avro_schema['type'] == 'map' and 'values' in avro_schema:
682
- avro_schema['values'] = evict_tracked_references(
683
- avro_schema['values'], parent_namespace, tracker)
684
- elif isinstance(avro_schema, list):
685
- return [evict_tracked_references(item, parent_namespace, tracker) for item in avro_schema]
686
- return avro_schema
687
-
688
-
689
- def inline_avro_references(avro_schema, type_dict, current_namespace, tracker=None, defined_types=None):
690
- """ Inlines the first reference to a type in the Avro schema. """
691
- if tracker is None:
692
- tracker = set()
693
- if defined_types is None:
694
- defined_types = set()
695
-
696
- if isinstance(avro_schema, dict):
697
- # Register the type if it's a record, enum, or fixed and is inlined in the same schema
698
- if 'type' in avro_schema and avro_schema['type'] in ['record', 'enum', 'fixed']:
699
- namespace = avro_schema.get('namespace', current_namespace)
700
- qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
701
- defined_types.add(qualified_name)
702
-
703
- # Process record types
704
- if 'type' in avro_schema and avro_schema['type'] == 'record' and 'fields' in avro_schema:
705
- namespace = avro_schema.get('namespace', current_namespace)
706
- qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
707
- if qualified_name in tracker:
708
- return qualified_name
709
- tracker.add(qualified_name)
710
- for field in avro_schema['fields']:
711
- field['type'] = inline_avro_references(
712
- field['type'], type_dict, namespace, tracker, defined_types)
713
-
714
- # Handling array types
715
- elif 'type' in avro_schema and avro_schema['type'] == 'array' and 'items' in avro_schema:
716
- avro_schema['items'] = inline_avro_references(
717
- avro_schema['items'], type_dict, current_namespace, tracker, defined_types)
718
-
719
- # Handling map types
720
- elif 'type' in avro_schema and avro_schema['type'] == 'map' and 'values' in avro_schema:
721
- avro_schema['values'] = inline_avro_references(
722
- avro_schema['values'], type_dict, current_namespace, tracker, defined_types)
723
-
724
- # Inline other types, except enum and fixed
725
- elif 'type' in avro_schema and avro_schema['type'] not in ['enum', 'fixed']:
726
- avro_schema['type'] = inline_avro_references(
727
- avro_schema['type'], type_dict, current_namespace, tracker, defined_types)
728
-
729
- elif isinstance(avro_schema, list):
730
- return [inline_avro_references(item, type_dict, current_namespace, tracker, defined_types) for item in avro_schema]
731
-
732
- elif avro_schema in type_dict and avro_schema not in tracker and avro_schema not in defined_types:
733
- # Inline the referenced schema if not already tracked and not defined in the current schema
734
- inlined_schema = type_dict[avro_schema].copy()
735
- if isinstance(inlined_schema, dict) and not inlined_schema.get('namespace', None):
736
- inlined_schema['namespace'] = '.'.join(avro_schema.split('.')[:-1])
737
- inlined_schema = inline_avro_references(
738
- inlined_schema, type_dict, inlined_schema['namespace'], tracker, defined_types)
739
- tracker.add(avro_schema)
740
- return inlined_schema
741
-
742
- return avro_schema
743
-
744
- def strip_first_doc(schema) -> bool:
745
- """ strip the first doc field anywhere in the schema"""
746
- if isinstance(schema, dict):
747
- if "doc" in schema:
748
- del schema["doc"]
749
- return True
750
- for key in schema:
751
- if strip_first_doc(schema[key]):
752
- return True
753
- elif isinstance(schema, list):
754
- for item in schema:
755
- if strip_first_doc(item):
756
- return True
757
- return False
758
-
759
-
760
- def is_type_with_alternate(avro_schema: List[Dict[str, Any]]) -> bool:
761
- """
762
- Check if the Avro schema union contains a type with a trailing alternate type.
763
- Alternate types are maps that mimic the structure of the original type, but
764
- allow for additional fields. Alternate types are labeled with an 'alternateof'
765
- attribute extension that points to the original type.
766
-
767
- Args:
768
- avro_schema (List[Dict[str, Any]]): The Avro schema to check.
769
-
770
- Returns:
771
- bool: True if the Avro schema contains a type with an alternate name, False otherwise.
772
- """
773
- avro_schema = avro_schema.copy()
774
- if not isinstance(avro_schema, list):
775
- return False
776
- if 'null' in avro_schema:
777
- avro_schema.remove('null')
778
- if len(avro_schema) != 2:
779
- return False
780
- original_type = any(t for t in avro_schema if isinstance(t, dict) and not 'alternateof' in t)
781
- alternate_type = any(t for t in avro_schema if isinstance(t, dict) and 'alternateof' in t)
782
- if original_type and alternate_type:
783
- return True
784
- return False
785
-
786
- def strip_alternate_type(avro_schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
787
- """
788
- Strips the alternate type from the Avro schema union.
789
-
790
- Args:
791
- avro_schema (List[Dict[str, Any]]): The Avro schema to strip.
792
-
793
- Returns:
794
- List[Dict[str, Any]]: The Avro schema without the alternate type.
795
- """
796
- original_type = next((t for t in avro_schema if isinstance(t, dict) and not 'alternateof' in t), None)
797
- alternate_type = next((t for t in avro_schema if isinstance(t, dict) and 'alternateof' in t), None)
798
- if original_type and alternate_type:
799
- avro_schema.remove(alternate_type)
800
- return avro_schema
801
-
802
-
803
- def get_typing_args_from_string(type_str: str) -> List[str]:
804
- """ gets the list of generic arguments of a type. """
805
- # This regex captures the main type and its generic arguments
806
- pattern = re.compile(r'([\w\.]+)\[(.+)\]')
807
- match = pattern.match(type_str)
808
-
809
- if not match:
810
- return []
811
-
812
- _, args_str = match.groups()
813
- # Splitting the arguments while considering nested generic types
814
- args = []
815
- depth = 0
816
- current_arg:List[str] = []
817
- for char in args_str:
818
- if char == ',' and depth == 0:
819
- args.append(''.join(current_arg).strip())
820
- current_arg = []
821
- else:
822
- if char == '[':
823
- depth += 1
824
- elif char == ']':
825
- depth -= 1
826
- current_arg.append(char)
827
- if current_arg:
828
- args.append(''.join(current_arg).strip())
1
+ """
2
+ Common utility functions for Avrotize.
3
+ """
4
+
5
+ # pylint: disable=too-many-arguments, too-many-locals, too-many-branches, too-many-statements, line-too-long
6
+
7
+ from collections import defaultdict
8
+ import os
9
+ import re
10
+ import hashlib
11
+ import json
12
+ from typing import Dict, Union, Any, List
13
+ from jsoncomparison import NO_DIFF, Compare
14
+ import jinja2
15
+
16
+
17
def avro_name(name):
    """
    Convert an arbitrary name into a valid Avro name.

    Every character outside ``[a-zA-Z0-9_]`` is replaced by an underscore, and
    an underscore is prepended when the result is empty or starts with a digit.

    Args:
        name (str | int): The name to convert. Integers are rendered as their
            decimal text with a leading underscore.

    Returns:
        str: A name made of letters, digits and underscores that starts with a
            letter or an underscore.
    """
    text = '_' + str(name) if isinstance(name, int) else name
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', text)
    # Only ASCII letters, digits and underscores survive the substitution, so
    # a failed match here can only mean "empty" or "starts with a digit".
    if re.match(r'^[a-zA-Z_]', sanitized) is None:
        sanitized = '_' + sanitized
    return sanitized
29
+
30
+
31
def avro_name_with_altname(name):
    """
    Normalize a name with ``avro_name`` and report the original when it changed.

    Args:
        name (str | int): The original name to convert. Integers are converted
            to their decimal text before normalization.

    Returns:
        tuple: ``(normalized_name, original_name)`` when normalization altered
            the name, otherwise ``(normalized_name, None)``.
    """
    original = str(name) if isinstance(name, int) else name
    normalized = avro_name(original)
    alternate = original if normalized != original else None
    return normalized, alternate
52
+
53
+
54
def avro_namespace(name):
    """
    Convert a name into an Avro namespace.

    Characters other than letters, digits, underscores and dots are replaced
    by underscores; a leading digit is guarded with an underscore. Only the
    very first character is checked, not the first character of each
    dot-separated segment.

    Args:
        name (str): The namespace text to convert.

    Returns:
        str: The sanitized namespace.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9_\.]', '_', name)
    return '_' + cleaned if re.match(r'^[0-9]', cleaned) else cleaned
60
+
61
+
62
+ def generic_type() -> list[str | dict]:
63
+ """
64
+ Constructs a generic Avro type for simple types, arrays, and maps.
65
+
66
+ Returns:
67
+ list[str | dict]: A list of simple types, arrays, and maps.
68
+ """
69
+ simple_type_union: list[str | dict] = [
70
+ "null", "boolean", "int", "long", "float", "double", "bytes", "string"]
71
+ l2 = simple_type_union.copy()
72
+ l2.extend([
73
+ {
74
+ "type": "array",
75
+ "items": simple_type_union
76
+ },
77
+ {
78
+ "type": "map",
79
+ "values": simple_type_union
80
+ }])
81
+ l1 = simple_type_union.copy()
82
+ l1.extend([
83
+ {
84
+ "type": "array",
85
+ "items": l2
86
+ },
87
+ {
88
+ "type": "map",
89
+ "values": l2
90
+ }])
91
+ return l1
92
+
93
+
94
def is_generic_avro_type(avro_type: list) -> bool:
    """
    Tell whether an Avro type is the generic union built by ``generic_type``.

    Args:
        avro_type (Union[str, Dict[str, Any], list]): The Avro type to check.
            Strings and dicts are never considered generic.

    Returns:
        bool: True when comparing ``avro_type`` with ``generic_type()``
            reports no differences, False otherwise.
    """
    if isinstance(avro_type, (str, dict)):
        return False
    difference = Compare().check(avro_type, generic_type())
    return difference == NO_DIFF
108
+
109
+
110
def is_generic_json_type(json_type: Dict[str, Any] | List[Dict[str, Any] | str] | str) -> bool:
    """
    Tell whether a JSON schema type is the generic schema built by ``generic_type_json``.

    Args:
        json_type (Union[Dict[str, Any], str, List[Union[str, Dict[str, Any]]]]):
            The JSON type to check. Strings and lists are never considered generic.

    Returns:
        bool: True when comparing ``json_type`` with ``generic_type_json()``
            reports no differences, False otherwise.
    """
    if isinstance(json_type, (str, list)):
        return False
    difference = Compare().check(json_type, generic_type_json())
    return difference == NO_DIFF
124
+
125
+
126
def generic_type_json() -> dict:
    """
    Return the generic JSON schema counterpart of the generic Avro union.

    The schema is a ``oneOf`` over boolean, int32/int64 integers, float/double
    numbers, byte strings and plain strings, plus arrays and objects whose
    members are those scalars or (one level deeper) arrays/objects of scalars.

    Returns:
        dict: A newly built schema; no nested object is shared between calls
            or between branches.
    """
    def scalar_options() -> list:
        # Rebuilt on every call so no branch shares list or dict objects.
        return [
            {"type": "boolean"},
            {"type": "integer", "format": "int32"},
            {"type": "integer", "format": "int64"},
            {"type": "number", "format": "float"},
            {"type": "number", "format": "double"},
            {"type": "string", "format": "byte"},
            {"type": "string"},
        ]

    def containers_of(options_factory) -> list:
        # An array and an object whose members are drawn from the given options.
        return [
            {"type": "array", "items": {"oneOf": options_factory()}},
            {"type": "object", "additionalProperties": {"oneOf": options_factory()}},
        ]

    def nested_options() -> list:
        # Scalars plus arrays/objects of scalars (the innermost level).
        return scalar_options() + containers_of(scalar_options)

    return {"oneOf": scalar_options() + containers_of(nested_options)}
231
+
232
+
233
def find_schema_node(test, avro_schema, recursion_stack=None):
    """
    Depth-first search for the first dict node accepted by ``test``.

    Args:
        test (Callable): Predicate called with each dict node.
        avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to search.
        recursion_stack (list, optional): Nodes currently being visited; used
            for cycle and depth detection. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: The matching node, or None when nothing matches.

    Raises:
        ValueError: If the schema contains itself or nests deeper than the
            allowed depth.
    """
    stack = [] if recursion_stack is None else recursion_stack
    # Identity, not equality: equal-but-distinct subtrees are legitimate, only
    # meeting the very same object again means the schema is cyclic.
    if any(avro_schema is ancestor for ancestor in stack):
        raise ValueError('Cyclical reference detected in schema')
    if len(stack) > 50:
        raise ValueError('Maximum recursion depth 50 exceeded in schema')
    try:
        stack.append(avro_schema)
        if isinstance(avro_schema, dict):
            if test(avro_schema):
                return avro_schema
            children = avro_schema.values()
        elif isinstance(avro_schema, list):
            children = avro_schema
        else:
            children = ()
        for child in children:
            if not isinstance(child, (dict, list)):
                continue
            found = find_schema_node(test, child, stack)
            if found:
                return found
        return None
    finally:
        # Always unwind so sibling branches do not see this node as an ancestor.
        stack.pop()
272
+
273
+
274
def set_schema_node(test, replacement, avro_schema):
    """
    Overwrite, in place, the content of every dict node accepted by ``test``.

    The walk does not stop after the first match; it only refrains from
    descending into a node it has just replaced.

    Args:
        test (Callable): Predicate called with each dict node.
        replacement (Dict[str, Any]): The content copied into matching nodes.
        avro_schema (Union[Dict[str, Any], List[Dict[str, Any]]]): The Avro schema to update.

    Returns:
        None
    """
    if isinstance(avro_schema, list):
        for element in avro_schema:
            set_schema_node(test, replacement, element)
        return
    if not isinstance(avro_schema, dict):
        return
    if test(avro_schema):
        # Mutate the existing dict so references held elsewhere see the change.
        avro_schema.clear()
        avro_schema.update(replacement)
        return
    for child in avro_schema.values():
        if isinstance(child, (dict, list)):
            set_schema_node(test, replacement, child)
298
+
299
+
300
class NodeHash:
    """A hash value paired with a count for a JSON object."""

    def __init__(self: 'NodeHash', hash_value: bytes, count: int):
        """
        Store the hash and its associated count.

        Args:
            hash_value (bytes): The hash bytes.
            count (int): The count recorded alongside the hash.
        """
        self.hash_value: bytes = hash_value
        self.count: int = count
305
+
306
+
307
class NodeHashReference:
    """A JSON value together with its path, hash value and count."""

    def __init__(self, hash_and_count: NodeHash, value, path):
        """
        Copy the hash and count from ``hash_and_count`` and remember the value and path.

        Args:
            hash_and_count (NodeHash): Source of ``hash_value`` and ``count``.
            value (Any): The referenced JSON value.
            path (str): Where the value was found.
        """
        self.hash_value: bytes = hash_and_count.hash_value
        self.count: int = hash_and_count.count
        self.value: Any = value
        self.path: str = path
314
+
315
+
316
def get_tree_hash(json_obj: Union[dict, list]) -> NodeHash:
    """
    Hash the JSON serialization of a value with SHA-256.

    Dicts and lists are serialized with sorted keys, so key order does not
    influence their hash.

    Args:
        json_obj (Union[dict, list]): The JSON value to hash.

    Returns:
        NodeHash: The SHA-256 digest and the byte length of the UTF-8 encoded
            serialization.
    """
    is_container = isinstance(json_obj, (dict, list))
    serialized = json.dumps(json_obj, sort_keys=is_container).encode('utf-8')
    digest = hashlib.sha256(serialized).digest()
    return NodeHash(digest, len(serialized))
332
+
333
+
334
def build_tree_hash_list(json_obj: Union[dict, list], path: str = '') -> Dict[str, NodeHashReference]:
    """
    Build a flat dictionary of hash references for the nested dicts of a JSON object.

    Only dict values that themselves contain a dict or list are recorded.
    List items are descended into but are not recorded themselves.

    Args:
        json_obj (Union[dict, list]): The JSON object to walk.
        path (str): The JSON Path of ``json_obj``. Defaults to '' (the root).

    Returns:
        Dict[str, NodeHashReference]: Hash references keyed by JSON Path expression.
    """
    # NOTE(review): the nesting of the hash-recording statements was
    # reconstructed from a view without indentation - confirm against the file.

    def has_nested_structure(obj: Union[dict, list]) -> bool:
        """Check whether a dict or list directly contains another dict or list."""
        if isinstance(obj, dict):
            members = obj.values()
        elif isinstance(obj, list):
            members = obj
        else:
            return False
        return any(isinstance(member, (dict, list)) for member in members)

    references = {}
    if isinstance(json_obj, dict):
        for key, value in json_obj.items():
            child_path = f'{path}.{key}' if path else f'$.{key}'
            if not (isinstance(value, dict) and has_nested_structure(value)):
                continue
            # Descendants first, then the node itself, to keep insertion order.
            references.update(build_tree_hash_list(value, child_path))
            references[child_path] = NodeHashReference(get_tree_hash(value), value, child_path)
    elif isinstance(json_obj, list):
        for index, item in enumerate(json_obj):
            child_path = f"{path}[{index}]"
            if isinstance(item, (dict, list)) and has_nested_structure(item):
                references.update(build_tree_hash_list(item, child_path))
    return references
375
+
376
+
377
def group_by_hash(tree_hash_list: Dict[str, NodeHashReference]) -> Dict[bytes, list]:
    """
    Group hash references that share the same hash value.

    Args:
        tree_hash_list (Dict[str, NodeHashReference]): Hash references keyed by JSON Path.

    Returns:
        Dict[bytes, list]: For each hash value that occurs more than once, the
            list of references carrying it. Hashes seen only once are dropped.
    """
    groups = defaultdict(list)
    for reference in tree_hash_list.values():
        groups[reference.hash_value].append(reference)

    # Collect the keys first; deleting while iterating the dict is not allowed.
    singletons = [digest for digest, members in groups.items() if len(members) == 1]
    for digest in singletons:
        del groups[digest]
    return groups
396
+
397
+
398
def pascal(string):
    """
    Convert a string to PascalCase from snake_case, camelCase, or PascalCase.

    Dots and double colons are kept as separators and each part is converted
    on its own, except that the part before the first ``::`` is left as is.
    A single leading underscore is preserved; other underscores are removed.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in PascalCase.
    """
    if '::' in string:
        head, *tail = string.split('::')
        return head + '::' + '::'.join(pascal(part) for part in tail)
    if '.' in string:
        return '.'.join(pascal(part) for part in string.split('.'))
    if not string:
        return string
    if '_' in string:
        # snake_case
        words = string.split('_')
    elif string[0].isupper():
        # PascalCase
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    joined = ''.join(map(str.capitalize, words))
    return '_' + joined if string[0] == '_' else joined
434
+
435
+
436
def camel(string):
    """
    Convert a string to camelCase from snake_case, camelCase, or PascalCase.

    Dots and double colons are kept as separators and each part is converted
    on its own, except that the part before the first ``::`` is left as is.
    Underscores are removed, including a leading one (``'_x'`` becomes ``'X'``).

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in camelCase. An empty string is returned when the
            input contains no recognizable word (for example ``'$'``).
    """
    if '::' in string:
        strings = string.split('::')
        return strings[0] + '::' + '::'.join(camel(s) for s in strings[1:])
    if '.' in string:
        return '.'.join(camel(s) for s in string.split('.'))
    if not string:
        return string
    if '_' in string:
        # snake_case
        words = string.split('_')
    elif string[0].isupper():
        # PascalCase
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    if not words:
        # Nothing matched (e.g. only punctuation or non-ASCII capitals); the
        # sibling converters yield '' here instead of raising IndexError.
        return ''
    return words[0].lower() + ''.join(word.capitalize() for word in words[1:])
470
+
471
+
472
def snake(string):
    """
    Convert a string to snake_case from snake_case, camelCase, or PascalCase.

    Dots and double colons are kept as separators and each part is converted
    on its own, except that the part before the first ``::`` is left as is.
    Existing underscores are kept as word separators.

    Args:
        string (str): The string to convert.

    Returns:
        str: The string in snake_case.
    """
    if '::' in string:
        head, *tail = string.split('::')
        return head + '::' + '::'.join(snake(part) for part in tail)
    if '.' in string:
        return '.'.join(snake(part) for part in string.split('.'))
    if not string:
        return string
    if '_' in string:
        # snake_case
        words = string.split('_')
    elif string[0].isupper():
        # PascalCase
        words = re.findall(r'[A-Z][a-z0-9_]*\.?', string)
    else:
        # camelCase
        words = re.findall(r'[a-z0-9]+\.?|[A-Z][a-z0-9_]*\.?', string)
    return '_'.join(map(str.lower, words))
505
+
506
+
507
+ def fullname(avro_schema: dict| str, parent_namespace: str = '') -> str:
508
+ """
509
+ Constructs the full name of the Avro schema.
510
+
511
+ Args:
512
+ avro_schema (dict): The Avro schema.
513
+
514
+ Returns:
515
+ str: The full name of the Avro schema.
516
+ """
517
+ if isinstance(avro_schema, str):
518
+ if not '.' in avro_schema and parent_namespace:
519
+ return parent_namespace + '.' + avro_schema
520
+ return avro_schema
521
+ name = avro_schema.get("name", "")
522
+ namespace = avro_schema.get("namespace", parent_namespace)
523
+ return namespace + "." + name if namespace else name
524
+
525
+
526
def altname(schema_obj: dict, purpose: str):
    """
    Retrieve the alternative name for a given purpose from the schema object.

    Args:
        schema_obj (dict): The schema object (record or field).
        purpose (str): The purpose for the alternative name (e.g., 'sql').

    Returns:
        str: The entry of ``schema_obj["altnames"]`` for ``purpose`` if
            present, otherwise ``schema_obj["name"]``.
    """
    if "altnames" in schema_obj:
        alternates = schema_obj["altnames"]
        if purpose in alternates:
            return alternates[purpose]
    return schema_obj["name"]
541
+
542
+
543
def process_template(file_path: str, **kvargs) -> str:
    """
    Render a Jinja2 template file with the given keyword arguments.

    Templates are looked up relative to the directory containing this module,
    and the ``pascal`` and ``camel`` converters are available as filters.

    Args:
        file_path (str): The template path, relative to this module's directory.
        **kvargs: The variables made available to the template.

    Returns:
        str: The rendered template.
    """
    module_dir = os.path.dirname(__file__)
    environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(searchpath=module_dir))
    environment.filters['pascal'] = pascal
    environment.filters['camel'] = camel
    return environment.get_template(file_path).render(**kvargs)
568
+
569
+
570
def render_template(template: str, output: str, **kvargs):
    """
    Render a template and write it to a file.

    Args:
        template (str): The template to render.
        output (str): The output file path. Missing parent directories are created.
        **kvargs: The keyword arguments to pass to the template.

    Returns:
        None
    """
    out = process_template(template, **kvargs)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output, 'w', encoding='utf-8') as f:
        f.write(out)
587
+
588
+
589
def get_longest_namespace_prefix(schema):
    """
    Get the longest common prefix of the namespaces of all types in the schema.

    The prefix is computed character by character, then leading and trailing
    dots are stripped.

    Args:
        schema: The Avro schema (dict or list) to inspect.

    Returns:
        str: The common prefix, or '' when the schema declares no namespace
            or the namespaces share no prefix.
    """
    namespaces = set(collect_namespaces(schema))
    if not namespaces:
        return ''
    # The common prefix of the lexicographically smallest and largest strings
    # is the common prefix of the whole set. This is independent of set
    # iteration order and also handles one namespace being a prefix of another.
    lowest, highest = min(namespaces), max(namespaces)
    length = 0
    for low_char, high_char in zip(lowest, highest):
        if low_char != high_char:
            break
        length += 1
    # NOTE(review): the prefix is character based, so 'com.abc' and 'com.abd'
    # yield 'com.ab'; confirm whether callers expect whole segments.
    return lowest[:length].strip('.')
603
+
604
+
605
def collect_namespaces(schema: Any, parent_namespace: str = '') -> List[str]:
    """
    Perform a deep search of the schema to collect all namespaces.

    Args:
        schema (Any): The schema node (dict, list, or anything else).
        parent_namespace (str): Namespace inherited by nodes that do not
            declare their own. Defaults to ''.

    Returns:
        List[str]: The effective namespace of every visited dict node, in
            visit order; duplicates are not removed.
    """
    if isinstance(schema, list):
        collected = []
        for member in schema:
            collected += collect_namespaces(member, parent_namespace)
        return collected
    if not isinstance(schema, dict):
        return []

    namespace = str(schema.get('namespace', parent_namespace))
    collected = [namespace] if namespace else []
    fields = schema.get('fields')
    if isinstance(fields, list):
        for field in fields:
            if isinstance(field, dict) and isinstance(field.get('type'), dict):
                collected += collect_namespaces(field['type'], namespace)
            # The field itself is visited too (it may carry items/values).
            collected += collect_namespaces(field, namespace)
    for key in ('items', 'values'):
        if isinstance(schema.get(key), dict):
            collected += collect_namespaces(schema[key], namespace)
    return collected
626
+
627
+
628
def build_flat_type_dict(avro_schema) -> Dict[str, Dict]:
    """
    Build a flat dictionary of all named types in the main schema.

    Args:
        avro_schema: A schema dict, or a list of schema dicts.

    Returns:
        Dict[str, Dict]: Every record, enum and fixed schema that has a name,
            keyed by its namespace-qualified name. The values are the original
            schema objects, not copies.
    """
    named_types = {}

    def register(node, inherited_namespace):
        """Record ``node`` if it is a named type and walk into its members."""
        if isinstance(node, list):
            for member in node:
                register(member, inherited_namespace)
            return
        if not isinstance(node, dict):
            return
        kind = node.get('type')
        type_name = node.get('name')
        effective_namespace = node.get('namespace', inherited_namespace)
        if kind in ['record', 'enum', 'fixed'] and type_name:
            key = f"{effective_namespace}.{type_name}" if effective_namespace else type_name
            named_types[key] = node
        if kind == 'record':
            for field in node.get('fields', []):
                register(field.get('type'), effective_namespace)
        elif kind == 'array':
            register(node.get('items'), effective_namespace)
        elif kind == 'map':
            register(node.get('values'), effective_namespace)

    if isinstance(avro_schema, dict):
        register(avro_schema, avro_schema.get('namespace', ''))
    elif isinstance(avro_schema, list):
        for top_level in avro_schema:
            register(top_level, top_level.get('namespace', ''))
    return named_types
659
+
660
+
661
def evict_tracked_references(avro_schema, parent_namespace, tracker):
    """
    Replace every tracked named type in the schema by its qualified name.

    Records, enums and fixed types whose qualified name is in ``tracker`` are
    replaced by that name; other nodes are kept and updated in place.

    Args:
        avro_schema: The schema node (dict, list, or type name).
        parent_namespace (str): Namespace inherited by nodes without their own.
        tracker: Container of qualified names to evict.

    Returns:
        The node itself, a qualified name replacing it, or a new list when a
        list was passed in.
    """
    if isinstance(avro_schema, list):
        return [evict_tracked_references(member, parent_namespace, tracker)
                for member in avro_schema]
    if not isinstance(avro_schema, dict) or 'type' not in avro_schema:
        return avro_schema

    kind = avro_schema['type']
    if kind == 'record' or kind == 'enum' or kind == 'fixed':
        namespace = avro_schema.get('namespace', parent_namespace)
        qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
        if qualified_name in tracker:
            return qualified_name
        for field in avro_schema.get('fields', ()):
            field['type'] = evict_tracked_references(field['type'], namespace, tracker)
        return avro_schema
    if kind == 'array' and 'items' in avro_schema:
        avro_schema['items'] = evict_tracked_references(
            avro_schema['items'], parent_namespace, tracker)
    elif kind == 'map' and 'values' in avro_schema:
        avro_schema['values'] = evict_tracked_references(
            avro_schema['values'], parent_namespace, tracker)
    return avro_schema
687
+
688
+
689
def inline_avro_references(avro_schema, type_dict, current_namespace, tracker=None, defined_types=None):
    """
    Inline the first reference to each named type in the Avro schema.

    A type name found in ``type_dict`` is replaced by a copy of its definition
    the first time it is met; later occurrences stay as names. Dict nodes are
    updated in place.

    Args:
        avro_schema: The schema node (dict, list, or type name).
        type_dict: Definitions of named types, keyed by qualified name.
        current_namespace (str): Namespace inherited by nodes without their own.
        tracker (set, optional): Qualified names already inlined or visited.
        defined_types (set, optional): Qualified names defined inline in the schema.

    Returns:
        The processed node: the same dict, a new list, an inlined definition,
        or the unchanged name.
    """
    # NOTE(review): block nesting was reconstructed from a view without
    # indentation - confirm against the file before relying on edge cases.
    tracker = set() if tracker is None else tracker
    defined_types = set() if defined_types is None else defined_types

    if isinstance(avro_schema, list):
        return [inline_avro_references(member, type_dict, current_namespace, tracker, defined_types)
                for member in avro_schema]

    if isinstance(avro_schema, dict):
        if 'type' not in avro_schema:
            return avro_schema
        kind = avro_schema['type']

        namespace = qualified_name = None
        if kind in ['record', 'enum', 'fixed']:
            # Register types defined inline so their names are not inlined again.
            namespace = avro_schema.get('namespace', current_namespace)
            qualified_name = (namespace + '.' if namespace else '') + avro_schema['name']
            defined_types.add(qualified_name)

        if kind == 'record' and 'fields' in avro_schema:
            if qualified_name in tracker:
                return qualified_name
            tracker.add(qualified_name)
            for field in avro_schema['fields']:
                field['type'] = inline_avro_references(
                    field['type'], type_dict, namespace, tracker, defined_types)
        elif kind == 'array' and 'items' in avro_schema:
            avro_schema['items'] = inline_avro_references(
                avro_schema['items'], type_dict, current_namespace, tracker, defined_types)
        elif kind == 'map' and 'values' in avro_schema:
            avro_schema['values'] = inline_avro_references(
                avro_schema['values'], type_dict, current_namespace, tracker, defined_types)
        elif kind not in ['enum', 'fixed']:
            # Any other wrapper: its 'type' may itself be a reference.
            avro_schema['type'] = inline_avro_references(
                kind, type_dict, current_namespace, tracker, defined_types)
        return avro_schema

    is_known = avro_schema in type_dict
    if is_known and avro_schema not in tracker and avro_schema not in defined_types:
        # Inline the referenced schema if not already tracked and not defined in the current schema
        inlined = type_dict[avro_schema].copy()
        if isinstance(inlined, dict) and not inlined.get('namespace', None):
            inlined['namespace'] = avro_schema.rpartition('.')[0]
        inlined = inline_avro_references(
            inlined, type_dict, inlined['namespace'], tracker, defined_types)
        tracker.add(avro_schema)
        return inlined

    return avro_schema
743
+
744
def strip_first_doc(schema) -> bool:
    """
    Remove the first "doc" entry found anywhere in the schema (depth-first).

    Args:
        schema: The schema node (dict, list, or anything else); modified in place.

    Returns:
        bool: True when a "doc" entry was removed, False when none was found.
    """
    if isinstance(schema, dict):
        if "doc" in schema:
            del schema["doc"]
            return True
        children = schema.values()
    elif isinstance(schema, list):
        children = schema
    else:
        return False
    # any() stops at the first child that reports a removal.
    return any(strip_first_doc(child) for child in children)
758
+
759
+
760
def is_type_with_alternate(avro_schema: List[Dict[str, Any]]) -> bool:
    """
    Check if the Avro schema union contains a type with a trailing alternate type.
    Alternate types are maps that mimic the structure of the original type, but
    allow for additional fields. Alternate types are labeled with an 'alternateof'
    attribute extension that points to the original type.

    Args:
        avro_schema (List[Dict[str, Any]]): The Avro schema to check. Anything
            that is not a list (a type name, a single schema dict) is never a
            type with an alternate.

    Returns:
        bool: True if the union, ignoring one 'null' member, consists of
            exactly one original type and one alternate type, False otherwise.
    """
    # Guard before copying: type names are strings and have no copy().
    if not isinstance(avro_schema, list):
        return False
    # Work on a copy so the caller's union keeps its 'null' member.
    members = avro_schema.copy()
    if 'null' in members:
        members.remove('null')
    if len(members) != 2:
        return False
    has_original = any(t for t in members if isinstance(t, dict) and 'alternateof' not in t)
    has_alternate = any(t for t in members if isinstance(t, dict) and 'alternateof' in t)
    return has_original and has_alternate
785
+
786
def strip_alternate_type(avro_schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Strip the alternate type from the Avro schema union.

    The first member carrying an 'alternateof' attribute is removed, but only
    when the union also holds a dict member without that attribute.

    Args:
        avro_schema (List[Dict[str, Any]]): The Avro schema union; modified in place.

    Returns:
        List[Dict[str, Any]]: The same list object, without the alternate type.
    """
    dict_members = [member for member in avro_schema if isinstance(member, dict)]
    original = next((m for m in dict_members if 'alternateof' not in m), None)
    alternate = next((m for m in dict_members if 'alternateof' in m), None)
    if original and alternate:
        avro_schema.remove(alternate)
    return avro_schema
801
+
802
+
803
def get_typing_args_from_string(type_str: str) -> List[str]:
    """
    Get the list of generic arguments of a type written as text.

    Args:
        type_str (str): A type expression such as ``Dict[str, List[int]]``.

    Returns:
        List[str]: The top-level arguments with surrounding whitespace
            removed (``['str', 'List[int]']`` for the example above), or an
            empty list when the text has no bracketed argument list.
    """
    # Group 1 is the main type, group 2 everything between its brackets.
    bracketed = re.match(r'([\w\.]+)\[(.+)\]', type_str)
    if bracketed is None:
        return []

    inner = bracketed.group(2)
    pieces = []
    nesting = 0
    start = 0
    for position, char in enumerate(inner):
        if char == ',' and nesting == 0:
            # A comma outside any nested brackets ends the current argument.
            pieces.append(inner[start:position].strip())
            start = position + 1
        elif char == '[':
            nesting += 1
        elif char == ']':
            nesting -= 1
    if start < len(inner):
        pieces.append(inner[start:].strip())
    return pieces