structurize 2.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. avrotize/__init__.py +64 -0
  2. avrotize/__main__.py +6 -0
  3. avrotize/_version.py +34 -0
  4. avrotize/asn1toavro.py +160 -0
  5. avrotize/avrotize.py +152 -0
  6. avrotize/avrotocpp.py +483 -0
  7. avrotize/avrotocsharp.py +1075 -0
  8. avrotize/avrotocsv.py +121 -0
  9. avrotize/avrotodatapackage.py +173 -0
  10. avrotize/avrotodb.py +1383 -0
  11. avrotize/avrotogo.py +476 -0
  12. avrotize/avrotographql.py +197 -0
  13. avrotize/avrotoiceberg.py +210 -0
  14. avrotize/avrotojava.py +2156 -0
  15. avrotize/avrotojs.py +250 -0
  16. avrotize/avrotojsons.py +481 -0
  17. avrotize/avrotojstruct.py +345 -0
  18. avrotize/avrotokusto.py +364 -0
  19. avrotize/avrotomd.py +137 -0
  20. avrotize/avrotools.py +168 -0
  21. avrotize/avrotoparquet.py +208 -0
  22. avrotize/avrotoproto.py +359 -0
  23. avrotize/avrotopython.py +624 -0
  24. avrotize/avrotorust.py +435 -0
  25. avrotize/avrotots.py +598 -0
  26. avrotize/avrotoxsd.py +344 -0
  27. avrotize/cddltostructure.py +1841 -0
  28. avrotize/commands.json +3337 -0
  29. avrotize/common.py +834 -0
  30. avrotize/constants.py +72 -0
  31. avrotize/csvtoavro.py +132 -0
  32. avrotize/datapackagetoavro.py +76 -0
  33. avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
  34. avrotize/dependencies/typescript/node22/package.json +16 -0
  35. avrotize/dependency_resolver.py +348 -0
  36. avrotize/dependency_version.py +432 -0
  37. avrotize/jsonstoavro.py +2167 -0
  38. avrotize/jsonstostructure.py +2642 -0
  39. avrotize/jstructtoavro.py +878 -0
  40. avrotize/kstructtoavro.py +93 -0
  41. avrotize/kustotoavro.py +455 -0
  42. avrotize/parquettoavro.py +157 -0
  43. avrotize/proto2parser.py +498 -0
  44. avrotize/proto3parser.py +403 -0
  45. avrotize/prototoavro.py +382 -0
  46. avrotize/structuretocddl.py +597 -0
  47. avrotize/structuretocpp.py +697 -0
  48. avrotize/structuretocsharp.py +2295 -0
  49. avrotize/structuretocsv.py +365 -0
  50. avrotize/structuretodatapackage.py +659 -0
  51. avrotize/structuretodb.py +1125 -0
  52. avrotize/structuretogo.py +720 -0
  53. avrotize/structuretographql.py +502 -0
  54. avrotize/structuretoiceberg.py +355 -0
  55. avrotize/structuretojava.py +853 -0
  56. avrotize/structuretojsons.py +498 -0
  57. avrotize/structuretokusto.py +639 -0
  58. avrotize/structuretomd.py +322 -0
  59. avrotize/structuretoproto.py +764 -0
  60. avrotize/structuretopython.py +772 -0
  61. avrotize/structuretorust.py +714 -0
  62. avrotize/structuretots.py +653 -0
  63. avrotize/structuretoxsd.py +679 -0
  64. avrotize/xsdtoavro.py +413 -0
  65. structurize-2.19.0.dist-info/METADATA +107 -0
  66. structurize-2.19.0.dist-info/RECORD +70 -0
  67. structurize-2.19.0.dist-info/WHEEL +5 -0
  68. structurize-2.19.0.dist-info/entry_points.txt +2 -0
  69. structurize-2.19.0.dist-info/licenses/LICENSE +201 -0
  70. structurize-2.19.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2167 @@
1
+ """ JSON to Avro schema converter. """
2
+
3
+ # pylint: disable=too-many-lines, line-too-long, too-many-branches, too-many-statements, too-many-locals, too-many-nested-blocks, too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-boolean-expressions
4
+
5
+ import json
6
+ import os
7
+ import copy
8
+ import urllib
9
+ from urllib.parse import ParseResult, urlparse, unquote
10
+ from typing import Any, Dict, List, Tuple
11
+ import jsonpointer
12
+ from jsonpointer import JsonPointerException
13
+ import requests
14
+
15
+ from avrotize.common import avro_name, avro_namespace, find_schema_node, generic_type, set_schema_node
16
+ from avrotize.dependency_resolver import inline_dependencies_of, sort_messages_by_dependencies
17
+
18
# Avro primitive type names; any other type string is treated as a reference
# to a named type elsewhere in the schema.
primitive_types = ['null', 'string', 'int',
                   'long', 'float', 'double', 'boolean', 'bytes']
20
+
21
+
22
+ class JsonToAvroConverter:
23
+ """
24
+ Converts JSON schema to Avro schema.
25
+
26
+ Attributes:
27
+ imported_types: A dictionary of imported type schemas.
28
+ root_namespace: The namespace for the root schema.
29
+ max_recursion_depth: The maximum recursion depth.
30
+ types_with_unmerged_types: A list of types with unmerged types.
31
+ content_cache: A dictionary for caching fetched URLs.
32
+ utility_namespace: The namespace for utility types.
33
+ maximize_compatiblity: A flag to maximize compatibility.
34
+
35
+ """
36
+
37
def __init__(self) -> None:
    """Initialize converter state with library defaults."""
    # Cache of type schemas imported from external schema documents.
    self.imported_types: Dict[Any, Any] = {}
    # Namespace assigned to types generated from the root schema.
    self.root_namespace: str = 'example.com'
    # Guard against runaway recursion on deeply nested / self-referencing schemas.
    self.max_recursion_depth: int = 40
    # Types whose unions could not be merged into a single schema.
    self.types_with_unmerged_types: List[dict] = []
    # Cache of fetched document contents, keyed by URL.
    self.content_cache: Dict[str, str] = {}
    # Namespace under which generated utility types are placed.
    self.utility_namespace: str = 'utility.vasters.com'
    # NOTE(review): presumably controls emitting top-level records as separate
    # schemas — confirm against the conversion entry points.
    self.split_top_level_records: bool = False
    # Fallback class name used for the root type of a schema document.
    self.root_class_name: str = 'document'
46
+
47
def is_empty_type(self, avro_type):
    """
    Check if the Avro type carries no usable content.

    Parameters:
        avro_type (any): The Avro type to check (string, union list, or dict).

    Returns:
        bool: True if the Avro type is empty, False otherwise.
    """
    if not len(avro_type):
        return True
    if isinstance(avro_type, list):
        # A union is empty only when every branch is empty.
        return all(self.is_empty_type(branch) for branch in avro_type)
    if not isinstance(avro_type, dict):
        return False
    if 'type' not in avro_type:
        return True
    kind = avro_type['type']
    # A structured type without its payload attribute is considered empty.
    if kind == 'record':
        return not avro_type.get('fields')
    if kind == 'enum':
        return not avro_type.get('symbols')
    if kind == 'array':
        return not avro_type.get('items')
    if kind == 'map':
        return not avro_type.get('values')
    return False
70
+
71
def is_empty_json_type(self, json_type):
    """
    Check if the JSON type carries no usable content.

    Parameters:
        json_type (any): The JSON type to check (string, list, or dict).

    Returns:
        bool: True if the JSON type is empty, False otherwise.
    """
    if not len(json_type):
        return True
    if isinstance(json_type, list):
        # A union is empty only when every member is empty.
        return all(self.is_empty_json_type(member) for member in json_type)
    if isinstance(json_type, dict):
        # A schema object without a 'type' attribute is treated as empty.
        return 'type' not in json_type
    return False
89
+
90
def detect_discriminated_union(self, json_type: dict):
    """
    Detect a discriminated-union pattern built from allOf with if/then conditionals.

    The pattern consists of a base schema holding a discriminator property
    (named 'type') with an enum, plus an allOf array whose entries add fields
    conditionally via if/then.

    Parameters:
        json_type (dict): The JSON schema object to check.

    Returns:
        list | None: The discriminator enum values if the pattern is present,
        None otherwise.
    """
    if not (isinstance(json_type, dict) and 'allOf' in json_type):
        return None

    # The discriminator must be a 'type' property carrying an enum.
    props = json_type.get('properties', {})
    if 'type' not in props:
        return None
    discriminator = props['type']
    if 'enum' not in discriminator:
        return None
    values = discriminator['enum']

    # At least one allOf entry must be an if/then conditional.
    conditional = any(
        'if' in entry and 'then' in entry
        for entry in json_type['allOf']
        if isinstance(entry, dict)
    )
    return values if conditional and len(values) > 0 else None
124
+
125
def handle_inline_conditional_schema(self, json_type: dict) -> Tuple[bool, dict]:
    """
    Convert inline if/then/else conditional schemas into convertible structures.

    Two patterns are recognized:
    1. Type-based conditional — if {properties: {type: {enum: [X]}}} — rewritten
       to a oneOf of discriminated variants.
    2. Field-presence conditional — if {properties: {...}, required: [...]} —
       merged into one comprehensive type (Avro models optional fields natively).

    Parameters:
        json_type (dict): The JSON schema object to process.

    Returns:
        Tuple[bool, dict]: (was_handled, possibly rewritten schema).
    """
    if not isinstance(json_type, dict) or 'if' not in json_type:
        return (False, json_type)

    if_clause = json_type.get('if', {})
    then_clause = json_type.get('then', {})
    else_clause = json_type.get('else', None)

    if isinstance(if_clause, dict) and 'properties' in if_clause:
        props = if_clause['properties']
        # Pattern 1: the condition keys on a 'type' discriminator enum.
        if 'type' in props:
            discriminator = props['type']
            if isinstance(discriminator, dict) and 'enum' in discriminator:
                return self._convert_type_conditional_to_oneof(
                    json_type, if_clause, then_clause, else_clause)
        # Pattern 2: the condition keys on the presence of a required field.
        if 'required' in if_clause:
            return self._merge_conditional_branches(json_type, then_clause, else_clause)

    # Any other conditional shape is left untouched.
    return (False, json_type)
169
+
170
+ def _convert_type_conditional_to_oneof(self, json_type: dict, if_clause: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
171
+ """
172
+ Convert a type-based conditional schema to oneOf structure.
173
+
174
+ Example:
175
+ Input: {type: object, properties: {type: {enum: [image, host]}}, if: {...}, then: {...}, else: {...}}
176
+ Output: {oneOf: [then_merged_with_base, else_merged_with_base]}
177
+ """
178
+ # Create a base type without the conditional parts
179
+ base_type = {}
180
+ for key, value in json_type.items():
181
+ if key not in ('if', 'then', 'else'):
182
+ base_type[key] = copy.deepcopy(value)
183
+
184
+ oneof_variants = []
185
+
186
+ # Process then clause
187
+ if then_clause:
188
+ then_variant = self._merge_conditional_branch(base_type, then_clause)
189
+ oneof_variants.append(then_variant)
190
+
191
+ # Process else clause (which may contain nested if/then/else)
192
+ if else_clause:
193
+ if 'if' in else_clause:
194
+ # Recursive handling of nested conditional
195
+ handled, processed_else = self.handle_inline_conditional_schema(else_clause)
196
+ if handled and 'oneOf' in processed_else:
197
+ # Flatten nested oneOf
198
+ for variant in processed_else['oneOf']:
199
+ merged = self._merge_conditional_branch(base_type, variant)
200
+ oneof_variants.append(merged)
201
+ else:
202
+ else_variant = self._merge_conditional_branch(base_type, else_clause)
203
+ oneof_variants.append(else_variant)
204
+ else:
205
+ else_variant = self._merge_conditional_branch(base_type, else_clause)
206
+ oneof_variants.append(else_variant)
207
+
208
+ if len(oneof_variants) > 0:
209
+ result = copy.deepcopy(base_type)
210
+ # Remove properties since they'll be in the variants
211
+ if 'properties' in result:
212
+ del result['properties']
213
+ if 'additionalProperties' in result:
214
+ del result['additionalProperties']
215
+ if 'required' in result:
216
+ del result['required']
217
+ result['oneOf'] = oneof_variants
218
+ return (True, result)
219
+
220
+ return (False, json_type)
221
+
222
+ def _merge_conditional_branches(self, json_type: dict, then_clause: dict, else_clause: dict | None) -> Tuple[bool, dict]:
223
+ """
224
+ Merge conditional branches for field presence patterns.
225
+ Avro handles optional fields naturally, so we can merge all properties.
226
+ """
227
+ result = {}
228
+ for key, value in json_type.items():
229
+ if key not in ('if', 'then', 'else'):
230
+ result[key] = copy.deepcopy(value)
231
+
232
+ # Merge properties from then clause
233
+ if then_clause and 'properties' in then_clause:
234
+ if 'properties' not in result:
235
+ result['properties'] = {}
236
+ for prop_name, prop_def in then_clause['properties'].items():
237
+ if prop_name not in result['properties']:
238
+ result['properties'][prop_name] = copy.deepcopy(prop_def)
239
+
240
+ # Merge properties from else clause
241
+ if else_clause and 'properties' in else_clause:
242
+ if 'properties' not in result:
243
+ result['properties'] = {}
244
+ for prop_name, prop_def in else_clause['properties'].items():
245
+ if prop_name not in result['properties']:
246
+ result['properties'][prop_name] = copy.deepcopy(prop_def)
247
+
248
+ return (True, result)
249
+
250
+ def _merge_conditional_branch(self, base: dict, branch: dict) -> dict:
251
+ """Merge a conditional branch with the base type."""
252
+ result = copy.deepcopy(base)
253
+
254
+ if not branch:
255
+ return result
256
+
257
+ # Merge properties
258
+ if 'properties' in branch:
259
+ if 'properties' not in result:
260
+ result['properties'] = {}
261
+ for prop_name, prop_def in branch['properties'].items():
262
+ result['properties'][prop_name] = copy.deepcopy(prop_def)
263
+
264
+ # Merge additionalProperties
265
+ if 'additionalProperties' in branch:
266
+ result['additionalProperties'] = branch['additionalProperties']
267
+
268
+ # Merge required (union of required fields)
269
+ if 'required' in branch:
270
+ if 'required' not in result:
271
+ result['required'] = []
272
+ for req in branch['required']:
273
+ if req not in result['required']:
274
+ result['required'].append(req)
275
+
276
+ return result
277
+
278
def flatten_union(self, type_list: list) -> list:
    """
    Flatten a (possibly nested) union type list into a single de-duplicated list.

    Args:
        type_list (list): The list of types in a union.

    Returns:
        list: The flattened list of types.
    """
    # Pass 1: flatten nested unions and drop duplicates, preserving order.
    deduped: list = []
    for entry in type_list:
        if isinstance(entry, list):
            for nested in self.flatten_union(entry):
                if nested not in deduped:
                    deduped.append(nested)
        elif entry not in deduped:
            deduped.append(entry)

    # Pass 2: consolidate repeated array (and map) schema instances.
    # NOTE(review): only the FIRST array/map instance is kept in the output —
    # the merged schema accumulated here is never written back into the list,
    # so later duplicates are effectively dropped; confirm whether intended.
    merged_array = None
    merged_map = None
    consolidated: list = []
    for entry in deduped:
        entry_is_dict = isinstance(entry, dict)
        if entry_is_dict and entry.get('type') == 'array' and 'items' in entry:
            if not merged_array:
                merged_array = entry
                consolidated.append(entry)
            else:
                merged_array = self.merge_avro_schemas([merged_array, entry], [])
        elif entry_is_dict and entry.get('type') == 'map' and 'values' in entry:
            if not merged_map:
                merged_map = entry
                consolidated.append(entry)
            else:
                merged_map = self.merge_avro_schemas([merged_map, entry], [])
        elif entry not in consolidated:
            consolidated.append(entry)
    return consolidated
318
+
319
+ # pylint: disable=dangerous-default-value
320
+ def merge_avro_schemas(self, schemas: list, avro_schemas: list, type_name: str | None = None, deps: List[str] = []) -> str | list | dict:
321
+ """Merge multiple Avro type schemas into one."""
322
+
323
+ def split_merge(schema1, schema2, schema_list, offset):
324
+ """ return the continuing schema merges of incompatible schemas """
325
+ remaining_schemas = schema_list[offset +
326
+ 1:] if len(schema_list) > offset else []
327
+ if isinstance(schema2, dict) and 'dependencies' in schema2:
328
+ deps.extend(schema2['dependencies'])
329
+ del schema2['dependencies']
330
+ if isinstance(schema1, dict) and 'dependencies' in schema1:
331
+ deps.extend(schema1['dependencies'])
332
+ del schema1['dependencies']
333
+ schema1_merged = self.merge_avro_schemas(
334
+ [schema2] + remaining_schemas, avro_schemas, type_name, deps)
335
+ schema2_merged = self.merge_avro_schemas(
336
+ [schema1] + remaining_schemas, avro_schemas, type_name, deps)
337
+ if not self.is_empty_type(schema1_merged) and not self.is_empty_type(schema2_merged):
338
+ return self.flatten_union([schema1_merged, schema2_merged])
339
+ else:
340
+ if not self.is_empty_type(schema1_merged):
341
+ return schema1_merged
342
+ if not self.is_empty_type(schema2_merged):
343
+ return schema2_merged
344
+ # if both are empty, we'll return an empty record
345
+ return {'type': 'record', 'fields': []}
346
+
347
+ merged_schema: dict = {}
348
+ if len(schemas) == 1:
349
+ return schemas[0]
350
+ if type_name:
351
+ self.set_avro_type_value(merged_schema, 'name', type_name)
352
+ for i, schema in enumerate(schemas):
353
+ schema = copy.deepcopy(schema)
354
+ if isinstance(schema, dict) and 'dependencies' in schema:
355
+ deps1: List[str] = merged_schema.get('dependencies', [])
356
+ deps1.extend(schema['dependencies'])
357
+ merged_schema['dependencies'] = deps1
358
+ if (isinstance(schema, list) or isinstance(schema, dict)) and len(schema) == 0:
359
+ continue
360
+ if isinstance(schema, str):
361
+ sch = next(
362
+ (s for s in avro_schemas if s.get('name') == schema), None)
363
+ if sch:
364
+ merged_schema.update(sch)
365
+ else:
366
+ merged_schema['type'] = schema
367
+ elif isinstance(schema, list):
368
+ # the incoming schema is a list, so it's a union
369
+ if 'type' not in merged_schema:
370
+ merged_schema['type'] = schema
371
+ else:
372
+ if isinstance(merged_schema['type'], list):
373
+ merged_schema['type'].extend(schema)
374
+ else:
375
+ if isinstance(merged_schema['type'], str):
376
+ if merged_schema['type'] == 'record' or merged_schema['type'] == 'enum' or merged_schema['type'] == 'fixed' \
377
+ or merged_schema['type'] == 'map' or merged_schema['type'] == 'array':
378
+ return split_merge(merged_schema, schema, schemas, i)
379
+ else:
380
+ merged_schema['type'] = [merged_schema['type']]
381
+ else:
382
+ merged_schema['type'].extend(schema)
383
+ elif schema and ('type' not in schema or 'type' not in merged_schema):
384
+ merged_schema.update(schema)
385
+ elif schema:
386
+ if 'type' in merged_schema and schema['type'] != merged_schema['type']:
387
+ return split_merge(merged_schema, schema, schemas, i)
388
+ if not type_name:
389
+ self.set_avro_type_value(merged_schema, 'name', avro_name(
390
+ merged_schema.get('name', '') + schema.get('name', '')))
391
+ if 'fields' in schema:
392
+ if 'fields' in merged_schema:
393
+ for field in schema['fields']:
394
+ if field not in merged_schema['fields']:
395
+ merged_schema['fields'].append(field)
396
+ else:
397
+ merged_schema_field = next(
398
+ f for f in merged_schema['fields'] if f.get('name') == field.get('name'))
399
+ if merged_schema_field['type'] != field['type']:
400
+ merged_schema_field['type'] = [
401
+ field['type'], merged_schema_field['type']]
402
+ if 'doc' in field and 'doc' not in merged_schema_field:
403
+ merged_schema_field['doc'] = field['doc']
404
+ else:
405
+ merged_schema['fields'] = schema['fields']
406
+ if self.is_avro_complex_type(merged_schema) and 'namespace' in merged_schema:
407
+ if merged_schema['type'] in ['array', 'map']:
408
+ del merged_schema['namespace']
409
+ return merged_schema
410
+
411
def merge_json_schemas(self, json_schemas: list[dict], intersect: bool = False) -> dict:
    """
    Merge multiple JSON schemas into one.

    Args:
        json_schemas (list[dict]): A list of JSON schemas to be merged.
        intersect (bool, optional): If True, only keep the intersection of the required fields. Defaults to False.

    Returns:
        dict: The merged JSON schema.
    """

    def merge_structures(schema1: dict, schema2: dict) -> dict | list:
        """ merge two JSON dicts recursively """
        # Different 'type' values cannot be unified structurally: return both.
        if 'type' in schema1 and 'type' in schema2 and schema1['type'] != schema2['type']:
            return [schema1, schema2]
        # Copy so the caller's first schema is never mutated.
        schema1 = copy.deepcopy(schema1)
        for key in schema2:
            if key not in schema1:
                schema1[key] = schema2[key]
            elif isinstance(schema1[key], dict) and isinstance(schema2[key], dict):
                schema1[key] = merge_structures(schema1[key], schema2[key])
            elif isinstance(schema1[key], list) and isinstance(schema2[key], list):
                schema1[key].extend(schema2[key])
            elif schema1[key] == schema2[key]:
                # Identical values need no merging.
                continue
            else:
                # Conflicting scalar values become a list of alternatives.
                if isinstance(schema1[key], list):
                    if schema2[key] not in schema1[key]:
                        schema1[key].append(schema2[key])
                else:
                    schema1[key] = [schema1[key], schema2[key]]
        return schema1

    merged_type: dict = {}

    for json_schema in json_schemas:
        if 'type' not in json_schema or 'type' not in merged_type:
            # Untyped schema (or first typed one): merge attribute by attribute.
            for key in json_schema:
                if not key in merged_type:
                    merged_type[key] = copy.deepcopy(json_schema[key])
                else:
                    if key == 'required':
                        # Union of required field names.
                        merged_type[key] = list(
                            set(merged_type[key]).union(set(json_schema[key])))
                    if key == 'name' or key == 'title' or key == 'description':
                        # Naming attributes are concatenated.
                        merged_type[key] = merged_type[key] + \
                            json_schema[key]
                    elif isinstance(merged_type[key], dict):
                        merged_type[key] = merge_structures(
                            merged_type[key], copy.deepcopy(json_schema[key]))
                    elif isinstance(merged_type[key], list) and isinstance(json_schema[key], list):
                        # De-duplicating list union.
                        for item in json_schema[key]:
                            if item not in merged_type[key]:
                                merged_type[key].append(item)
                    else:
                        if merged_type[key] is None:
                            merged_type[key] = json_schema[key]
                        else:
                            # Conflicting scalars become a list of alternatives.
                            merged_type[key] = [merged_type[key],
                                                copy.deepcopy(json_schema[key])]
        else:
            # Both sides are typed: merge the well-known attributes explicitly.
            if 'type' in merged_type and json_schema['type'] != merged_type['type']:
                # Diverging types turn 'type' into a union list.
                if isinstance(merged_type['type'], str):
                    merged_type['type'] = [merged_type['type']]
                merged_type['type'].append(json_schema['type'])
            if 'required' in json_schema:
                if 'required' in merged_type:
                    merged_type['required'] = list(
                        set(merged_type['required']).union(set(json_schema['required'])))
                else:
                    merged_type['required'] = json_schema['required']
            if 'name' in json_schema:
                if 'name' in merged_type:
                    merged_type['name'] = merged_type.get(
                        'name', '') + json_schema['name']
                else:
                    merged_type['name'] = json_schema['name']
            if 'properties' in json_schema:
                if 'properties' in merged_type:
                    for prop in json_schema['properties']:
                        if prop in merged_type['properties']:
                            merged_type['properties'][prop] = merge_structures(
                                merged_type['properties'][prop], copy.deepcopy(json_schema['properties'][prop]))
                        else:
                            merged_type['properties'][prop] = json_schema['properties'][prop]
                else:
                    merged_type['properties'] = json_schema['properties']
            if 'enum' in json_schema:
                if 'enum' in merged_type:
                    merged_type['enum'] = list(
                        set(merged_type['enum']).union(set(json_schema['enum'])))
                else:
                    merged_type['enum'] = json_schema['enum']
            if 'format' in json_schema:
                if 'format' in merged_type:
                    # NOTE(review): formats are string-concatenated, producing
                    # e.g. 'dated-time' from 'date'+'d-time' style inputs —
                    # confirm this is intended rather than a list union.
                    merged_type['format'] = merged_type['format'] + \
                        json_schema['format']
                else:
                    merged_type['format'] = json_schema['format']

    if intersect:
        # only keep the intersection of the required fields
        if 'required' in merged_type:
            new_required = merged_type['required']
            for json_schema in json_schemas:
                new_required = list(set(new_required).intersection(
                    set(json_schema.get('required', []))))
            merged_type['required'] = new_required

    return merged_type
522
+
523
+ def ensure_type(self, type: dict | str | list) -> dict | str | list:
524
+ """
525
+ Ensures that the given type is valid by adding a 'type' field if it is missing.
526
+
527
+ Args:
528
+ type (dict | str | list): The type to ensure.
529
+
530
+ Returns:
531
+ dict | str | list: The ensured type.
532
+ """
533
+ if isinstance(type, str) or isinstance(type, list) or 'type' in type:
534
+ return type
535
+
536
+ type['type'] = generic_type()
537
+ return type
538
+
539
+ def json_schema_primitive_to_avro_type(self, json_primitive: str | list, format: str | None, enum: list | None, record_name: str, field_name: str, namespace: str, dependencies: list) -> str | dict[str, Any] | list:
540
+ """
541
+ Convert a JSON-schema primitive type to Avro primitive type.
542
+
543
+ Args:
544
+ json_primitive (str | list): The JSON-schema primitive type to be converted.
545
+ format (str | None): The format of the JSON primitive type, if applicable.
546
+ enum (list | None): The list of enum values, if applicable.
547
+ record_name (str): The name of the record.
548
+ field_name (str): The name of the field.
549
+ namespace (str): The namespace of the Avro type.
550
+ dependencies (list): The list of dependencies.
551
+
552
+ Returns:
553
+ str | dict[str,Any] | list: The converted Avro primitive type.
554
+
555
+ """
556
+ if isinstance(json_primitive, list):
557
+ if enum:
558
+ # Handle mixed-type enums properly using the dedicated helper
559
+ return self.create_enum_for_mixed_types(
560
+ field_name + '_1',
561
+ self.compose_namespace(namespace, record_name + '_types'),
562
+ enum,
563
+ json_primitive
564
+ )
565
+ else:
566
+ union = []
567
+ for item in json_primitive:
568
+ enum2 = item.get('enum') if isinstance(
569
+ item, dict) else None
570
+ format2 = item.get('format') if isinstance(
571
+ item, dict) else None
572
+ avro_primitive = self.json_schema_primitive_to_avro_type(
573
+ item, format2, enum2, record_name, field_name, self.compose_namespace(namespace, record_name, field_name), dependencies)
574
+ union.append(avro_primitive)
575
+ return union
576
+
577
+ if json_primitive == 'string':
578
+ avro_primitive = 'string'
579
+ elif json_primitive == 'integer':
580
+ avro_primitive = 'int'
581
+ if format == 'int64':
582
+ avro_primitive = 'long'
583
+ elif json_primitive == 'number':
584
+ avro_primitive = 'float'
585
+ elif json_primitive == 'boolean':
586
+ avro_primitive = 'boolean'
587
+ elif not format:
588
+ if isinstance(json_primitive, str):
589
+ dependencies.append(json_primitive)
590
+ avro_primitive = json_primitive
591
+
592
+ # if you've got { 'type': 'string', 'format': ['date-time', 'duration'] }, I'm sorry
593
+ if format and isinstance(format, str):
594
+ if format in ('date-time', 'date'):
595
+ avro_primitive = {'type': 'int', 'logicalType': 'date'}
596
+ elif format in ('time'):
597
+ avro_primitive = {'type': 'int', 'logicalType': 'time-millis'}
598
+ elif format in ('duration'):
599
+ avro_primitive = {'type': 'fixed',
600
+ 'size': 12, 'logicalType': 'duration'}
601
+ elif format in ('uuid'):
602
+ avro_primitive = {'type': 'string', 'logicalType': 'uuid'}
603
+
604
+ return avro_primitive
605
+
606
+ def fetch_content(self, url: str | ParseResult):
607
+ """
608
+ Fetches the content from the specified URL.
609
+
610
+ Args:
611
+ url (str or ParseResult): The URL to fetch the content from.
612
+
613
+ Returns:
614
+ str: The fetched content.
615
+
616
+ Raises:
617
+ requests.RequestException: If there is an error while making the HTTP request.
618
+ Exception: If there is an error while reading the file.
619
+
620
+ """
621
+ # Parse the URL to determine the scheme
622
+ if isinstance(url, str):
623
+ parsed_url = urlparse(url)
624
+ else:
625
+ parsed_url = url
626
+
627
+ if parsed_url.geturl() in self.content_cache:
628
+ return self.content_cache[parsed_url.geturl()]
629
+ scheme = parsed_url.scheme
630
+
631
+ # Handle HTTP and HTTPS URLs
632
+ if scheme in ['http', 'https']:
633
+ response = requests.get(url if isinstance(
634
+ url, str) else parsed_url.geturl(), timeout=30)
635
+ # Raises an HTTPError if the response status code is 4XX/5XX
636
+ response.raise_for_status()
637
+ self.content_cache[parsed_url.geturl()] = response.text
638
+ return response.text
639
+
640
+ # Handle file URLs
641
+ elif scheme == 'file':
642
+ # Remove the leading 'file://' from the path for compatibility
643
+ file_path = parsed_url.netloc
644
+ if not file_path:
645
+ file_path = parsed_url.path
646
+ # On Windows, a file URL might start with a '/' but it's not part of the actual path
647
+ if os.name == 'nt' and file_path.startswith('/'):
648
+ file_path = file_path[1:]
649
+ with open(file_path, 'r', encoding='utf-8') as file:
650
+ text = file.read()
651
+ self.content_cache[parsed_url.geturl()] = text
652
+ return text
653
+ else:
654
+ raise NotImplementedError(f'Unsupported URL scheme: {scheme}')
655
+
656
def resolve_reference(self, json_type: dict, base_uri: str, json_doc: dict) -> Tuple[dict, dict]:
    """
    Resolve a JSON Pointer reference or a JSON $ref reference.

    Args:
        json_type (dict): The JSON type containing the reference ('$ref').
        base_uri (str): The base URI of the JSON document.
        json_doc (dict): The JSON document containing the reference.

    Returns:
        Tuple[dict, dict]: A tuple containing the resolved JSON schema and the
        document it was resolved from; falls back to (json_type, json_doc)
        when the reference cannot be resolved.

    Raises:
        Exception: If there is an error decoding JSON from the reference.
        Exception: If there is an error resolving the JSON Pointer reference.
    """
    try:
        ref = json_type['$ref']
        content = None
        url = urlparse(ref)
        # External reference: fetch the target document (absolute or relative).
        if url.scheme:
            content = self.fetch_content(ref)
        elif url.path:
            file_uri = self.compose_uri(base_uri, url)
            content = self.fetch_content(file_uri)
        if content:
            try:
                json_schema_doc = json_schema = json.loads(content)
                # resolve the JSON Pointer reference, if any
                if url.fragment:
                    json_schema = jsonpointer.resolve_pointer(
                        json_schema, url.fragment)
                return json_schema, json_schema_doc
            except json.JSONDecodeError:
                # NOTE(review): raised without `from` — the decode cause is lost.
                raise Exception(f'Error decoding JSON from {ref}')

        # Fragment-only reference: resolve within the current document.
        if url.fragment:
            json_pointer = unquote(url.fragment)
            ref_schema = jsonpointer.resolve_pointer(
                json_doc, json_pointer)
            if ref_schema:
                return ref_schema, json_doc
    except JsonPointerException as e:
        # NOTE(review): `e` is unused and the chain is dropped (no `from e`);
        # the original pointer error detail is lost to callers.
        raise Exception(
            f'Error resolving JSON Pointer reference for {base_uri}')
    # Unresolvable reference: hand the caller back what it passed in.
    return json_type, json_doc
703
+
704
def compose_uri(self, base_uri, url):
    """
    Resolve *url* (possibly relative) against *base_uri*.

    Args:
        base_uri (str): The URI of the referencing document.
        url (str | ParseResult): The reference target.

    Returns:
        str: An absolute URI for the target.
    """
    if isinstance(url, str):
        url = urlparse(url)
    if url.scheme:
        # Already absolute: use it verbatim.
        return url.geturl()
    if not url.path and not url.netloc:
        # Fragment-only reference: stays within the base document.
        return base_uri
    if base_uri.startswith('file'):
        parsed_file_uri = urlparse(base_uri)
        # Some file URLs carry the path in netloc; fall back to path otherwise.
        base_dir = os.path.dirname(
            parsed_file_uri.netloc if parsed_file_uri.netloc else parsed_file_uri.path)
        filename = os.path.join(base_dir, url.path)
        # Fixed: the f-string had no placeholder, so the computed filename was
        # discarded and a literal URI was returned for every file reference.
        file_uri = f'file://{filename}'
    else:
        # combine the base URI with the URL
        file_uri = urllib.parse.urljoin(base_uri, url.geturl())
    return file_uri
721
+
722
def get_field_type_name(self, field: dict) -> str:
    """Return a human-readable name for a field's type ('union' when opaque)."""
    field_type = field['type']
    if isinstance(field_type, str):
        return field_type
    if isinstance(field_type, list):
        # Union: join the member type names with ', '.
        parts = []
        for member in field_type:
            if isinstance(member, str):
                parts.append(member)
            elif isinstance(member, dict):
                parts.append(self.get_field_type_name(member))
            else:
                parts.append('union')
        return ', '.join(parts)
    if isinstance(field_type, dict) and 'type' in field_type:
        return field_type['type']
    return 'union'
738
+
739
    def json_type_to_avro_type(self, json_type: str | dict, record_name: str, field_name: str, namespace: str, dependencies: list, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth=1) -> dict | list | str:
        """Convert a JSON type to Avro type.

        The node is processed in stages, each of which may rewrite
        ``json_type``/``avro_type`` in place and/or return early:

        1. recursion guard (returns a generic type at max depth),
        2. ``type`` given as a list (collapse to one entry, or rewrite to
           an injected ``oneOf``),
        3. conditional keywords (``if``/``then``/``else``,
           ``dependentSchemas``, ``dependentRequired``),
        4. ``allOf``/``oneOf``/``anyOf`` composition, including the
           discriminated-union pattern detected by
           ``detect_discriminated_union``,
        5. ``$ref`` resolution (with caching in ``self.imported_types``),
        6. ``const``/``enum`` handling,
        7. arrays, objects, and primitives.

        Args:
            json_type: the JSON schema node (dict) or a primitive type name (str).
            record_name: name of the record owning the field being converted.
            field_name: name of the field being converted (may be empty).
            namespace: Avro namespace for types generated here.
            dependencies: out-parameter; qualified names of types this type
                references are appended to it.
            json_schema: the document containing ``json_type`` (for ``$ref``).
            base_uri: base URI of ``json_schema`` (for ``$ref``).
            avro_schema: top-level Avro type list; standalone records/enums
                created along the way are registered into it.
            record_stack: names of records currently being converted.
            recursion_depth: internal guard counter.

        Returns:
            An Avro type: a dict (single type), a list (union), or a str
            (primitive name or reference to a registered type).

        Raises:
            RecursionError: re-raised after logging context.
            ValueError: if a standalone subtype turns out not to be a
                record/enum/fixed type.
        """

        try:
            # guard against pathological/cyclic schemas
            if recursion_depth >= self.max_recursion_depth:
                print(
                    f'WARNING: Maximum recursion depth reached for {record_name} at field {field_name}')
                return generic_type()

            avro_type: list | dict | str = {}
            local_name = avro_name(field_name if field_name else record_name)
            # remember anyOf before composition rewrites json_type below
            hasAnyOf = isinstance(json_type, dict) and 'anyOf' in json_type

            if isinstance(json_type, dict):

                json_object_type = json_type.get('type')
                # Check if the type is already an Avro schema (e.g., shared discriminator enum)
                # This happens when a discriminated union property was pre-set with an Avro type
                if isinstance(json_object_type, dict) and 'type' in json_object_type and json_object_type.get('type') in ['enum', 'record', 'fixed', 'array', 'map']:
                    return self.post_check_avro_type(dependencies, json_object_type)
                if isinstance(json_object_type, list):
                    # if the 'type' is a list, we map it back to a string
                    # if the list has only one item or if the list has two items
                    # and one of them is 'null'
                    # otherwise, we will construct and inject a oneOf type
                    # and split the type

                    # Special case: if we have a mixed-type enum (e.g., type: ["string", "integer"] with enum),
                    # handle it directly here to avoid duplicate processing
                    if 'enum' in json_type and any(t in json_object_type for t in ['string', 'integer', 'int']):
                        # NOTE(review): has_null is computed but not used below - verify intent
                        has_null = 'null' in json_object_type
                        avro_type = self.create_enum_for_mixed_types(
                            local_name + '_1',
                            self.compose_namespace(namespace, record_name + '_types'),
                            json_type['enum'],
                            json_object_type
                        )
                        if 'description' in json_type and isinstance(avro_type, dict):
                            avro_type['doc'] = json_type['description']
                        elif 'description' in json_type and isinstance(avro_type, list):
                            # For unions, we can't set doc directly - it will be set on the field
                            pass
                        return self.post_check_avro_type(dependencies, avro_type)

                    if len(json_object_type) == 1:
                        json_object_type = json_object_type[0]
                    elif len(json_object_type) == 2 and 'null' in json_object_type:
                        # nullable pair: keep the non-null member
                        if json_object_type[0] == 'null':
                            json_object_type = json_object_type[1]
                        else:
                            json_object_type = json_object_type[0]
                    else:
                        # more than two members: rewrite into an injected oneOf
                        oneof = []
                        for option in json_object_type:
                            if not option == 'null':
                                oneof.append({
                                    'type': option
                                })
                        if len(oneof) > 0:
                            del json_type['type']
                            json_type['oneOf'] = oneof

                # --- conditional keywords (if/then/else, dependentSchemas/Required) ---
                if 'if' in json_type or 'then' in json_type or 'else' in json_type or 'dependentSchemas' in json_type or 'dependentRequired' in json_type:
                    # Try to handle the conditional schema pattern
                    conditional_handled = False
                    if 'if' in json_type:
                        conditional_handled, json_type = self.handle_inline_conditional_schema(json_type)

                    if not conditional_handled:
                        # Only warn for patterns we can't handle
                        remaining_conditionals = []
                        if 'if' in json_type:
                            remaining_conditionals.append('if/then/else')
                        if 'dependentSchemas' in json_type:
                            remaining_conditionals.append('dependentSchemas')
                        if 'dependentRequired' in json_type:
                            remaining_conditionals.append('dependentRequired')

                        if remaining_conditionals:
                            print(
                                f'WARNING: Conditional schema pattern ({", ".join(remaining_conditionals)}) is not fully supported and will be simplified.')

                        # strip the unsupported keywords and continue with the rest
                        if 'if' in json_type:
                            del json_type['if']
                        if 'then' in json_type:
                            del json_type['then']
                        if 'else' in json_type:
                            del json_type['else']
                        if 'dependentSchemas' in json_type:
                            del json_type['dependentSchemas']
                        if 'dependentRequired' in json_type:
                            del json_type['dependentRequired']

                # base_type is json_type minus its composition keywords; it seeds the merges below
                base_type = json_type.copy()
                if 'oneOf' in base_type:
                    del base_type['oneOf']
                if 'anyOf' in base_type:
                    del base_type['anyOf']
                if 'allOf' in base_type:
                    del base_type['allOf']
                json_types = []

                # --- allOf composition ---
                if 'allOf' in json_type:
                    # Check if this is a discriminated union pattern
                    discriminated_union_types = self.detect_discriminated_union(json_type)

                    if discriminated_union_types:
                        # Generate separate types for each discriminated variant
                        base_props = json_type.get('properties', {})
                        discriminator_field = 'type'  # The discriminator field
                        discriminator_enum = base_props.get(discriminator_field, {}).get('enum', [])

                        # Create a shared enum type for the discriminator field that all variants will reference
                        shared_discriminator_enum = None
                        if discriminator_enum:
                            shared_discriminator_enum = self.create_enum_type(
                                discriminator_field,
                                self.compose_namespace(namespace, record_name + '_types'),
                                discriminator_enum
                            )

                        for allof_item in json_type['allOf']:
                            if not (isinstance(allof_item, dict) and 'if' in allof_item and 'then' in allof_item):
                                continue

                            # Extract the discriminator value from the if clause
                            if_clause = allof_item['if']
                            discriminator_value = None
                            if (isinstance(if_clause, dict) and
                                    'properties' in if_clause and
                                    discriminator_field in if_clause['properties']):
                                disc_prop = if_clause['properties'][discriminator_field]
                                if 'enum' in disc_prop and len(disc_prop['enum']) > 0:
                                    discriminator_value = disc_prop['enum'][0]

                            if not discriminator_value:
                                continue

                            # Resolve the then clause reference
                            then_clause = allof_item['then']
                            if isinstance(then_clause, dict) and '$ref' in then_clause:
                                resolved_type, _ = self.resolve_reference(then_clause, base_uri, json_schema)

                                # Create a new type combining base properties and resolved type
                                variant_type = copy.deepcopy(resolved_type)

                                # Set the variant type name to the discriminator value
                                variant_type['title'] = discriminator_value

                                # Preserve description from base type if variant doesn't have one
                                if 'description' not in variant_type and 'description' in base_type:
                                    variant_type['description'] = base_type['description']

                                # Merge base properties into the variant
                                if 'properties' not in variant_type:
                                    variant_type['properties'] = {}

                                for prop_name, prop_def in base_props.items():
                                    if prop_name not in variant_type['properties']:
                                        # For non-discriminator fields, copy the property definition
                                        if prop_name != discriminator_field:
                                            variant_type['properties'][prop_name] = copy.deepcopy(prop_def)

                                # Set discriminator field to reference the shared enum type
                                if shared_discriminator_enum:
                                    variant_type['properties'][discriminator_field] = {
                                        'type': shared_discriminator_enum,
                                        'default': discriminator_value,
                                        'const': discriminator_value,
                                        'discriminator': True
                                    }
                                else:
                                    # Fallback if no enum was found
                                    variant_type['properties'][discriminator_field] = {
                                        'type': 'string',
                                        'default': discriminator_value,
                                        'const': discriminator_value,
                                        'discriminator': True
                                    }

                                # Add union annotation to indicate this is part of a discriminated union
                                variant_type['union'] = record_name

                                json_types.append(variant_type)
                    else:
                        # Original allOf merging logic for non-discriminated unions
                        type_list = [copy.deepcopy(base_type)]
                        for allof_option in json_type['allOf']:
                            # chase nested $refs until the option is fully inlined
                            while isinstance(allof_option, dict) and '$ref' in allof_option:
                                resolved_json_type, resolved_schema = self.resolve_reference(
                                    allof_option, base_uri, json_schema)
                                del allof_option['$ref']
                                allof_option = self.merge_json_schemas(
                                    [allof_option, resolved_json_type])
                            type_list.append(copy.deepcopy(allof_option))
                        merged_type = self.merge_json_schemas(
                            type_list, intersect=False)
                        json_types.append(merged_type)

                # --- oneOf composition ---
                if 'oneOf' in json_type:
                    # if the json type is a oneOf, we create a type union of all types
                    if len(json_types) == 0:
                        type_to_process = copy.deepcopy(base_type)
                    else:
                        type_to_process = copy.deepcopy(json_types.pop())
                    json_types = []
                    oneof = json_type['oneOf']
                    # NOTE(review): json_types was reset to [] two lines above, so this
                    # condition is always true and the else branch below looks unreachable - verify
                    if len(json_types) == 0:
                        for oneof_option in oneof:
                            if isinstance(oneof_option, dict) and 'type' in oneof_option and 'type' in type_to_process and not type_to_process.get('type') == oneof_option.get('type'):
                                # we can't merge these due to conflicting types, so we pass the option-type on as-is
                                json_types.append(oneof_option)
                            else:
                                json_types.append(self.merge_json_schemas(
                                    [type_to_process, oneof_option], intersect=True))
                    else:
                        new_json_types = []
                        for oneof_option in oneof:
                            for json_type_option in json_types:
                                json_type_option = self.merge_json_schemas(
                                    [json_type_option, oneof_option], intersect=True)
                                new_json_types.append(json_type_option)
                        json_types = new_json_types

                # --- anyOf composition ---
                if 'anyOf' in json_type:
                    types_to_process = json_types.copy() if len(json_types) > 0 else [
                        copy.deepcopy(base_type)]
                    json_types = []
                    for type_to_process in types_to_process:
                        type_list = [copy.deepcopy(type_to_process)]
                        # anyOf is a list of types where any number from 1 to all
                        # may match the data. Trouble with anyOf is that it doesn't
                        # really have a semantic interpretation in the context of Avro.
                        for anyof_option in json_type['anyOf']:
                            if isinstance(anyof_option, dict) and '$ref' in anyof_option:
                                # if we have a ref, we can't merge into the base type, so we pass it on as-is.
                                # into the JSON type list
                                json_types.append(copy.deepcopy(anyof_option))
                            else:
                                type_list.append(copy.deepcopy(anyof_option))
                        merged_type = self.merge_json_schemas(
                            type_list, intersect=False)
                        json_types.append(merged_type)

                # --- convert the composed candidate types ---
                if len(json_types) > 0:
                    if len(json_types) == 1:
                        # single candidate: convert it directly
                        avro_type = self.json_type_to_avro_type(
                            json_types[0], record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                        if isinstance(avro_type, dict) and self.is_empty_type(avro_type) and not 'allOf' in json_type:
                            avro_type['type'] = generic_type()
                        avro_type = self.post_check_avro_type(
                            dependencies, avro_type)
                        return avro_type
                    else:
                        # multiple candidates: convert each and collect a union
                        try:
                            record_stack.append(
                                field_name if field_name else record_name)
                            subtypes = []
                            count = 1
                            type_deps: List[str] = []
                            for json_type_option in json_types:
                                if isinstance(json_type_option, dict) and '$ref' in json_type_option:
                                    ref = json_type_option['$ref']
                                    if ref in self.imported_types:
                                        # already resolved earlier: reuse the cached reference
                                        avro_subtype = self.imported_types[ref]
                                        subtypes.append(avro_subtype)
                                        type_deps.append(avro_subtype)
                                        continue

                                subtype_deps: List[str] = []
                                # Use title from discriminated union if available, otherwise generate numbered name
                                if isinstance(json_type_option, dict) and 'title' in json_type_option:
                                    sub_field_name = avro_name(json_type_option['title'])
                                elif not isinstance(json_type_option, dict) or not '$ref' in json_type_option:
                                    sub_field_name = avro_name(local_name + '_' + str(count))
                                else:
                                    sub_field_name = None
                                avro_subtype = self.json_type_to_avro_type(
                                    json_type_option, record_name, sub_field_name, namespace, subtype_deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                                if not avro_subtype:
                                    continue
                                if isinstance(avro_subtype, dict) and 'name' in avro_subtype and 'type' in avro_subtype and (avro_subtype['type'] == 'record' or avro_subtype['type'] == 'enum'):
                                    # we have a standalone record or enum so we need to add it to the schema at the top-level
                                    # and reference it as a dependency from the parent type if it's not already been added.
                                    existing_type = next((t for t in avro_schema if t.get('name') == avro_subtype['name'] and t.get(
                                        'namespace') == avro_subtype.get('namespace')), None)
                                    if not existing_type:
                                        if subtype_deps:
                                            if not 'dependencies' in avro_subtype:
                                                avro_subtype['dependencies'] = subtype_deps
                                            else:
                                                avro_subtype['dependencies'].extend(
                                                    subtype_deps)
                                        if self.is_empty_type(avro_subtype):
                                            print(
                                                f'WARN: Standalone type {avro_subtype["name"]} is empty')
                                        if avro_subtype['type'] != 'enum' and avro_subtype['type'] != 'record' and avro_subtype['type'] != 'fixed':
                                            raise ValueError(
                                                f'WARN: Standalone type {avro_subtype["name"]} is not a record or enum or fixed type')
                                        avro_schema.append(avro_subtype)
                                    # replace the inline definition with its qualified name
                                    full_name = self.get_qualified_name(
                                        avro_subtype)
                                    subtype_deps = [full_name]
                                    avro_subtype = full_name
                                if isinstance(avro_subtype, dict) and 'dependencies' in avro_subtype:
                                    subtype_deps.extend(
                                        avro_subtype['dependencies'])
                                    del avro_subtype['dependencies']
                                if len(subtype_deps) > 0:
                                    type_deps.extend(subtype_deps)
                                if not self.is_empty_type(avro_subtype):
                                    if isinstance(avro_subtype, list):
                                        subtypes.extend(
                                            copy.deepcopy(avro_subtype))
                                    else:
                                        subtypes.append(
                                            copy.deepcopy(avro_subtype))
                                count += 1
                            if len(type_deps) > 0:
                                dependencies.extend(type_deps)
                            if len(subtypes) == 1:
                                return self.post_check_avro_type(dependencies, subtypes[0])
                        finally:
                            record_stack.pop()

                        if hasAnyOf:
                            # if all subtypes are strings, they are either primitive types or type references
                            # which means there's nothing to merge, so we'll return the list of types
                            if all([isinstance(st, str) for st in subtypes]):
                                return self.post_check_avro_type(dependencies, subtypes)

                            # we now has a list of types that may match the data, but this would be
                            # an Avro union which is mutually exclusive. We will merge this list
                            # into a record type in postprocessing when all types are available
                            if not isinstance(avro_type, dict):
                                avro_type = {}
                            avro_type['unmerged_types'] = subtypes
                            avro_type['type'] = 'record'
                            avro_type['name'] = avro_name(local_name)
                            if local_name != avro_name(local_name):
                                avro_type['altnames'] = { 'json': local_name }
                            avro_type['namespace'] = namespace
                            avro_type['fields'] = []
                            if 'description' in json_type:
                                avro_type['doc'] = json_type['description']
                            json_type = {}
                        else:
                            return self.post_check_avro_type(dependencies, subtypes)

                # an object with properties but no explicit type is treated as an object
                if 'properties' in json_type and not 'type' in json_type:
                    json_type['type'] = 'object'

                if 'description' in json_type and isinstance(avro_type, dict):
                    avro_type['doc'] = json_type['description']

                if 'title' in json_type and isinstance(avro_type, dict):
                    self.set_avro_type_value(
                        avro_type, 'name', avro_name(json_type['title']))

                # first, pull in any referenced definitions and merge with this schema
                if '$ref' in json_type:
                    # the $ref can indeed be a list as a result from a prior allOf/anyOf merge
                    # if that is so, we will copy the type and process each $ref separately
                    # and return the result as a list of types
                    if isinstance(json_type['$ref'], list):
                        types = []
                        for ref in json_type['$ref']:
                            json_type_copy = copy.deepcopy(json_type)
                            json_type_copy['$ref'] = ref
                            types.append(self.json_type_to_avro_type(json_type_copy, record_name, field_name, namespace,
                                                                     dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
                        return self.post_check_avro_type(dependencies, types)

                    ref = json_type['$ref']
                    if ref in self.imported_types:
                        # reference was already resolved, so we can resolve the reference simply by returning the type
                        type_ref = copy.deepcopy(self.imported_types[ref])
                        if isinstance(type_ref, str):
                            dependencies.append(type_ref)
                        return self.post_check_avro_type(dependencies, type_ref)
                    else:
                        new_base_uri = self.compose_uri(
                            base_uri, json_type['$ref'])
                        resolved_json_type, resolved_schema = self.resolve_reference(
                            json_type, base_uri, json_schema)
                        if self.is_empty_json_type(json_type):
                            # it's a standalone reference, so will import the type into the schema
                            # and reference it like it was in the same file
                            type_name = record_name
                            type_namespace = namespace
                            parsed_ref = urlparse(ref)
                            if parsed_ref.fragment:
                                # derive name/namespace from the JSON-pointer fragment
                                type_name = avro_name(
                                    parsed_ref.fragment.split('/')[-1])
                                sub_namespace = self.compose_namespace(
                                    *parsed_ref.fragment.split('/')[2:-1])
                                type_namespace = self.compose_namespace(
                                    self.root_namespace, sub_namespace)

                            # registering in imported_types ahead of resolving to prevent circular references.
                            # we only cache the type if it's forseeable that it is usable as a standalone type
                            # which means that it must be either a record or an enum or a fixed type when converted
                            # to Avro. That means we look for the presence of 'type', 'properties', 'allOf', 'anyOf',
                            # and 'enum' in the resolved type.
                            if resolved_json_type and (('type' in resolved_json_type and resolved_json_type['type'] == 'object') or 'properties' in resolved_json_type or 'enum' in resolved_json_type or
                                                       'allOf' in resolved_json_type or 'anyOf' in resolved_json_type):
                                self.imported_types[ref] = self.compose_namespace(
                                    type_namespace, type_name)
                            # resolve type
                            deps: List[str] = []
                            resolved_avro_type: dict | list | str | None = self.json_type_to_avro_type(
                                resolved_json_type, type_name, '', type_namespace, deps, resolved_schema, new_base_uri, avro_schema, [], recursion_depth + 1)
                            if isinstance(resolved_avro_type, str):
                                dependencies.extend(deps)
                                return self.post_check_avro_type(dependencies, resolved_avro_type)
                            if isinstance(resolved_avro_type, list) or (not isinstance(resolved_avro_type, dict) or (not resolved_avro_type.get('type') == 'record' and not resolved_avro_type.get('type') == 'enum')):
                                if isinstance(resolved_avro_type, dict) and not 'type' in resolved_avro_type:
                                    if isinstance(avro_type, dict):
                                        # the resolved type didn't have a type and avro_type is a dict,
                                        # so we assume it's a mixin into the type we found
                                        avro_type.update(resolved_avro_type)
                                        resolved_avro_type = None
                                    else:
                                        # no 'type' definition for this field and we can't mix into the avro type,
                                        # so we fallback to a generic type
                                        print(
                                            f"WARNING: no 'type' definition for {ref} in record {record_name}: {json.dumps(resolved_avro_type)}")
                                        resolved_avro_type = generic_type()
                                elif isinstance(avro_type, str) and resolved_avro_type:
                                    # this is a plain type reference
                                    avro_type = resolved_avro_type
                                    self.imported_types[ref] = avro_type
                                    resolved_avro_type = None
                                if resolved_avro_type:
                                    # this is not a record type that can stand on its own,
                                    # so we remove the cached type entry
                                    # and pass it on as an inline type
                                    dependencies.extend(deps)
                                    if ref in self.imported_types:
                                        del self.imported_types[ref]
                                    avro_type = self.merge_avro_schemas(
                                        [avro_type, resolved_avro_type], avro_schema, local_name)
                                    if isinstance(avro_type, dict) and 'name' in avro_type and not self.is_standalone_avro_type(avro_type):
                                        del avro_type['name']
                                    return self.post_check_avro_type(dependencies, avro_type)
                            else:
                                avro_type = resolved_avro_type
                                self.imported_types[ref] = copy.deepcopy(
                                    avro_type)

                            if len(deps) > 0:
                                if isinstance(avro_type, dict):
                                    avro_type['dependencies'] = deps
                                else:
                                    dependencies.extend(deps)

                            if self.is_standalone_avro_type(avro_type):
                                # hoist the resolved record/enum to the top level and refer to it by name
                                self.register_type(avro_schema, avro_type)
                                full_name = self.get_qualified_name(avro_type)
                                if ref in self.imported_types:
                                    # update the import reference to the resolved type if it's cached
                                    self.imported_types[ref] = full_name
                                dependencies.append(full_name)
                                avro_type = full_name
                        else:
                            del json_type['$ref']
                            # it's a reference within a definition, so we will turn this into an inline type
                            if isinstance(resolved_json_type, dict) and 'type' in resolved_json_type and json_type.get('type') and not json_type['type'] == resolved_json_type['type']:
                                # the types conflict, so we can't merge them
                                type1 = self.json_type_to_avro_type(
                                    json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
                                type2 = self.json_type_to_avro_type(resolved_json_type, record_name, field_name, namespace,
                                                                    dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
                                # if either of the types are empty, use just the other one
                                if not self.is_empty_type(type1) and not self.is_empty_type(type2):
                                    return self.flatten_union([type1, type2])
                                if not self.is_empty_type(type1):
                                    avro_type = type1
                                    if isinstance(avro_type, list):
                                        return self.post_check_avro_type(dependencies, avro_type)
                                if not self.is_empty_type(type2):
                                    avro_type = type2
                                    if isinstance(avro_type, list):
                                        return self.post_check_avro_type(dependencies, avro_type)
                                json_type = {}
                            else:
                                json_type = self.merge_json_schemas(
                                    [json_type, resolved_json_type])
                                avro_type = self.json_type_to_avro_type(
                                    json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
                                json_type = {}
                            if ref in self.imported_types:
                                # update the import reference to the resolved type if it's cached
                                if isinstance(avro_type, dict) and 'name' in avro_type:
                                    self.imported_types[ref] = avro_type['name']
                                else:
                                    self.imported_types[ref] = avro_type

                # if 'const' is present, make this an enum
                if 'const' in json_type:
                    const_list = json_type['const'] if isinstance(
                        json_type['const'], list) else [json_type['const']]
                    avro_type = self.merge_avro_schemas([avro_type, self.create_enum_type(
                        local_name, namespace, const_list)], avro_schema, local_name)
                if json_object_type or 'enum' in json_type:
                    if json_object_type == 'array':
                        if isinstance(json_type, dict) and 'items' in json_type:
                            # convert the item type; standalone records/enums get registered top-level
                            deps = []
                            item_type = self.json_type_to_avro_type(
                                json_type['items'], record_name, field_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                            if self.is_standalone_avro_type(item_type):
                                if isinstance(item_type, dict) and len(deps) > 0:
                                    item_type['dependencies'] = deps
                                self.register_type(avro_schema, item_type)
                                dependencies.append(
                                    self.get_qualified_name(item_type))
                            else:
                                dependencies.extend(deps)
                                if isinstance(item_type, dict) and not 'type' in item_type:
                                    item_type = generic_type()
                                elif isinstance(item_type, str) and not item_type in primitive_types:
                                    dependencies.append(item_type)
                                else:  # not a standalone type, but has a type definition, so we unwind that here
                                    item_type = self.post_check_avro_type(
                                        dependencies, item_type)
                            avro_type = self.merge_avro_schemas(
                                [avro_type, self.create_array_type(item_type)], avro_schema, '')
                        else:
                            # array without 'items': fall back to an array of the generic type
                            avro_type = self.merge_avro_schemas(
                                [avro_type, self.create_array_type(generic_type())], avro_schema, '')
                    elif json_object_type and (json_object_type == 'object' or 'object' in json_object_type):
                        avro_record_type = self.json_schema_object_to_avro_record(
                            local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                        if isinstance(avro_record_type, list):
                            for record_entry in avro_record_type:
                                self.lift_dependencies_from_type(
                                    record_entry, dependencies)
                        avro_type = self.merge_avro_schemas([avro_type, avro_record_type], avro_schema, avro_type.get(
                            'name', local_name) if isinstance(avro_type, dict) else local_name)
                        self.lift_dependencies_from_type(
                            avro_type, dependencies)
                    elif 'enum' in json_type:
                        # Handle enums with proper type handling for mixed string/int enums
                        enum_values = json_type['enum']
                        schema_type = json_type.get('type', 'string')

                        # For pure string enums with valid symbols, use simple enum without suffix
                        string_values = [v for v in enum_values if isinstance(v, str) and v]
                        int_values = [v for v in enum_values if isinstance(v, int)]

                        if not int_values and string_values:
                            # Pure string enum
                            if not self.enum_symbols_need_string_fallback(string_values):
                                # Simple case: valid symbols, just create enum
                                avro_type = self.create_enum_type(
                                    local_name,
                                    self.compose_namespace(namespace, record_name + '_types'),
                                    string_values
                                )
                            else:
                                # Symbols need prefixing, use helper with string fallback
                                avro_type = self.create_enum_for_mixed_types(
                                    local_name,
                                    self.compose_namespace(namespace, record_name + '_types'),
                                    enum_values,
                                    schema_type
                                )
                                # Register any embedded enum types in the union
                                self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
                        else:
                            # Mixed or int-only enum, use helper
                            avro_type = self.create_enum_for_mixed_types(
                                local_name + '_1',
                                self.compose_namespace(namespace, record_name + '_types'),
                                enum_values,
                                schema_type
                            )
                            # Register any embedded enum types in the union
                            self.register_embedded_types_in_union(avro_type, avro_schema, dependencies)
                    else:
                        avro_type = self.json_schema_primitive_to_avro_type(json_object_type, json_type.get(
                            'format'), json_type.get('enum'), record_name, field_name, namespace, dependencies)
            else:
                # json_type is not a dict here, i.e. a primitive type name string.
                # NOTE(review): the first branch below re-tests isinstance(json_type, dict),
                # which cannot be true in this else arm - looks unreachable, verify
                if isinstance(json_type, dict):
                    avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(json_type, json_type.get('format'), json_type.get(
                        'enum'), record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)
                else:
                    avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(
                        json_type, None, None, record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)

            # deduplicate named types against the top-level schema
            if isinstance(avro_type, dict) and 'name' in avro_type and 'type' in avro_type and not (avro_type['type'] in ['array', 'map']):
                if not 'namespace' in avro_type:
                    avro_type['namespace'] = namespace
                existing_type = next((t for t in avro_schema if t.get(
                    'name') == avro_type['name'] and t.get('namespace') == avro_type.get('namespace')), None)
                if existing_type:
                    existing_type_name = self.get_qualified_name(existing_type)
                    if not existing_type_name in dependencies:
                        dependencies.append(existing_type_name)
                    return existing_type_name
                self.set_avro_type_value(avro_type, 'name', local_name)

            # post-check on the avro type: if the type is a dict, and the 'type' is not
            # a record, enum, fixed, array, or map, we will just return the basic type
            # and push its dependencies up the stack
            avro_type = self.post_check_avro_type(dependencies, avro_type)

            if isinstance(avro_type, dict) and 'unmerged_types' in avro_type:
                # remember this type so postprocessing can merge the collected union members
                self.types_with_unmerged_types.append(avro_type)

            return avro_type
        except RecursionError as e:
            print(
                f"Recursion error while processing {namespace}:{record_name}:{field_name} with recursion depth {recursion_depth}")
            raise e
1355
+ def post_check_avro_type(self, dependencies, avro_type):
1356
+ """Post-check the Avro type and push dependencies up the stack."""
1357
+ if isinstance(avro_type, dict) and 'type' in avro_type and (isinstance(avro_type, list) or not avro_type['type'] in ['array', 'map', 'record', 'enum', 'fixed']):
1358
+ if 'dependencies' in avro_type:
1359
+ dependencies.extend(avro_type['dependencies'])
1360
+ avro_type = avro_type['type']
1361
+ return avro_type
1362
+
1363
+ def register_type(self, avro_schema, avro_type) -> bool:
1364
+ """Register a type in the Avro schema."""
1365
+ existing_type = next((t for t in avro_schema if t.get(
1366
+ 'name') == avro_type['name'] and t.get('namespace') == avro_type.get('namespace')), None)
1367
+ if not existing_type:
1368
+ if self.is_empty_type(avro_type) and not 'unmerged_types' in avro_type:
1369
+ print(f'WARN: Standalone type {avro_type["name"]} is empty')
1370
+ if self.is_standalone_avro_type(avro_type):
1371
+ avro_schema.append(avro_type)
1372
+ return True
1373
+ else:
1374
+ return False
1375
+ else:
1376
+ return True
1377
+
1378
+ def register_embedded_types_in_union(self, avro_type, avro_schema, dependencies):
1379
+ """
1380
+ Register any embedded named types (enum, record, fixed) found within a union type.
1381
+ This ensures that enum types created by create_enum_for_mixed_types are properly
1382
+ registered in the schema and can be referenced by name.
1383
+ """
1384
+ if isinstance(avro_type, list):
1385
+ for i, member in enumerate(avro_type):
1386
+ if isinstance(member, dict) and 'type' in member and member['type'] in ['enum', 'record', 'fixed']:
1387
+ # Register the embedded type
1388
+ if self.register_type(avro_schema, member):
1389
+ # Replace the inline definition with a reference
1390
+ full_name = self.get_qualified_name(member)
1391
+ avro_type[i] = full_name
1392
+ if full_name not in dependencies:
1393
+ dependencies.append(full_name)
1394
+
1395
+ def has_composition_keywords(self, json_object: dict) -> bool:
1396
+ """Check if the JSON object has any of the combining keywords: allOf, oneOf, anyOf."""
1397
+ return isinstance(json_object, dict) and ('allOf' in json_object or 'oneOf' in json_object or 'anyOf' in json_object)
1398
+
1399
+ def has_enum_keyword(self, json_object: dict) -> bool:
1400
+ """Check if the JSON object is an enum."""
1401
+ return isinstance(json_object, dict) and 'enum' in json_object
1402
+
1403
+ def is_array_object(self, json_object: dict) -> bool:
1404
+ """Check if the JSON object is an array object."""
1405
+ return isinstance(json_object, dict) and 'type' in json_object and json_object['type'] == 'array'
1406
+
1407
+ def is_standalone_avro_type(self, avro_type: dict | list | str) -> bool:
1408
+ """Check if the Avro type is a standalone type."""
1409
+ return isinstance(avro_type, dict) and 'type' in avro_type and (avro_type['type'] in ['record', 'enum', 'fixed'])
1410
+
1411
+ def is_avro_complex_type(self, avro_type: dict) -> bool:
1412
+ """Check if the Avro type is a complex type."""
1413
+ return 'type' in avro_type and avro_type['type'] in ['record', 'enum', 'fixed', 'array', 'map']
1414
+
1415
+ def set_avro_type_value(self, avro_type: dict | list | str, name: str, value: dict | list | str):
1416
+ """Set a value in an Avro type."""
1417
+ if isinstance(avro_type, dict):
1418
+ if name == 'namespace' or name == 'name':
1419
+ if 'type' in avro_type:
1420
+ if not (avro_type['type'] in ['record', 'enum', 'fixed']):
1421
+ return
1422
+ avro_type[name] = value
1423
+
1424
def create_avro_record(self, name: str, namespace: str, fields: list) -> dict:
    """Build a new Avro record declaration; the name is sanitized via avro_name()."""
    record = {'type': 'record'}
    record['name'] = avro_name(name)
    record['namespace'] = namespace
    record['fields'] = fields
    return record
1432
+
1433
def create_wrapper_record(self, wrapper_name: str, wrapper_namespace: str, wrapper_field: str, dependencies: list, avro_type: list | str | dict) -> dict:
    """Wrap avro_type in a single-field record so unions/arrays can stand alone in Avro."""
    wrapped_field = {
        'name': wrapper_field,
        'type': avro_type
    }
    wrapper = self.create_avro_record(wrapper_name, wrapper_namespace, [wrapped_field])
    if dependencies:
        wrapper['dependencies'] = dependencies
    return wrapper
1444
+
1445
def create_enum_type(self, name: str, namespace: str, symbols: list) -> dict:
    """Create an Avro enum type.

    Args:
        name: Enum type name (used as-is; caller sanitizes if needed).
        namespace: Namespace for the enum.
        symbols: Raw symbol values; may contain duplicates from composition merges.

    Returns:
        An Avro enum declaration with sanitized, unique symbols.
    """
    # the symbol list may have been merged by composition so we flatten it to have a unique list
    symbols = self.flatten_union(symbols)
    # Sanitize and dedupe AFTER mapping: distinct source values (e.g. "1" and "_1")
    # can collide once avro_name() prefixes them, and the Avro spec requires
    # enum symbols to be unique. Order of first occurrence is preserved.
    seen = set()
    unique_symbols = []
    for symbol in symbols:
        mapped = avro_name(symbol)
        if mapped not in seen:
            seen.add(mapped)
            unique_symbols.append(mapped)
    return {
        'type': 'enum',
        'name': name,
        'namespace': namespace,
        'symbols': unique_symbols
    }
1455
+
1456
def enum_symbols_need_string_fallback(self, symbols: list) -> bool:
    """
    Check if any enum symbols will be transformed by avro_name().
    If symbols are prefixed (e.g., "1" -> "_1"), we need a string fallback
    in the union to handle original JSON values during deserialization.
    """
    # avro_name is only consulted for non-empty strings, matching how the
    # symbols are later sanitized.
    return any(
        isinstance(symbol, str) and symbol and avro_name(symbol) != symbol
        for symbol in symbols
    )
1467
+
1468
+ def create_enum_for_mixed_types(self, name: str, namespace: str, enum_values: list, json_types: list) -> dict | list:
1469
+ """
1470
+ Create an Avro type for enums with mixed or special type requirements.
1471
+
1472
+ Handles:
1473
+ - Pure string enum with valid symbols -> enum
1474
+ - Pure string enum with prefixed symbols -> [enum, string]
1475
+ - Pure int enum -> int (with doc hint about allowed values)
1476
+ - Mixed string/int enum -> [enum, string, int]
1477
+
1478
+ Args:
1479
+ name: The enum type name
1480
+ namespace: The namespace for the enum
1481
+ enum_values: The list of enum values from JSON Schema
1482
+ json_types: The JSON Schema type(s), e.g., "string", "integer", or ["string", "integer"]
1483
+
1484
+ Returns:
1485
+ Avro type: either an enum dict, a primitive string, or a union list
1486
+ """
1487
+ if not isinstance(json_types, list):
1488
+ json_types = [json_types]
1489
+
1490
+ # Normalize type names
1491
+ has_string = 'string' in json_types
1492
+ has_int = 'integer' in json_types or 'int' in json_types
1493
+ has_null = 'null' in json_types
1494
+
1495
+ # Separate string and int enum values
1496
+ string_values = [v for v in enum_values if isinstance(v, str) and v]
1497
+ int_values = [v for v in enum_values if isinstance(v, int)]
1498
+
1499
+ # Pure integer enum case
1500
+ if has_int and not has_string and not string_values:
1501
+ # Just use int - no enum type needed for pure int enums
1502
+ # The doc will contain the allowed values hint
1503
+ result = 'int'
1504
+ if has_null:
1505
+ result = ['null', result]
1506
+ return result
1507
+
1508
+ # Build the enum from string values (or string representations of all values)
1509
+ if string_values:
1510
+ enum_symbols = list(set(string_values))
1511
+ else:
1512
+ # No string values but has_string type - shouldn't happen normally
1513
+ enum_symbols = []
1514
+
1515
+ if not enum_symbols:
1516
+ # No valid enum symbols, fall back to primitive types
1517
+ union = []
1518
+ if has_null:
1519
+ union.append('null')
1520
+ if has_string:
1521
+ union.append('string')
1522
+ if has_int:
1523
+ union.append('int')
1524
+ return union if len(union) > 1 else (union[0] if union else 'string')
1525
+
1526
+ # Create the enum type
1527
+ avro_enum = self.create_enum_type(name, namespace, enum_symbols)
1528
+
1529
+ # Determine if we need additional types in union
1530
+ needs_string_fallback = self.enum_symbols_need_string_fallback(enum_symbols)
1531
+
1532
+ # Build the union
1533
+ union = []
1534
+ if has_null:
1535
+ union.append('null')
1536
+ union.append(avro_enum)
1537
+
1538
+ # Add string fallback if symbols were prefixed OR if this is a mixed type enum
1539
+ if needs_string_fallback or has_int:
1540
+ union.append('string')
1541
+
1542
+ # Add int if the schema allows integers
1543
+ if has_int:
1544
+ union.append('int')
1545
+
1546
+ # Return enum directly if no union needed
1547
+ if len(union) == 1:
1548
+ return union[0]
1549
+
1550
+ return union
1551
+
1552
+ def create_array_type(self, items: list | dict | str) -> dict:
1553
+ """Create an Avro array type."""
1554
+ return {
1555
+ 'type': 'array',
1556
+ 'items': items
1557
+ }
1558
+
1559
+ def create_map_type(self, values: list | dict | str) -> dict:
1560
+ """Create an Avro map type."""
1561
+ return {
1562
+ 'type': 'map',
1563
+ 'values': values
1564
+ }
1565
+
1566
+ def nullable(self, avro_type: list | dict | str) -> list | dict | str:
1567
+ """Wrap a type in a union with null."""
1568
+ if isinstance(avro_type, list):
1569
+ cp = avro_type.copy()
1570
+ cp.insert(0, 'null')
1571
+ return cp
1572
+ return ['null', avro_type]
1573
+
1574
+ def merge_description_into_doc(self, source_json: dict, target_avro: dict | list | str):
1575
+ """Merge a description in JSON into Avro doc."""
1576
+ if isinstance(source_json, dict) and 'description' in source_json and isinstance(target_avro, dict):
1577
+ target_avro['doc'] = target_avro['doc'] + ", " + \
1578
+ source_json['description'] if 'doc' in target_avro else source_json['description']
1579
+
1580
def merge_dependencies_into_parent(self, dependencies: list, child_type: dict | list | str, parent_type: dict | list | str):
    """Lift the child's dependencies into `dependencies` and attach them to the parent type."""
    # pull the child's own dependency list into the shared accumulator first
    self.lift_dependencies_from_type(child_type, dependencies)
    if not dependencies or not isinstance(parent_type, dict):
        return
    if 'dependencies' in parent_type:
        # parent already tracks dependencies; fold them into the caller's list
        dependencies.extend(parent_type['dependencies'])
    else:
        # attach the caller's list itself (aliased, not copied) — callers rely
        # on later appends to `dependencies` being visible on the parent
        parent_type['dependencies'] = dependencies
1588
+
1589
+ def lift_dependencies_from_type(self, child_type: dict | list | str, dependencies: list):
1590
+ """Lift all dependencies from a type and return a new type with the dependencies lifted."""
1591
+ if isinstance(child_type, dict):
1592
+ if 'dependencies' in child_type:
1593
+ dependencies.extend(child_type['dependencies'])
1594
+ del child_type['dependencies']
1595
+
1596
def compose_namespace(self, *names) -> str:
    """Join the non-empty name segments into a dotted Avro namespace."""
    sanitized = (avro_namespace(segment) for segment in names if segment)
    return '.'.join(sanitized)
1599
+
1600
def get_qualified_name(self, avro_type):
    """Return the dotted 'namespace.name' of an Avro type dict (empty parts are omitted)."""
    namespace = avro_type.get('namespace', '')
    name = avro_type.get('name', '')
    return self.compose_namespace(namespace, name)
1603
+
1604
def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth: int = 1) -> dict | list | str | None:
    """Convert a JSON schema object declaration to an Avro record.

    Dispatches on the shape of json_object: composition keywords
    (allOf/oneOf/anyOf), enum, standalone array, then the general
    object-with-properties case. May return a record dict, a
    [record, alternate-map] pair, a type-name string, or None when the
    declaration resolves to something that is not emitted standalone.

    Args:
        name: Candidate name for the resulting record ('title' is the fallback).
        json_object: The JSON schema node to convert.
        namespace: Target Avro namespace (extended for nested records).
        json_schema: The enclosing JSON schema document (for $ref resolution).
        base_uri: Base URI for resolving references.
        avro_schema: Top-level list that globally registered types are added to.
        record_stack: Names of records currently being built; used to detect cycles.
        recursion_depth: Current conversion depth, incremented on nested calls.
    """
    dependencies: List[str] = []
    avro_type: list | dict | str = {}

    # handle top-level allOf, anyOf, oneOf
    if self.has_composition_keywords(json_object):
        # we will merge allOf, oneOf, anyOf into a union record type
        type = self.json_type_to_avro_type(
            json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
        if isinstance(type, str):
            # we are skipping references and primitives
            return None
        if isinstance(type, list):
            # we should have a union type
            avro_type = self.create_wrapper_record(
                name+"_union", self.utility_namespace, 'options', [], type)
        elif isinstance(type, dict) and 'type' in type and type['type'] != 'record':
            # merge the type into a record type if it's not a record type
            print(
                f'INFO: Standalone type {name} is being wrapped in a record')
            avro_type = self.create_wrapper_record(avro_name(type.get(
                'name', name)+'_wrapper'), self.utility_namespace, 'value', type.get('dependencies', []), type)
        else:
            avro_type = type
        # add external dependencies to the record
        self.merge_dependencies_into_parent(dependencies, type, avro_type)
        self.merge_description_into_doc(json_object, avro_type)
        # return the union type
        return avro_type

    if self.has_enum_keyword(json_object):
        # this is an enum
        avro_enum = self.create_enum_type(
            avro_name(name), namespace, json_object['enum'])
        self.merge_description_into_doc(json_object, avro_enum)
        return avro_enum

    if self.is_array_object(json_object):
        # this is an array, which can't be standalone in Avro, so we will wraps it into a record
        # and include the type as an inline
        print(
            f'WARN: Standalone array type {name} will be wrapped in a record')
        deps: List[str] = []
        array_type = self.json_type_to_avro_type(json_object, name, avro_name(
            name), namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
        avro_array = self.create_wrapper_record(
            avro_name(name+'_wrapper'), self.utility_namespace, 'items', [], array_type)
        self.merge_description_into_doc(json_object, avro_array)
        self.merge_dependencies_into_parent(deps, array_type, avro_array)
        return avro_array

    # at this point, we have to assume that we have a JSON schema object
    title = json_object.get('title')
    record_name = avro_name(name if name else title if title else None)
    if record_name is None:
        raise ValueError(
            f"Cannot determine record name for json_object {json_object}")
    if len(record_stack) > 0:
        # if we have a record stack, we need to add the current name to
        # the namespace since nested types are disambiguated by their namespace
        namespace = self.compose_namespace(
            namespace, record_stack[-1] + "_types")
    # at this point we have a record type
    avro_record = self.create_avro_record(record_name, namespace, [])
    # Check if this record has a 'union' annotation from discriminated union pattern
    if 'union' in json_object:
        avro_record['union'] = json_object['union']
    # we need to prevent circular dependencies, so we will maintain a stack of the in-progress
    # records and will resolve the cycle as we go. if this record is already in the stack, we will
    # just return a reference to a record that contains this record
    if record_name in record_stack:
        # to break the cycle, we will use a containment type that references
        # the record that is being defined
        print(
            f'WARN: Circular dependency found for record {record_name}. Creating {record_name}_ref.')
        ref_name = avro_name(record_name + '_ref')
        return self.create_wrapper_record(ref_name, namespace, record_name, [], self.compose_namespace(namespace, record_name))
    try:
        # enter the record stack scope for this record
        record_stack.append(record_name)
        # collect the required fields so we can make those fields non-null
        required_fields = json_object.get('required', [])

        field_refs = []
        if 'properties' in json_object and isinstance(json_object['properties'], dict):
            # add the properties as fields
            for field_name, json_field_types in json_object['properties'].items():
                if isinstance(json_field_types, bool):
                    # for "propertyname": true, we skip. schema bug.
                    continue
                if not isinstance(json_field_types, list):
                    json_field_types = [json_field_types]
                field_type_list = []
                field_ref_type_list = []
                const = None
                default = None
                description = None
                discriminator = None
                for json_field_type in json_field_types:
                    # skip fields with an bad or empty type
                    if not isinstance(json_field_type, dict):
                        continue
                    field_name = avro_name(field_name)
                    # last const wins if there are multiple
                    const = json_field_type.get('const', const)
                    # last default wins if there are multiple
                    default_value = json_field_type.get('default')
                    # only scalar defaults are carried over; dict/list defaults are dropped
                    if default_value and not isinstance(default_value, dict) and not isinstance(default_value, list):
                        default = default_value
                    # get the description from the field type
                    description = json_field_type.get('description', description)
                    # check for discriminator annotation
                    discriminator = json_field_type.get('discriminator', discriminator)
                    # convert the JSON-type field to an Avro-type field
                    avro_field_ref_type = avro_field_type = self.ensure_type(self.json_type_to_avro_type(
                        json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
                    if isinstance(avro_field_type, list):
                        avro_field_type = self.flatten_union(
                            avro_field_type)
                        avro_field_ref_type = avro_field_type
                    elif isinstance(avro_field_type, dict):
                        self.lift_dependencies_from_type(
                            avro_field_type, dependencies)
                        # if the first call gave us a global type that got added to the schema, this call will give us a reference
                        if self.is_standalone_avro_type(avro_field_type):
                            avro_field_ref_type = self.get_qualified_name(
                                avro_field_type)
                    if avro_field_type is None:
                        # None type is a problem
                        raise ValueError(
                            f"avro_field_type is None for field {field_name}")
                    if isinstance(avro_field_type, dict) and 'type' in avro_field_type and not self.is_avro_complex_type(avro_field_type):
                        # if the field type is a basic type, inline it
                        avro_field_type = avro_field_type['type']
                    field_type_list.append(avro_field_type)
                    field_ref_type_list.append(avro_field_ref_type)

                # single-entry lists collapse to the bare type
                effective_field_type = field_type_list[0] if len(
                    field_type_list) == 1 else field_type_list
                effective_field_ref_type = field_ref_type_list[0] if len(
                    field_ref_type_list) == 1 else field_ref_type_list
                # optional fields (not in 'required') are made nullable unless already nullable
                avro_field = {
                    'name': avro_name(field_name),
                    'type': self.nullable(effective_field_type) if not field_name in required_fields and 'null' not in effective_field_type else effective_field_type
                }
                if field_name != avro_name(field_name):
                    # preserve the original JSON property name for round-tripping
                    avro_field['altnames'] = { "json": field_name }
                if const:
                    avro_field['const'] = const
                if default:
                    avro_field['default'] = default
                if description:
                    avro_field['doc'] = description
                if discriminator:
                    avro_field['discriminator'] = discriminator
                # NOTE(review): this re-appends the last loop's avro_field_type after
                # effective_field_type was already computed — looks redundant; the
                # appended entry is never read afterwards. Left as-is; confirm intent.
                field_type_list.append(avro_field_type)
                avro_field_ref = {
                    'name': avro_name(field_name),
                    'type': self.nullable(effective_field_ref_type) if not field_name in required_fields and 'null' not in effective_field_ref_type else effective_field_ref_type
                }
                if description:
                    avro_field_ref['doc'] = description
                # NOTE(review): appends the field dict into a *type* list after
                # effective_field_ref_type was computed — also apparently unused.
                field_ref_type_list.append(avro_field_ref)
                # add the field to the record
                avro_record['fields'].append(avro_field)
                field_refs.append(avro_field_ref)
        elif not 'additionalProperties' in json_object and not 'patternProperties' in json_object:
            if 'type' in json_object and (json_object['type'] == 'object' or 'object' in json_object['type']) and \
                    not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                # we don't have any fields, but we have an object type, so we create a map
                avro_record = self.create_map_type(generic_type())
            elif 'type' in json_object and (json_object['type'] == 'array' or 'array' in json_object['type']) and \
                    not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                # we don't have any fields, but we have an array type, so we create a record with an 'items' field
                avro_record = self.create_array_type(
                    self.json_type_to_avro_type(
                        json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                    if 'items' in json_object
                    else generic_type())
            else:
                return json_object['type'] if 'type' in json_object else generic_type()

        extension_types = []
        prop_docs = ''
        if 'patternProperties' in json_object and isinstance(json_object['patternProperties'], dict) and len(json_object['patternProperties']) > 0:
            # pattern properties are represented as a record with field names that are the patterns
            pattern_props = json_object['patternProperties']
            for pattern_name, props in pattern_props.items():
                deps = []
                prop_type = self.ensure_type(self.json_type_to_avro_type(
                    props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
                if self.is_standalone_avro_type(prop_type):
                    # named type: register globally and depend on it by reference
                    self.lift_dependencies_from_type(prop_type, deps)
                    self.set_avro_type_value(
                        prop_type, 'namespace', namespace)
                    self.register_type(avro_schema, prop_type)
                    prop_type_ref = self.get_qualified_name(prop_type)
                    dependencies.append(prop_type_ref)
                else:
                    dependencies.extend(deps)
                    if isinstance(prop_type, str) and not prop_type in primitive_types:
                        dependencies.append(prop_type)
                if self.is_empty_type(prop_type):
                    prop_type = generic_type()
                prop_docs += f"Name pattern '{pattern_name}': [{self.get_field_type_name({'type':prop_type})}]. "
                extension_types.append(prop_type)

        if 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], bool):
            if True == json_object['additionalProperties']:
                # open content model: any extra property of any type
                prop_type = generic_type()
                extension_types.append(prop_type)
        elif 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], dict) and len(json_object['additionalProperties']) > 0:
            # additional properties are represented as a map of string to the type of the value
            additional_props = json_object['additionalProperties']
            deps = []
            values_type = self.json_type_to_avro_type(
                additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
            if self.is_standalone_avro_type(values_type):
                self.lift_dependencies_from_type(values_type, deps)
                self.set_avro_type_value(
                    values_type, 'namespace', namespace)
                self.register_type(avro_schema, values_type)
                values_type_ref = self.get_qualified_name(values_type)
                dependencies.append(values_type_ref)
            else:
                dependencies.extend(deps)
                if isinstance(values_type, str) and not values_type in primitive_types:
                    dependencies.append(values_type)
            if self.is_empty_type(values_type):
                values_type = generic_type()
            prop_docs += f"Extra properties: [{self.get_field_type_name({'type':values_type})}]. "
            extension_types.append(values_type)
        self.merge_description_into_doc(json_object, avro_record)

        avro_alternate_record = None
        if extension_types:
            # Since Avro Schema does not allow fields with dynamic names
            # to appear alongside regular fields, we will union the types of all properties with the
            # type of the additionalProperties and document this in the record's description
            json_field_types = [field['type'] for field in field_refs]
            field_type_names = [
                [field['name'], self.get_field_type_name(field)] for field in field_refs]
            field_type_name_list: str = ', '.join(
                [f"'{field[0]}': [{field[1]}]" for field in field_type_names])
            json_field_types.extend(extension_types)
            json_field_types = self.flatten_union(json_field_types)
            if len(json_field_types) == 1:
                json_field_types = json_field_types[0]
            doc = f"Alternate map: {field_type_name_list}. " if field_type_names else ''
            doc += prop_docs
            avro_alternate_record = self.create_map_type(json_field_types)
            if not self.is_empty_type(avro_record):
                # link the map back to the record it shadows
                avro_alternate_record['alternateof'] = self.get_qualified_name(avro_record)
                dependencies.append(
                    self.compose_namespace(namespace, record_name))
            avro_record['doc'] = doc if not 'doc' in avro_record else avro_record['doc'] + ', ' + doc

        if len(dependencies) > 0:
            # dedupe the list
            dependencies = list(set(dependencies))
            avro_record['dependencies'] = dependencies
    finally:
        record_stack.pop()
    if avro_alternate_record:
        if self.is_empty_type(avro_record):
            # there's no substantive content in the record,
            # so we will just return the alternate record, which
            # is a plain map
            return avro_alternate_record
        return [avro_record, avro_alternate_record]
    return avro_record
1876
+
1877
def postprocess_schema(self, avro_schema: list) -> None:
    """ Post-process the Avro Schema for cases where we need a second pass.

    Resolves every entry recorded in self.types_with_unmerged_types: for each,
    the matching node in avro_schema is found by name+namespace, its
    'unmerged_types' are merged into it, and the node is rewritten in place.
    """
    if len(self.types_with_unmerged_types) > 0:
        # work on a snapshot; the merge below may append new unmerged entries
        types_with_unmerged_types = copy.deepcopy(
            self.types_with_unmerged_types)
        self.types_with_unmerged_types = []
        for ref_type in types_with_unmerged_types:
            # find ref_type anywhere in the avro_schema graph, matching
            # on name and namespace.
            def find_fn(
                t): return 'name' in t and t['name'] == ref_type['name'] and 'namespace' in t and t['namespace'] == ref_type['namespace']
            type = find_schema_node(find_fn, avro_schema)
            if not type:
                raise ValueError(
                    f"Couldn't find type {ref_type['namespace']}.{ref_type['name']} in the Avro Schema.")
            # resolve the unmerged types
            # NOTE(review): .get() is called before the isinstance guard below;
            # a non-dict node would raise here rather than hit the 'continue'.
            local_name = type.get('name')
            if not isinstance(type, dict):
                continue
            unmerged_types = type.get('unmerged_types', [])
            if len(unmerged_types) == 0:
                # nothing to merge; just drop the marker attribute
                if 'unmerged_types' in type:
                    del type['unmerged_types']
                continue
            # the node itself (minus the marker) is the merge base
            base_type = copy.deepcopy(type)
            if 'unmerged_types' in base_type:
                del base_type['unmerged_types']
            mergeable_types = [base_type]
            deps: List[str] = []
            self.lift_dependencies_from_type(type, deps)
            for item in unmerged_types:
                # items are either qualified-name references or inline type dicts
                if isinstance(item, str):
                    found_avro_type = next(
                        (t for t in avro_schema if self.get_qualified_name(t) == item), None)
                    if not found_avro_type:
                        continue
                elif isinstance(item, dict):
                    found_avro_type = item
                self.lift_dependencies_from_type(found_avro_type, deps)
                if isinstance(found_avro_type, dict):
                    candidate = found_avro_type
                    if 'unmerged_types' in candidate:
                        del candidate['unmerged_types']
                    mergeable_types.append(candidate)
            merge_result = self.merge_avro_schemas(
                mergeable_types, avro_schema, local_name, deps)
            if isinstance(merge_result, dict):
                merge_result['dependencies'] = deps
                if 'unmerged_types' in merge_result:
                    del merge_result['unmerged_types']
            if isinstance(merge_result, list):
                # unmerged field containers have fields - wrap the union in a record
                # Keep the original name since references expect it
                self.set_avro_type_value(
                    type, 'fields', [{'name': 'value', 'type': merge_result}])
                if 'unmerged_types' in type:
                    del type['unmerged_types']
                merge_result = copy.deepcopy(type)
            # splice the merged node back into the schema graph
            set_schema_node(find_fn, merge_result, avro_schema)
1936
+
1937
def process_definition_list(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema_list):
    """Process a schema definition list, recursing into nested definition lists."""
    schema_markers = ('type', 'allOf', 'oneOf', 'anyOf', 'properties', 'enum',
                      '$ref', 'additionalProperties', 'patternProperties')
    for sub_schema_name, schema in json_schema_list.items():
        if not isinstance(schema, (dict, list)):
            # skip items that are not schema definitions or lists
            continue
        if any(marker in schema for marker in schema_markers):
            # this is a schema definition
            self.process_definition(
                json_schema, namespace, base_uri, avro_schema, record_stack, sub_schema_name, schema)
        else:
            # it's a schema definition list
            self.process_definition_list(
                json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema)
1951
+
1952
def process_definition(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema, is_root: bool = False) -> Tuple[str, str] | None:
    """ Process a schema definition.

    Converts one definition to Avro and registers the resulting type(s) in
    avro_schema. Returns the (namespace, name) of the registered type, or
    None when nothing was registered (unresolvable, duplicate, or mix-in).
    """
    avro_schema_item = None
    avro_schema_item_list = self.json_schema_object_to_avro_record(
        schema_name, schema, namespace, json_schema, base_uri, avro_schema, record_stack)
    if not isinstance(avro_schema_item_list, list) and not isinstance(avro_schema_item_list, dict):
        # skip if the record couldn't be resolved
        return None
    # the call above usually returns a single record, but we pretend it's normally a list to handle allOf/anyOf/oneOf cases
    if isinstance(avro_schema_item_list, list) and is_root and len(avro_schema_item_list) > 1:
        # if we have multiple root-level records, we will wrap them all in a single record
        root_avro_schema_item = self.create_wrapper_record(
            schema_name+'_wrapper', namespace, 'root', [], avro_schema_item_list)
        for avro_schema_item in avro_schema_item_list:
            # hoist each child's dependencies onto the wrapper
            self.merge_dependencies_into_parent(
                [], avro_schema_item, root_avro_schema_item)
        self.register_type(avro_schema, root_avro_schema_item)
        return root_avro_schema_item['namespace'], root_avro_schema_item['name']
    elif not isinstance(avro_schema_item_list, list):
        # is not a list, so we'll wrap it in a list
        avro_schema_item_list = [avro_schema_item_list]
    for avro_schema_item in avro_schema_item_list:
        # add the item to the schema if it's not already there
        if isinstance(avro_schema_item, str):
            # bare type references need no registration
            continue
        if isinstance(avro_schema_item, dict) and not 'name' in avro_schema_item:
            avro_schema_item['name'] = avro_name(schema_name)
        existing_type = next((t for t in avro_schema if t.get('name') == avro_schema_item['name'] and t.get(
            'namespace') == avro_schema_item.get('namespace')), None)
        if not existing_type:
            if (not self.is_empty_type(avro_schema_item) or 'unmerged_types' in avro_schema_item) and \
                    self.is_standalone_avro_type(avro_schema_item):
                # we only register record/enum as type. the other defs are mix-ins
                self.register_type(avro_schema, avro_schema_item)
                # NOTE(review): returns on the first registered item; any further
                # items in the list are not processed — confirm this is intended.
                return avro_schema_item['namespace'], avro_schema_item['name']
            elif is_root:
                # at the root, we will wrap the type in a record to make it top-level
                deps: List[str] = []
                self.lift_dependencies_from_type(avro_schema_item, deps)
                avro_schema_wrapper = self.create_wrapper_record(schema_name, avro_schema_item.get(
                    'namespace', namespace), avro_schema_item['name'], deps, avro_schema_item)
                if len(deps) > 0:
                    avro_schema_wrapper['dependencies'] = deps
                avro_schema_item = avro_schema_wrapper
                self.register_type(avro_schema, avro_schema_item)
                return avro_schema_item['namespace'], avro_schema_item['name']
    return None
1999
+
2000
def id_to_avro_namespace(self, id: str) -> str:
    """Convert a JSON schema '$id' URL into an Avro namespace.

    Host labels are reversed (example.com -> com.example) and the path
    segments are reversed and appended, mirroring Java-style package names.

    Args:
        id: The schema's identifying URL.

    Returns:
        The dotted Avro namespace derived from the URL.
    """
    parsed_url = urlparse(id)
    # strip only the trailing file extension: rsplit with maxsplit=1.
    # (rsplit('.') without maxsplit would cut the path at the FIRST dot,
    # mangling paths whose directories contain dots.)
    path = parsed_url.path.rsplit('.', 1)[0]
    path_segments = path.strip('/').replace('-', '_').split('/')
    reversed_path_segments = reversed(path_segments)
    namespace_suffix = self.compose_namespace(*reversed_path_segments)
    # default to an empty prefix so URLs without a hostname (e.g. urn-style
    # or relative ids) don't raise a NameError below
    namespace_prefix = ''
    if parsed_url.hostname:
        namespace_prefix = self.compose_namespace(
            *reversed(parsed_url.hostname.split('.')))
    namespace = self.compose_namespace(namespace_prefix, namespace_suffix)
    return namespace
2013
+
2014
def jsons_to_avro(self, json_schema: dict | list, namespace: str, base_uri: str) -> list | dict | str:
    """Convert a JSON-schema to an Avro-schema.

    Processes 'definitions'/'$defs' first, then the root-level schema, runs
    the post-processing pass, and finally either splits, inlines, or sorts
    the resulting types depending on configuration and document shape.

    Args:
        json_schema: Parsed JSON schema document (dict) or definition list.
        namespace: Target Avro namespace.
        base_uri: Base URI used for $ref resolution; a URL fragment selects a
            single definition to return.

    Returns:
        A list of Avro types, or a single type dict when a root/fragment is
        resolved and inlined.
    """
    avro_schema: List[dict] = []
    record_stack: List[str] = []

    parsed_url = urlparse(base_uri)
    schema_name = self.root_class_name

    if isinstance(json_schema, dict) and ('definitions' in json_schema or '$defs' in json_schema):
        # this is a swagger file or has a 'definitions' block
        json_schema_defs = json_schema.get(
            'definitions', json_schema.get('$defs', []))
        for def_schema_name, schema in json_schema_defs.items():
            if 'type' in schema or 'allOf' in schema or 'oneOf' in schema or 'anyOf' in schema or 'properties' in schema or 'enum' in schema or '$ref' in schema or 'additionalProperties' in schema or 'patternProperties' in schema:
                # this is a schema definition
                self.process_definition(
                    json_schema, namespace, base_uri, avro_schema, record_stack, def_schema_name, schema)
            else:
                # it's a schema definition list
                self.process_definition_list(
                    json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema.copy())
    elif isinstance(json_schema, list):
        # this is a schema definition list
        self.process_definition_list(
            json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema)

    root_namespace = None
    root_name = None
    # NOTE(review): due to operator precedence, isinstance() only guards the
    # first 'type in' test; the 'allOf'/'oneOf'/... membership tests also run
    # when json_schema is a list — confirm this is intended.
    if isinstance(json_schema, dict) and 'type' in json_schema or 'allOf' in json_schema or 'oneOf' in json_schema or 'anyOf' in json_schema or 'properties' in json_schema:
        # this is a schema definition
        if isinstance(json_schema, dict) and '$ref' in json_schema:
            # if there is a $ref at the root level, resolve the reference and merge it with the current schema
            ref = json_schema['$ref']
            if ref:
                ref_schema, json_doc = self.resolve_reference(
                    json_schema, base_uri, json_schema)
                json_schema = self.merge_json_schemas(
                    [json_schema, ref_schema], intersect=False)
        root_info = self.process_definition(
            json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema, is_root=True)
        if root_info:
            root_namespace, root_name = root_info

    # postprocessing pass
    self.postprocess_schema(avro_schema)

    if isinstance(avro_schema, list) and len(avro_schema) > 1 and self.split_top_level_records:
        # emit each top-level record as a self-contained schema
        new_avro_schema = []
        for item in avro_schema:
            if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                # we need to make a copy since the inlining operation shuffles types
                schema_copy = copy.deepcopy(avro_schema)
                # find the item with the same name and namespace in the copy
                found_item = next((t for t in schema_copy if t.get(
                    'name') == item['name'] and t.get('namespace') == item.get('namespace')), None)
                if found_item:
                    # inline all dependencies of the item
                    inline_dependencies_of(schema_copy, found_item)
                    new_avro_schema.append(found_item)
        avro_schema = new_avro_schema
    else:
        # sort the records by their dependencies
        if root_name and root_namespace and not ('definitions' in json_schema or '$defs' in json_schema):
            # inline all dependencies if this is a doc with only a root level definition
            root = find_schema_node(
                lambda t: 'name' in t and t['name'] == root_name and 'namespace' in t and t['namespace'] == root_namespace, avro_schema)
            inline_dependencies_of(avro_schema, root)
            return root
        else:
            avro_schema = sort_messages_by_dependencies(avro_schema)

    if parsed_url.fragment and isinstance(json_schema, dict):
        # if the fragment is present in the URL, it's a reference to a schema definition
        # so we will resolve that reference and return a type
        self.imported_types.clear()
        fragment_schema: List[dict] = []
        json_pointer = parsed_url.fragment
        schema_name = parsed_url.fragment.split('/')[-1]
        schema = jsonpointer.resolve_pointer(json_schema, json_pointer)
        avro_schema_item = self.json_schema_object_to_avro_record(
            schema_name, schema, namespace, json_schema, base_uri, fragment_schema, record_stack)
        if avro_schema_item:
            # we roll all the types into this record as the top level type
            inline_dependencies_of(avro_schema, avro_schema_item)
            return avro_schema_item

    return avro_schema
2101
+
2102
+ def convert_jsons_to_avro(self, json_schema_file_path: str, avro_schema_path: str, namespace: str | None = None, utility_namespace: str | None = None) -> list | dict | str:
2103
+ """Convert JSON schema file to Avro schema file."""
2104
+ # turn the file path into a file URI if it's not a URI already
2105
+ parsed_url = urlparse(json_schema_file_path)
2106
+ if not parsed_url.hostname and not parsed_url.scheme == 'file':
2107
+ json_schema_file_path = 'file://' + json_schema_file_path
2108
+ parsed_url = urlparse(json_schema_file_path)
2109
+ content = self.fetch_content(parsed_url.geturl())
2110
+ json_schema = json.loads(content)
2111
+
2112
+ if not namespace:
2113
+ namespace = parsed_url.geturl().replace('\\', '/').replace('-',
2114
+ '_').split('/')[-1].split('.')[0]
2115
+ # get the $id if present
2116
+ if '$id' in json_schema:
2117
+ namespace = self.id_to_avro_namespace(json_schema['$id'])
2118
+ self.root_namespace = namespace
2119
+ if utility_namespace:
2120
+ self.utility_namespace = utility_namespace
2121
+ else:
2122
+ self.utility_namespace = self.root_namespace + '.utility'
2123
+
2124
+ # drop the file name from the parsed URL to get the base URI
2125
+ avro_schema = self.jsons_to_avro(
2126
+ json_schema, namespace, parsed_url.geturl())
2127
+ if len(avro_schema) == 1:
2128
+ avro_schema = avro_schema[0]
2129
+
2130
+ # create the directory for the Avro schema file if it doesn't exist
2131
+ dir = os.path.dirname(
2132
+ avro_schema_path) if not self.split_top_level_records else avro_schema_path
2133
+ if dir != '' and not os.path.exists(dir):
2134
+ os.makedirs(dir, exist_ok=True)
2135
+ if self.split_top_level_records:
2136
+ # if we are splitting top level records, we will create a file for each record
2137
+ for item in avro_schema:
2138
+ if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
2139
+ schema_file_path = os.path.join(
2140
+ dir, item['name'] + '.avsc')
2141
+ with open(schema_file_path, 'w') as avro_file:
2142
+ json.dump(item, avro_file, indent=4)
2143
+ else:
2144
+ with open(avro_schema_path, 'w') as avro_file:
2145
+ json.dump(avro_schema, avro_file, indent=4)
2146
+ return avro_schema
2147
+
2148
+
2149
+ def convert_jsons_to_avro(json_schema_file_path: str, avro_schema_path: str, namespace: str = '', utility_namespace='', root_class_name='', split_top_level_records=False) -> list | dict | str:
2150
+ """Convert JSON schema file to Avro schema file."""
2151
+
2152
+ if not json_schema_file_path:
2153
+ raise ValueError('JSON schema file path is required')
2154
+ if not json_schema_file_path.startswith('http'):
2155
+ if not os.path.exists(json_schema_file_path):
2156
+ raise FileNotFoundError(f'JSON schema file {json_schema_file_path} not found')
2157
+
2158
+ try:
2159
+ converter = JsonToAvroConverter()
2160
+ converter.split_top_level_records = split_top_level_records
2161
+ if root_class_name:
2162
+ converter.root_class_name = root_class_name
2163
+ return converter.convert_jsons_to_avro(json_schema_file_path, avro_schema_path, namespace, utility_namespace)
2164
+ except Exception as e:
2165
+ print(
2166
+ f'Error converting JSON {json_schema_file_path} to Avro: {e.args[0]}')
2167
+ return []