structurize 2.16.2__py3-none-any.whl → 2.16.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. avrotize/__init__.py +63 -63
  2. avrotize/__main__.py +5 -5
  3. avrotize/_version.py +34 -34
  4. avrotize/asn1toavro.py +160 -160
  5. avrotize/avrotize.py +152 -152
  6. avrotize/avrotocpp.py +483 -483
  7. avrotize/avrotocsharp.py +992 -992
  8. avrotize/avrotocsv.py +121 -121
  9. avrotize/avrotodatapackage.py +173 -173
  10. avrotize/avrotodb.py +1383 -1383
  11. avrotize/avrotogo.py +476 -476
  12. avrotize/avrotographql.py +197 -197
  13. avrotize/avrotoiceberg.py +210 -210
  14. avrotize/avrotojava.py +1023 -1023
  15. avrotize/avrotojs.py +250 -250
  16. avrotize/avrotojsons.py +481 -481
  17. avrotize/avrotojstruct.py +345 -345
  18. avrotize/avrotokusto.py +363 -363
  19. avrotize/avrotomd.py +137 -137
  20. avrotize/avrotools.py +168 -168
  21. avrotize/avrotoparquet.py +208 -208
  22. avrotize/avrotoproto.py +358 -358
  23. avrotize/avrotopython.py +622 -622
  24. avrotize/avrotorust.py +435 -435
  25. avrotize/avrotots.py +598 -598
  26. avrotize/avrotoxsd.py +344 -344
  27. avrotize/commands.json +2493 -2433
  28. avrotize/common.py +828 -828
  29. avrotize/constants.py +4 -4
  30. avrotize/csvtoavro.py +131 -131
  31. avrotize/datapackagetoavro.py +76 -76
  32. avrotize/dependency_resolver.py +348 -348
  33. avrotize/jsonstoavro.py +1698 -1698
  34. avrotize/jsonstostructure.py +2642 -2642
  35. avrotize/jstructtoavro.py +878 -878
  36. avrotize/kstructtoavro.py +93 -93
  37. avrotize/kustotoavro.py +455 -455
  38. avrotize/parquettoavro.py +157 -157
  39. avrotize/proto2parser.py +497 -497
  40. avrotize/proto3parser.py +402 -402
  41. avrotize/prototoavro.py +382 -382
  42. avrotize/structuretocsharp.py +2005 -2005
  43. avrotize/structuretojsons.py +498 -498
  44. avrotize/structuretopython.py +772 -772
  45. avrotize/structuretots.py +653 -0
  46. avrotize/xsdtoavro.py +413 -413
  47. structurize-2.16.6.dist-info/METADATA +107 -0
  48. structurize-2.16.6.dist-info/RECORD +52 -0
  49. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/licenses/LICENSE +200 -200
  50. structurize-2.16.2.dist-info/METADATA +0 -805
  51. structurize-2.16.2.dist-info/RECORD +0 -51
  52. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/WHEEL +0 -0
  53. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/entry_points.txt +0 -0
  54. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/top_level.txt +0 -0
avrotize/jsonstoavro.py CHANGED
@@ -1,1698 +1,1698 @@
1
- """ JSON to Avro schema converter. """
2
-
3
- # pylint: disable=too-many-lines, line-too-long, too-many-branches, too-many-statements, too-many-locals, too-many-nested-blocks, too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-boolean-expressions
4
-
5
- import json
6
- import os
7
- import copy
8
- import urllib
9
- from urllib.parse import ParseResult, urlparse, unquote
10
- from typing import Any, Dict, List, Tuple
11
- import jsonpointer
12
- from jsonpointer import JsonPointerException
13
- import requests
14
-
15
- from avrotize.common import avro_name, avro_namespace, find_schema_node, generic_type, set_schema_node
16
- from avrotize.dependency_resolver import inline_dependencies_of, sort_messages_by_dependencies
17
-
18
- primitive_types = ['null', 'string', 'int',
19
- 'long', 'float', 'double', 'boolean', 'bytes']
20
-
21
-
22
- class JsonToAvroConverter:
23
- """
24
- Converts JSON schema to Avro schema.
25
-
26
- Attributes:
27
- imported_types: A dictionary of imported type schemas.
28
- root_namespace: The namespace for the root schema.
29
- max_recursion_depth: The maximum recursion depth.
30
- types_with_unmerged_types: A list of types with unmerged types.
31
- content_cache: A dictionary for caching fetched URLs.
32
- utility_namespace: The namespace for utility types.
33
- maximize_compatiblity: A flag to maximize compatibility.
34
-
35
- """
36
-
37
def __init__(self) -> None:
    """Initialize the converter with its default settings."""
    # caches for schemas pulled in from external documents
    self.imported_types: Dict[Any, Any] = {}
    self.content_cache: Dict[str, str] = {}
    # namespaces used when emitting Avro types
    self.root_namespace = 'example.com'
    self.utility_namespace = 'utility.vasters.com'
    # conversion behavior knobs
    self.max_recursion_depth = 40
    self.types_with_unmerged_types: List[dict] = []
    self.split_top_level_records = False
    self.root_class_name = 'document'
46
-
47
def is_empty_type(self, avro_type):
    """
    Check if the Avro type is an empty type.

    A type counts as empty when it is a zero-length value, a union whose
    branches are all empty, or a dict that lacks a 'type' key or lacks the
    member list its kind requires (fields/symbols/items/values).

    Parameters:
        avro_type (any): The Avro type to check.

    Returns:
        bool: True if the Avro type is empty, False otherwise.
    """
    if len(avro_type) == 0:
        return True
    if isinstance(avro_type, list):
        # a union is empty only when every branch is empty
        return all(self.is_empty_type(branch) for branch in avro_type)
    if isinstance(avro_type, dict):
        if 'type' not in avro_type:
            return True
        kind = avro_type['type']
        if kind == 'record' and not avro_type.get('fields'):
            return True
        if kind == 'enum' and not avro_type.get('symbols'):
            return True
        if kind == 'array' and not avro_type.get('items'):
            return True
        if kind == 'map' and not avro_type.get('values'):
            return True
    return False
70
-
71
def is_empty_json_type(self, json_type):
    """
    Check if the JSON type is an empty type.

    Parameters:
        json_type (any): The JSON type to check.

    Returns:
        bool: True if the JSON type is empty, False otherwise.
    """
    if len(json_type) == 0:
        return True
    if isinstance(json_type, list):
        # a list of types is empty only when every entry is empty
        return all(self.is_empty_json_type(entry) for entry in json_type)
    if isinstance(json_type, dict) and 'type' not in json_type:
        return True
    return False
89
-
90
def flatten_union(self, type_list: list) -> list:
    """
    Flatten the list of types in a union into a single list.

    Nested lists are expanded in place and duplicates are dropped; when
    several array (or map) entries occur, only the first stays in the
    result while the rest are folded into it via merge_avro_schemas.

    Args:
        type_list (list): The list of types in a union.

    Returns:
        list: The flattened list of types.
    """
    expanded = []
    for entry in type_list:
        if isinstance(entry, list):
            for inner in self.flatten_union(entry):
                if inner not in expanded:
                    expanded.append(inner)
        elif entry not in expanded:
            expanded.append(entry)
    # consolidate array and map type instances into at most one of each
    merged_array = None
    merged_map = None
    result = []
    for entry in expanded:
        if isinstance(entry, dict) and entry.get('type') == 'array' and 'items' in entry:
            if not merged_array:
                merged_array = entry
                result.append(entry)
            else:
                merged_array = self.merge_avro_schemas([merged_array, entry], [])
        elif isinstance(entry, dict) and entry.get('type') == 'map' and 'values' in entry:
            if not merged_map:
                merged_map = entry
                result.append(entry)
            else:
                merged_map = self.merge_avro_schemas([merged_map, entry], [])
        elif entry not in result:
            result.append(entry)
    return result
130
-
131
- # pylint: disable=dangerous-default-value
132
- def merge_avro_schemas(self, schemas: list, avro_schemas: list, type_name: str | None = None, deps: List[str] = []) -> str | list | dict:
133
- """Merge multiple Avro type schemas into one."""
134
-
135
- def split_merge(schema1, schema2, schema_list, offset):
136
- """ return the continuing schema merges of incompatible schemas """
137
- remaining_schemas = schema_list[offset +
138
- 1:] if len(schema_list) > offset else []
139
- if isinstance(schema2, dict) and 'dependencies' in schema2:
140
- deps.extend(schema2['dependencies'])
141
- del schema2['dependencies']
142
- if isinstance(schema1, dict) and 'dependencies' in schema1:
143
- deps.extend(schema1['dependencies'])
144
- del schema1['dependencies']
145
- schema1_merged = self.merge_avro_schemas(
146
- [schema2] + remaining_schemas, avro_schemas, type_name, deps)
147
- schema2_merged = self.merge_avro_schemas(
148
- [schema1] + remaining_schemas, avro_schemas, type_name, deps)
149
- if not self.is_empty_type(schema1_merged) and not self.is_empty_type(schema2_merged):
150
- return self.flatten_union([schema1_merged, schema2_merged])
151
- else:
152
- if not self.is_empty_type(schema1_merged):
153
- return schema1_merged
154
- if not self.is_empty_type(schema2_merged):
155
- return schema2_merged
156
- # if both are empty, we'll return an empty record
157
- return {'type': 'record', 'fields': []}
158
-
159
- merged_schema: dict = {}
160
- if len(schemas) == 1:
161
- return schemas[0]
162
- if type_name:
163
- self.set_avro_type_value(merged_schema, 'name', type_name)
164
- for i, schema in enumerate(schemas):
165
- schema = copy.deepcopy(schema)
166
- if isinstance(schema, dict) and 'dependencies' in schema:
167
- deps1: List[str] = merged_schema.get('dependencies', [])
168
- deps1.extend(schema['dependencies'])
169
- merged_schema['dependencies'] = deps1
170
- if (isinstance(schema, list) or isinstance(schema, dict)) and len(schema) == 0:
171
- continue
172
- if isinstance(schema, str):
173
- sch = next(
174
- (s for s in avro_schemas if s.get('name') == schema), None)
175
- if sch:
176
- merged_schema.update(sch)
177
- else:
178
- merged_schema['type'] = schema
179
- elif isinstance(schema, list):
180
- # the incoming schema is a list, so it's a union
181
- if 'type' not in merged_schema:
182
- merged_schema['type'] = schema
183
- else:
184
- if isinstance(merged_schema['type'], list):
185
- merged_schema['type'].extend(schema)
186
- else:
187
- if isinstance(merged_schema['type'], str):
188
- if merged_schema['type'] == 'record' or merged_schema['type'] == 'enum' or merged_schema['type'] == 'fixed' \
189
- or merged_schema['type'] == 'map' or merged_schema['type'] == 'array':
190
- return split_merge(merged_schema, schema, schemas, i)
191
- else:
192
- merged_schema['type'] = [merged_schema['type']]
193
- else:
194
- merged_schema['type'].extend(schema)
195
- elif schema and ('type' not in schema or 'type' not in merged_schema):
196
- merged_schema.update(schema)
197
- elif schema:
198
- if 'type' in merged_schema and schema['type'] != merged_schema['type']:
199
- return split_merge(merged_schema, schema, schemas, i)
200
- if not type_name:
201
- self.set_avro_type_value(merged_schema, 'name', avro_name(
202
- merged_schema.get('name', '') + schema.get('name', '')))
203
- if 'fields' in schema:
204
- if 'fields' in merged_schema:
205
- for field in schema['fields']:
206
- if field not in merged_schema['fields']:
207
- merged_schema['fields'].append(field)
208
- else:
209
- merged_schema_field = next(
210
- f for f in merged_schema['fields'] if f.get('name') == field.get('name'))
211
- if merged_schema_field['type'] != field['type']:
212
- merged_schema_field['type'] = [
213
- field['type'], merged_schema_field['type']]
214
- if 'doc' in field and 'doc' not in merged_schema_field:
215
- merged_schema_field['doc'] = field['doc']
216
- else:
217
- merged_schema['fields'] = schema['fields']
218
- if self.is_avro_complex_type(merged_schema) and 'namespace' in merged_schema:
219
- if merged_schema['type'] in ['array', 'map']:
220
- del merged_schema['namespace']
221
- return merged_schema
222
-
223
- def merge_json_schemas(self, json_schemas: list[dict], intersect: bool = False) -> dict:
224
- """
225
- Merge multiple JSON schemas into one.
226
-
227
- Args:
228
- json_schemas (list[dict]): A list of JSON schemas to be merged.
229
- intersect (bool, optional): If True, only keep the intersection of the required fields. Defaults to False.
230
-
231
- Returns:
232
- dict: The merged JSON schema.
233
- """
234
-
235
- def merge_structures(schema1: dict, schema2: dict) -> dict | list:
236
- """ merge two JSON dicts recursively """
237
- if 'type' in schema1 and 'type' in schema2 and schema1['type'] != schema2['type']:
238
- return [schema1, schema2]
239
- schema1 = copy.deepcopy(schema1)
240
- for key in schema2:
241
- if key not in schema1:
242
- schema1[key] = schema2[key]
243
- elif isinstance(schema1[key], dict) and isinstance(schema2[key], dict):
244
- schema1[key] = merge_structures(schema1[key], schema2[key])
245
- elif isinstance(schema1[key], list) and isinstance(schema2[key], list):
246
- schema1[key].extend(schema2[key])
247
- elif schema1[key] == schema2[key]:
248
- continue
249
- else:
250
- if isinstance(schema1[key], list):
251
- if schema2[key] not in schema1[key]:
252
- schema1[key].append(schema2[key])
253
- else:
254
- schema1[key] = [schema1[key], schema2[key]]
255
- return schema1
256
-
257
- merged_type: dict = {}
258
-
259
- for json_schema in json_schemas:
260
- if 'type' not in json_schema or 'type' not in merged_type:
261
- for key in json_schema:
262
- if not key in merged_type:
263
- merged_type[key] = copy.deepcopy(json_schema[key])
264
- else:
265
- if key == 'required':
266
- merged_type[key] = list(
267
- set(merged_type[key]).union(set(json_schema[key])))
268
- if key == 'name' or key == 'title' or key == 'description':
269
- merged_type[key] = merged_type[key] + \
270
- json_schema[key]
271
- elif isinstance(merged_type[key], dict):
272
- merged_type[key] = merge_structures(
273
- merged_type[key], copy.deepcopy(json_schema[key]))
274
- elif isinstance(merged_type[key], list) and isinstance(json_schema[key], list):
275
- for item in json_schema[key]:
276
- if item not in merged_type[key]:
277
- merged_type[key].append(item)
278
- else:
279
- if merged_type[key] is None:
280
- merged_type[key] = json_schema[key]
281
- else:
282
- merged_type[key] = [merged_type[key],
283
- copy.deepcopy(json_schema[key])]
284
- else:
285
- if 'type' in merged_type and json_schema['type'] != merged_type['type']:
286
- if isinstance(merged_type['type'], str):
287
- merged_type['type'] = [merged_type['type']]
288
- merged_type['type'].append(json_schema['type'])
289
- if 'required' in json_schema:
290
- if 'required' in merged_type:
291
- merged_type['required'] = list(
292
- set(merged_type['required']).union(set(json_schema['required'])))
293
- else:
294
- merged_type['required'] = json_schema['required']
295
- if 'name' in json_schema:
296
- if 'name' in merged_type:
297
- merged_type['name'] = merged_type.get(
298
- 'name', '') + json_schema['name']
299
- else:
300
- merged_type['name'] = json_schema['name']
301
- if 'properties' in json_schema:
302
- if 'properties' in merged_type:
303
- for prop in json_schema['properties']:
304
- if prop in merged_type['properties']:
305
- merged_type['properties'][prop] = merge_structures(
306
- merged_type['properties'][prop], copy.deepcopy(json_schema['properties'][prop]))
307
- else:
308
- merged_type['properties'][prop] = json_schema['properties'][prop]
309
- else:
310
- merged_type['properties'] = json_schema['properties']
311
- if 'enum' in json_schema:
312
- if 'enum' in merged_type:
313
- merged_type['enum'] = list(
314
- set(merged_type['enum']).union(set(json_schema['enum'])))
315
- else:
316
- merged_type['enum'] = json_schema['enum']
317
- if 'format' in json_schema:
318
- if 'format' in merged_type:
319
- merged_type['format'] = merged_type['format'] + \
320
- json_schema['format']
321
- else:
322
- merged_type['format'] = json_schema['format']
323
-
324
- if intersect:
325
- # only keep the intersection of the required fields
326
- if 'required' in merged_type:
327
- new_required = merged_type['required']
328
- for json_schema in json_schemas:
329
- new_required = list(set(new_required).intersection(
330
- set(json_schema.get('required', []))))
331
- merged_type['required'] = new_required
332
-
333
- return merged_type
334
-
335
- def ensure_type(self, type: dict | str | list) -> dict | str | list:
336
- """
337
- Ensures that the given type is valid by adding a 'type' field if it is missing.
338
-
339
- Args:
340
- type (dict | str | list): The type to ensure.
341
-
342
- Returns:
343
- dict | str | list: The ensured type.
344
- """
345
- if isinstance(type, str) or isinstance(type, list) or 'type' in type:
346
- return type
347
-
348
- type['type'] = generic_type()
349
- return type
350
-
351
- def json_schema_primitive_to_avro_type(self, json_primitive: str | list, format: str | None, enum: list | None, record_name: str, field_name: str, namespace: str, dependencies: list) -> str | dict[str, Any] | list:
352
- """
353
- Convert a JSON-schema primitive type to Avro primitive type.
354
-
355
- Args:
356
- json_primitive (str | list): The JSON-schema primitive type to be converted.
357
- format (str | None): The format of the JSON primitive type, if applicable.
358
- enum (list | None): The list of enum values, if applicable.
359
- record_name (str): The name of the record.
360
- field_name (str): The name of the field.
361
- namespace (str): The namespace of the Avro type.
362
- dependencies (list): The list of dependencies.
363
-
364
- Returns:
365
- str | dict[str,Any] | list: The converted Avro primitive type.
366
-
367
- """
368
- if isinstance(json_primitive, list):
369
- if enum:
370
- json_primitive = 'string'
371
- else:
372
- union = []
373
- for item in json_primitive:
374
- enum2 = item.get('enum') if isinstance(
375
- item, dict) else None
376
- format2 = item.get('format') if isinstance(
377
- item, dict) else None
378
- avro_primitive = self.json_schema_primitive_to_avro_type(
379
- item, format2, enum2, record_name, field_name, self.compose_namespace(namespace, record_name, field_name), dependencies)
380
- union.append(avro_primitive)
381
- return union
382
-
383
- if json_primitive == 'string':
384
- avro_primitive = 'string'
385
- elif json_primitive == 'integer':
386
- avro_primitive = 'int'
387
- if format == 'int64':
388
- avro_primitive = 'long'
389
- elif json_primitive == 'number':
390
- avro_primitive = 'float'
391
- elif json_primitive == 'boolean':
392
- avro_primitive = 'boolean'
393
- elif not format:
394
- if isinstance(json_primitive, str):
395
- dependencies.append(json_primitive)
396
- avro_primitive = json_primitive
397
-
398
- # if you've got { 'type': 'string', 'format': ['date-time', 'duration'] }, I'm sorry
399
- if format and isinstance(format, str):
400
- if format in ('date-time', 'date'):
401
- avro_primitive = {'type': 'int', 'logicalType': 'date'}
402
- elif format in ('time'):
403
- avro_primitive = {'type': 'int', 'logicalType': 'time-millis'}
404
- elif format in ('duration'):
405
- avro_primitive = {'type': 'fixed',
406
- 'size': 12, 'logicalType': 'duration'}
407
- elif format in ('uuid'):
408
- avro_primitive = {'type': 'string', 'logicalType': 'uuid'}
409
-
410
- return avro_primitive
411
-
412
- def fetch_content(self, url: str | ParseResult):
413
- """
414
- Fetches the content from the specified URL.
415
-
416
- Args:
417
- url (str or ParseResult): The URL to fetch the content from.
418
-
419
- Returns:
420
- str: The fetched content.
421
-
422
- Raises:
423
- requests.RequestException: If there is an error while making the HTTP request.
424
- Exception: If there is an error while reading the file.
425
-
426
- """
427
- # Parse the URL to determine the scheme
428
- if isinstance(url, str):
429
- parsed_url = urlparse(url)
430
- else:
431
- parsed_url = url
432
-
433
- if parsed_url.geturl() in self.content_cache:
434
- return self.content_cache[parsed_url.geturl()]
435
- scheme = parsed_url.scheme
436
-
437
- # Handle HTTP and HTTPS URLs
438
- if scheme in ['http', 'https']:
439
- response = requests.get(url if isinstance(
440
- url, str) else parsed_url.geturl(), timeout=30)
441
- # Raises an HTTPError if the response status code is 4XX/5XX
442
- response.raise_for_status()
443
- self.content_cache[parsed_url.geturl()] = response.text
444
- return response.text
445
-
446
- # Handle file URLs
447
- elif scheme == 'file':
448
- # Remove the leading 'file://' from the path for compatibility
449
- file_path = parsed_url.netloc
450
- if not file_path:
451
- file_path = parsed_url.path
452
- # On Windows, a file URL might start with a '/' but it's not part of the actual path
453
- if os.name == 'nt' and file_path.startswith('/'):
454
- file_path = file_path[1:]
455
- with open(file_path, 'r', encoding='utf-8') as file:
456
- text = file.read()
457
- self.content_cache[parsed_url.geturl()] = text
458
- return text
459
- else:
460
- raise NotImplementedError(f'Unsupported URL scheme: {scheme}')
461
-
462
def resolve_reference(self, json_type: dict, base_uri: str, json_doc: dict) -> Tuple[dict, dict]:
    """
    Resolve a JSON Pointer reference or a JSON $ref reference.

    Args:
        json_type (dict): The JSON type containing the reference.
        base_uri (str): The base URI of the JSON document.
        json_doc (dict): The JSON document containing the reference.

    Returns:
        Tuple[dict, dict]: A tuple containing the resolved JSON schema and the
        original JSON schema document. Falls back to (json_type, json_doc)
        when the reference cannot be resolved to anything.

    Raises:
        Exception: If there is an error decoding JSON from the reference.
        Exception: If there is an error resolving the JSON Pointer reference.
    """
    try:
        ref = json_type['$ref']
        content = None
        url = urlparse(ref)
        if url.scheme:
            # absolute reference: fetch the external document
            content = self.fetch_content(ref)
        elif url.path:
            # relative reference: resolve against the base document URI
            file_uri = self.compose_uri(base_uri, url)
            content = self.fetch_content(file_uri)
        if content:
            try:
                json_schema_doc = json_schema = json.loads(content)
                # resolve the JSON Pointer reference, if any
                if url.fragment:
                    json_schema = jsonpointer.resolve_pointer(
                        json_schema, url.fragment)
                return json_schema, json_schema_doc
            except json.JSONDecodeError as decode_err:
                # BUGFIX: chain the original decode error for debuggability
                raise Exception(f'Error decoding JSON from {ref}') from decode_err

        if url.fragment:
            # fragment-only reference: resolve within the current document
            json_pointer = unquote(url.fragment)
            ref_schema = jsonpointer.resolve_pointer(
                json_doc, json_pointer)
            if ref_schema:
                return ref_schema, json_doc
    except JsonPointerException as e:
        # BUGFIX: chain the pointer error (previously bound but unused)
        raise Exception(
            f'Error resolving JSON Pointer reference for {base_uri}') from e
    return json_type, json_doc
509
-
510
def compose_uri(self, base_uri, url):
    """Combine a base URI with a (possibly relative) URL.

    Args:
        base_uri (str): The URI of the referencing document.
        url (str | ParseResult): The reference target.

    Returns:
        str: The absolute URI of the target. Absolute URLs are returned
        unchanged; an empty URL returns the base URI; file: bases are
        resolved against the base document's directory.
    """
    if isinstance(url, str):
        url = urlparse(url)
    if url.scheme:
        # already absolute
        return url.geturl()
    if not url.path and not url.netloc:
        # empty reference points at the base document itself
        return base_uri
    if base_uri.startswith('file'):
        parsed_file_uri = urlparse(base_uri)
        dir = os.path.dirname(
            parsed_file_uri.netloc if parsed_file_uri.netloc else parsed_file_uri.path)
        filename = os.path.join(dir, url.path)
        # BUGFIX: the computed filename was dropped from the f-string,
        # producing a constant 'file://...' URI instead of the real path
        file_uri = f'file://{filename}'
    else:
        # combine the base URI with the URL
        file_uri = urllib.parse.urljoin(base_uri, url.geturl())
    return file_uri
527
-
528
def get_field_type_name(self, field: dict) -> str:
    """Return a readable name for the Avro type carried by *field*."""
    field_type = field['type']
    if isinstance(field_type, str):
        return field_type
    if isinstance(field_type, list):
        # a union: name each branch, falling back to 'union' for odd shapes
        parts = []
        for option in field_type:
            if isinstance(option, str):
                parts.append(option)
            elif isinstance(option, dict):
                parts.append(self.get_field_type_name(option))
            else:
                parts.append('union')
        return ', '.join(parts)
    if isinstance(field_type, dict) and 'type' in field_type:
        return field_type['type']
    return 'union'
544
-
545
- def json_type_to_avro_type(self, json_type: str | dict, record_name: str, field_name: str, namespace: str, dependencies: list, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth=1) -> dict | list | str:
546
- """Convert a JSON type to Avro type."""
547
-
548
- try:
549
- if recursion_depth >= self.max_recursion_depth:
550
- print(
551
- f'WARNING: Maximum recursion depth reached for {record_name} at field {field_name}')
552
- return generic_type()
553
-
554
- avro_type: list | dict | str = {}
555
- local_name = avro_name(field_name if field_name else record_name)
556
- hasAnyOf = isinstance(json_type, dict) and 'anyOf' in json_type
557
-
558
- if isinstance(json_type, dict):
559
-
560
- json_object_type = json_type.get('type')
561
- if isinstance(json_object_type, list):
562
- # if the 'type' is a list, we map it back to a string
563
- # if the list has only one item or if the list has two items
564
- # and one of them is 'null'
565
- # otherwise, we will construct and inject a oneOf type
566
- # and split the type
567
- if len(json_object_type) == 1:
568
- json_object_type = json_object_type[0]
569
- elif len(json_object_type) == 2 and 'null' in json_object_type:
570
- if json_object_type[0] == 'null':
571
- json_object_type = json_object_type[1]
572
- else:
573
- json_object_type = json_object_type[0]
574
- else:
575
- oneof = []
576
- for option in json_object_type:
577
- if not option == 'null':
578
- oneof.append({
579
- 'type': option
580
- })
581
- if len(oneof) > 0:
582
- del json_type['type']
583
- json_type['oneOf'] = oneof
584
-
585
- if 'if' in json_type or 'then' in json_type or 'else' in json_type or 'dependentSchemas' in json_type or 'dependentRequired' in json_type:
586
- print(
587
- 'WARNING: Conditional schema is not supported and will be ignored.')
588
- if 'if' in json_type:
589
- del json_type['if']
590
- if 'then' in json_type:
591
- del json_type['then']
592
- if 'else' in json_type:
593
- del json_type['else']
594
- if 'dependentSchemas' in json_type:
595
- del json_type['dependentSchemas']
596
- if 'dependentRequired' in json_type:
597
- del json_type['dependentRequired']
598
-
599
- base_type = json_type.copy()
600
- if 'oneOf' in base_type:
601
- del base_type['oneOf']
602
- if 'anyOf' in base_type:
603
- del base_type['anyOf']
604
- if 'allOf' in base_type:
605
- del base_type['allOf']
606
- json_types = []
607
-
608
- if 'allOf' in json_type:
609
- # if the json type is an allOf, we merge all types into one
610
- # this may be lossy if aspects of the types overlap but differ
611
- type_list = [copy.deepcopy(base_type)]
612
- for allof_option in json_type['allOf']:
613
- while isinstance(allof_option, dict) and '$ref' in allof_option:
614
- resolved_json_type, resolved_schema = self.resolve_reference(
615
- allof_option, base_uri, json_schema)
616
- del allof_option['$ref']
617
- allof_option = self.merge_json_schemas(
618
- [allof_option, resolved_json_type])
619
- type_list.append(copy.deepcopy(allof_option))
620
- merged_type = self.merge_json_schemas(
621
- type_list, intersect=False)
622
- json_types.append(merged_type)
623
-
624
- if 'oneOf' in json_type:
625
- # if the json type is a oneOf, we create a type union of all types
626
- if len(json_types) == 0:
627
- type_to_process = copy.deepcopy(base_type)
628
- else:
629
- type_to_process = copy.deepcopy(json_types.pop())
630
- json_types = []
631
- oneof = json_type['oneOf']
632
- if len(json_types) == 0:
633
- for oneof_option in oneof:
634
- if isinstance(oneof_option, dict) and 'type' in oneof_option and 'type' in type_to_process and not type_to_process.get('type') == oneof_option.get('type'):
635
- # we can't merge these due to conflicting types, so we pass the option-type on as-is
636
- json_types.append(oneof_option)
637
- else:
638
- json_types.append(self.merge_json_schemas(
639
- [type_to_process, oneof_option], intersect=True))
640
- else:
641
- new_json_types = []
642
- for oneof_option in oneof:
643
- for json_type_option in json_types:
644
- json_type_option = self.merge_json_schemas(
645
- [json_type_option, oneof_option], intersect=True)
646
- new_json_types.append(json_type_option)
647
- json_types = new_json_types
648
-
649
- if 'anyOf' in json_type:
650
- types_to_process = json_types.copy() if len(json_types) > 0 else [
651
- copy.deepcopy(base_type)]
652
- json_types = []
653
- for type_to_process in types_to_process:
654
- type_list = [copy.deepcopy(type_to_process)]
655
- # anyOf is a list of types where any number from 1 to all
656
- # may match the data. Trouble with anyOf is that it doesn't
657
- # really have a semantic interpretation in the context of Avro.
658
- for anyof_option in json_type['anyOf']:
659
- if isinstance(anyof_option, dict) and '$ref' in anyof_option:
660
- # if we have a ref, we can't merge into the base type, so we pass it on as-is.
661
- # into the JSON type list
662
- json_types.append(copy.deepcopy(anyof_option))
663
- else:
664
- type_list.append(copy.deepcopy(anyof_option))
665
- merged_type = self.merge_json_schemas(
666
- type_list, intersect=False)
667
- json_types.append(merged_type)
668
-
669
- if len(json_types) > 0:
670
- if len(json_types) == 1:
671
- avro_type = self.json_type_to_avro_type(
672
- json_types[0], record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
673
- if isinstance(avro_type, dict) and self.is_empty_type(avro_type) and not 'allOf' in json_type:
674
- avro_type['type'] = generic_type()
675
- avro_type = self.post_check_avro_type(
676
- dependencies, avro_type)
677
- return avro_type
678
- else:
679
- try:
680
- record_stack.append(
681
- field_name if field_name else record_name)
682
- subtypes = []
683
- count = 1
684
- type_deps: List[str] = []
685
- for json_type_option in json_types:
686
- if isinstance(json_type_option, dict) and '$ref' in json_type_option:
687
- ref = json_type_option['$ref']
688
- if ref in self.imported_types:
689
- avro_subtype = self.imported_types[ref]
690
- subtypes.append(avro_subtype)
691
- type_deps.append(avro_subtype)
692
- continue
693
-
694
- subtype_deps: List[str] = []
695
- sub_field_name = avro_name(local_name + '_' + str(count)) if not isinstance(
696
- json_type_option, dict) or not '$ref' in json_type_option else None
697
- avro_subtype = self.json_type_to_avro_type(
698
- json_type_option, record_name, sub_field_name, namespace, subtype_deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
699
- if not avro_subtype:
700
- continue
701
- if isinstance(avro_subtype, dict) and 'name' in avro_subtype and 'type' in avro_subtype and (avro_subtype['type'] == 'record' or avro_subtype['type'] == 'enum'):
702
- # we have a standalone record or enum so we need to add it to the schema at the top-level
703
- # and reference it as a dependency from the parent type if it's not already been added.
704
- existing_type = next((t for t in avro_schema if t.get('name') == avro_subtype['name'] and t.get(
705
- 'namespace') == avro_subtype.get('namespace')), None)
706
- if not existing_type:
707
- if subtype_deps:
708
- if not 'dependencies' in avro_subtype:
709
- avro_subtype['dependencies'] = subtype_deps
710
- else:
711
- avro_subtype['dependencies'].extend(
712
- subtype_deps)
713
- if self.is_empty_type(avro_subtype):
714
- print(
715
- f'WARN: Standalone type {avro_subtype["name"]} is empty')
716
- if avro_subtype['type'] != 'enum' and avro_subtype['type'] != 'record' and avro_subtype['type'] != 'fixed':
717
- raise ValueError(
718
- f'WARN: Standalone type {avro_subtype["name"]} is not a record or enum or fixed type')
719
- avro_schema.append(avro_subtype)
720
- full_name = self.get_qualified_name(
721
- avro_subtype)
722
- subtype_deps = [full_name]
723
- avro_subtype = full_name
724
- if isinstance(avro_subtype, dict) and 'dependencies' in avro_subtype:
725
- subtype_deps.extend(
726
- avro_subtype['dependencies'])
727
- del avro_subtype['dependencies']
728
- if len(subtype_deps) > 0:
729
- type_deps.extend(subtype_deps)
730
- if not self.is_empty_type(avro_subtype):
731
- if isinstance(avro_subtype, list):
732
- subtypes.extend(
733
- copy.deepcopy(avro_subtype))
734
- else:
735
- subtypes.append(
736
- copy.deepcopy(avro_subtype))
737
- count += 1
738
- if len(type_deps) > 0:
739
- dependencies.extend(type_deps)
740
- if len(subtypes) == 1:
741
- return self.post_check_avro_type(dependencies, subtypes[0])
742
- finally:
743
- record_stack.pop()
744
-
745
- if hasAnyOf:
746
- # if all subtypes are strings, they are either primitive types or type references
747
- # which means there's nothing to merge, so we'll return the list of types
748
- if all([isinstance(st, str) for st in subtypes]):
749
- return self.post_check_avro_type(dependencies, subtypes)
750
-
751
- # we now has a list of types that may match the data, but this would be
752
- # an Avro union which is mutually exclusive. We will merge this list
753
- # into a record type in postprocessing when all types are available
754
- if not isinstance(avro_type, dict):
755
- avro_type = {}
756
- avro_type['unmerged_types'] = subtypes
757
- avro_type['type'] = 'record'
758
- avro_type['name'] = avro_name(local_name)
759
- if local_name != avro_name(local_name):
760
- avro_type['altnames'] = { 'json': local_name }
761
- avro_type['namespace'] = namespace
762
- avro_type['fields'] = []
763
- if 'description' in json_type:
764
- avro_type['doc'] = json_type['description']
765
- json_type = {}
766
- else:
767
- return self.post_check_avro_type(dependencies, subtypes)
768
-
769
- if 'properties' in json_type and not 'type' in json_type:
770
- json_type['type'] = 'object'
771
-
772
- if 'description' in json_type and isinstance(avro_type, dict):
773
- avro_type['doc'] = json_type['description']
774
-
775
- if 'title' in json_type and isinstance(avro_type, dict):
776
- self.set_avro_type_value(
777
- avro_type, 'name', avro_name(json_type['title']))
778
-
779
- # first, pull in any referenced definitions and merge with this schema
780
- if '$ref' in json_type:
781
- # the $ref can indeed be a list as a result from a prior allOf/anyOf merge
782
- # if that is so, we will copy the type and process each $ref separately
783
- # and return the result as a list of types
784
- if isinstance(json_type['$ref'], list):
785
- types = []
786
- for ref in json_type['$ref']:
787
- json_type_copy = copy.deepcopy(json_type)
788
- json_type_copy['$ref'] = ref
789
- types.append(self.json_type_to_avro_type(json_type_copy, record_name, field_name, namespace,
790
- dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
791
- return self.post_check_avro_type(dependencies, types)
792
-
793
- ref = json_type['$ref']
794
- if ref in self.imported_types:
795
- # reference was already resolved, so we can resolve the reference simply by returning the type
796
- type_ref = copy.deepcopy(self.imported_types[ref])
797
- if isinstance(type_ref, str):
798
- dependencies.append(type_ref)
799
- return self.post_check_avro_type(dependencies, type_ref)
800
- else:
801
- new_base_uri = self.compose_uri(
802
- base_uri, json_type['$ref'])
803
- resolved_json_type, resolved_schema = self.resolve_reference(
804
- json_type, base_uri, json_schema)
805
- if self.is_empty_json_type(json_type):
806
- # it's a standalone reference, so will import the type into the schema
807
- # and reference it like it was in the same file
808
- type_name = record_name
809
- type_namespace = namespace
810
- parsed_ref = urlparse(ref)
811
- if parsed_ref.fragment:
812
- type_name = avro_name(
813
- parsed_ref.fragment.split('/')[-1])
814
- sub_namespace = self.compose_namespace(
815
- *parsed_ref.fragment.split('/')[2:-1])
816
- type_namespace = self.compose_namespace(
817
- self.root_namespace, sub_namespace)
818
-
819
- # registering in imported_types ahead of resolving to prevent circular references.
820
- # we only cache the type if it's forseeable that it is usable as a standalone type
821
- # which means that it must be either a record or an enum or a fixed type when converted
822
- # to Avro. That means we look for the presence of 'type', 'properties', 'allOf', 'anyOf',
823
- # and 'enum' in the resolved type.
824
- if resolved_json_type and (('type' in resolved_json_type and resolved_json_type['type'] == 'object') or 'properties' in resolved_json_type or 'enum' in resolved_json_type or
825
- 'allOf' in resolved_json_type or 'anyOf' in resolved_json_type):
826
- self.imported_types[ref] = self.compose_namespace(
827
- type_namespace, type_name)
828
- # resolve type
829
- deps: List[str] = []
830
- resolved_avro_type: dict | list | str | None = self.json_type_to_avro_type(
831
- resolved_json_type, type_name, '', type_namespace, deps, resolved_schema, new_base_uri, avro_schema, [], recursion_depth + 1)
832
- if isinstance(resolved_avro_type, str):
833
- dependencies.extend(deps)
834
- return self.post_check_avro_type(dependencies, resolved_avro_type)
835
- if isinstance(resolved_avro_type, list) or (not isinstance(resolved_avro_type, dict) or (not resolved_avro_type.get('type') == 'record' and not resolved_avro_type.get('type') == 'enum')):
836
- if isinstance(resolved_avro_type, dict) and not 'type' in resolved_avro_type:
837
- if isinstance(avro_type, dict):
838
- # the resolved type didn't have a type and avro_type is a dict,
839
- # so we assume it's a mixin into the type we found
840
- avro_type.update(resolved_avro_type)
841
- resolved_avro_type = None
842
- else:
843
- # no 'type' definition for this field and we can't mix into the avro type,
844
- # so we fallback to a generic type
845
- print(
846
- f"WARNING: no 'type' definition for {ref} in record {record_name}: {json.dumps(resolved_avro_type)}")
847
- resolved_avro_type = generic_type()
848
- elif isinstance(avro_type, str) and resolved_avro_type:
849
- # this is a plain type reference
850
- avro_type = resolved_avro_type
851
- self.imported_types[ref] = avro_type
852
- resolved_avro_type = None
853
- if resolved_avro_type:
854
- # this is not a record type that can stand on its own,
855
- # so we remove the cached type entry
856
- # and pass it on as an inline type
857
- dependencies.extend(deps)
858
- if ref in self.imported_types:
859
- del self.imported_types[ref]
860
- avro_type = self.merge_avro_schemas(
861
- [avro_type, resolved_avro_type], avro_schema, local_name)
862
- if isinstance(avro_type, dict) and 'name' in avro_type and not self.is_standalone_avro_type(avro_type):
863
- del avro_type['name']
864
- return self.post_check_avro_type(dependencies, avro_type)
865
- else:
866
- avro_type = resolved_avro_type
867
- self.imported_types[ref] = copy.deepcopy(
868
- avro_type)
869
-
870
- if len(deps) > 0:
871
- if isinstance(avro_type, dict):
872
- avro_type['dependencies'] = deps
873
- else:
874
- dependencies.extend(deps)
875
-
876
- if self.is_standalone_avro_type(avro_type):
877
- self.register_type(avro_schema, avro_type)
878
- full_name = self.get_qualified_name(avro_type)
879
- if ref in self.imported_types:
880
- # update the import reference to the resolved type if it's cached
881
- self.imported_types[ref] = full_name
882
- dependencies.append(full_name)
883
- avro_type = full_name
884
- else:
885
- del json_type['$ref']
886
- # it's a reference within a definition, so we will turn this into an inline type
887
- if isinstance(resolved_json_type, dict) and 'type' in resolved_json_type and json_type.get('type') and not json_type['type'] == resolved_json_type['type']:
888
- # the types conflict, so we can't merge them
889
- type1 = self.json_type_to_avro_type(
890
- json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
891
- type2 = self.json_type_to_avro_type(resolved_json_type, record_name, field_name, namespace,
892
- dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
893
- # if either of the types are empty, use just the other one
894
- if not self.is_empty_type(type1) and not self.is_empty_type(type2):
895
- return self.flatten_union([type1, type2])
896
- if not self.is_empty_type(type1):
897
- avro_type = type1
898
- if isinstance(avro_type, list):
899
- return self.post_check_avro_type(dependencies, avro_type)
900
- if not self.is_empty_type(type2):
901
- avro_type = type2
902
- if isinstance(avro_type, list):
903
- return self.post_check_avro_type(dependencies, avro_type)
904
- json_type = {}
905
- else:
906
- json_type = self.merge_json_schemas(
907
- [json_type, resolved_json_type])
908
- avro_type = self.json_type_to_avro_type(
909
- json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
910
- json_type = {}
911
- if ref in self.imported_types:
912
- # update the import reference to the resolved type if it's cached
913
- if isinstance(avro_type, dict) and 'name' in avro_type:
914
- self.imported_types[ref] = avro_type['name']
915
- else:
916
- self.imported_types[ref] = avro_type
917
-
918
- # if 'const' is present, make this an enum
919
- if 'const' in json_type:
920
- const_list = json_type['const'] if isinstance(
921
- json_type['const'], list) else [json_type['const']]
922
- avro_type = self.merge_avro_schemas([avro_type, self.create_enum_type(
923
- local_name, namespace, const_list)], avro_schema, local_name)
924
- if json_object_type or 'enum' in json_type:
925
- if json_object_type == 'array':
926
- if isinstance(json_type, dict) and 'items' in json_type:
927
- deps = []
928
- item_type = self.json_type_to_avro_type(
929
- json_type['items'], record_name, field_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
930
- if self.is_standalone_avro_type(item_type):
931
- if isinstance(item_type, dict) and len(deps) > 0:
932
- item_type['dependencies'] = deps
933
- self.register_type(avro_schema, item_type)
934
- dependencies.append(
935
- self.get_qualified_name(item_type))
936
- else:
937
- dependencies.extend(deps)
938
- if isinstance(item_type, dict) and not 'type' in item_type:
939
- item_type = generic_type()
940
- elif isinstance(item_type, str) and not item_type in primitive_types:
941
- dependencies.append(item_type)
942
- else: # not a standalone type, but has a type definition, so we unwind that here
943
- item_type = self.post_check_avro_type(
944
- dependencies, item_type)
945
- avro_type = self.merge_avro_schemas(
946
- [avro_type, self.create_array_type(item_type)], avro_schema, '')
947
- else:
948
- avro_type = self.merge_avro_schemas(
949
- [avro_type, self.create_array_type(generic_type())], avro_schema, '')
950
- elif json_object_type and (json_object_type == 'object' or 'object' in json_object_type):
951
- avro_record_type = self.json_schema_object_to_avro_record(
952
- local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack)
953
- if isinstance(avro_record_type, list):
954
- for record_entry in avro_record_type:
955
- self.lift_dependencies_from_type(
956
- record_entry, dependencies)
957
- avro_type = self.merge_avro_schemas([avro_type, avro_record_type], avro_schema, avro_type.get(
958
- 'name', local_name) if isinstance(avro_type, dict) else local_name)
959
- self.lift_dependencies_from_type(
960
- avro_type, dependencies)
961
- elif 'enum' in json_type and (not 'type' in json_type or json_type['type'] == "string"):
962
- # we skip all enums that are not of implicit or explicit type 'string'
963
- enum = [avro_name(e) for e in json_type['enum'] if isinstance(
964
- e, str) and e != '']
965
- if len(enum) > 0:
966
- # if the enum ends up empty (only non-strings in the enum), we will skip it
967
- enum = list(set(enum))
968
- if len(enum) > 0:
969
- avro_type = self.create_enum_type(local_name, self.compose_namespace(
970
- namespace, record_name + '_types'), enum)
971
- else:
972
- avro_type = self.json_schema_primitive_to_avro_type(json_object_type, json_type.get(
973
- 'format'), json_type.get('enum'), record_name, field_name, namespace, dependencies)
974
- else:
975
- if isinstance(json_type, dict):
976
- avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(json_type, json_type.get('format'), json_type.get(
977
- 'enum'), record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)
978
- else:
979
- avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(
980
- json_type, None, None, record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)
981
-
982
- if isinstance(avro_type, dict) and 'name' in avro_type and 'type' in avro_type and not (avro_type['type'] in ['array', 'map']):
983
- if not 'namespace' in avro_type:
984
- avro_type['namespace'] = namespace
985
- existing_type = next((t for t in avro_schema if t.get(
986
- 'name') == avro_type['name'] and t.get('namespace') == avro_type.get('namespace')), None)
987
- if existing_type:
988
- existing_type_name = self.get_qualified_name(existing_type)
989
- if not existing_type_name in dependencies:
990
- dependencies.append(existing_type_name)
991
- return existing_type_name
992
- self.set_avro_type_value(avro_type, 'name', local_name)
993
-
994
- # post-check on the avro type: if the type is a dict, and the 'type' is not
995
- # a record, enum, fixed, array, or map, we will just return the basic type
996
- # and push its dependencies up the stack
997
- avro_type = self.post_check_avro_type(dependencies, avro_type)
998
-
999
- if isinstance(avro_type, dict) and 'unmerged_types' in avro_type:
1000
- self.types_with_unmerged_types.append(avro_type)
1001
-
1002
- return avro_type
1003
- except RecursionError as e:
1004
- print(
1005
- f"Recursion error while processing {namespace}:{record_name}:{field_name} with recursion depth {recursion_depth}")
1006
- raise e
1007
-
1008
- def post_check_avro_type(self, dependencies, avro_type):
1009
- """Post-check the Avro type and push dependencies up the stack."""
1010
- if isinstance(avro_type, dict) and 'type' in avro_type and (isinstance(avro_type, list) or not avro_type['type'] in ['array', 'map', 'record', 'enum', 'fixed']):
1011
- if 'dependencies' in avro_type:
1012
- dependencies.extend(avro_type['dependencies'])
1013
- avro_type = avro_type['type']
1014
- return avro_type
1015
-
1016
- def register_type(self, avro_schema, avro_type) -> bool:
1017
- """Register a type in the Avro schema."""
1018
- existing_type = next((t for t in avro_schema if t.get(
1019
- 'name') == avro_type['name'] and t.get('namespace') == avro_type.get('namespace')), None)
1020
- if not existing_type:
1021
- if self.is_empty_type(avro_type) and not 'unmerged_types' in avro_type:
1022
- print(f'WARN: Standalone type {avro_type["name"]} is empty')
1023
- if self.is_standalone_avro_type(avro_type):
1024
- avro_schema.append(avro_type)
1025
- return True
1026
- else:
1027
- return False
1028
- else:
1029
- return True
1030
-
1031
- def has_composition_keywords(self, json_object: dict) -> bool:
1032
- """Check if the JSON object has any of the combining keywords: allOf, oneOf, anyOf."""
1033
- return isinstance(json_object, dict) and ('allOf' in json_object or 'oneOf' in json_object or 'anyOf' in json_object)
1034
-
1035
- def has_enum_keyword(self, json_object: dict) -> bool:
1036
- """Check if the JSON object is an enum."""
1037
- return isinstance(json_object, dict) and 'enum' in json_object
1038
-
1039
- def is_array_object(self, json_object: dict) -> bool:
1040
- """Check if the JSON object is an array object."""
1041
- return isinstance(json_object, dict) and 'type' in json_object and json_object['type'] == 'array'
1042
-
1043
- def is_standalone_avro_type(self, avro_type: dict | list | str) -> bool:
1044
- """Check if the Avro type is a standalone type."""
1045
- return isinstance(avro_type, dict) and 'type' in avro_type and (avro_type['type'] in ['record', 'enum', 'fixed'])
1046
-
1047
- def is_avro_complex_type(self, avro_type: dict) -> bool:
1048
- """Check if the Avro type is a complex type."""
1049
- return 'type' in avro_type and avro_type['type'] in ['record', 'enum', 'fixed', 'array', 'map']
1050
-
1051
- def set_avro_type_value(self, avro_type: dict | list | str, name: str, value: dict | list | str):
1052
- """Set a value in an Avro type."""
1053
- if isinstance(avro_type, dict):
1054
- if name == 'namespace' or name == 'name':
1055
- if 'type' in avro_type:
1056
- if not (avro_type['type'] in ['record', 'enum', 'fixed']):
1057
- return
1058
- avro_type[name] = value
1059
-
1060
- def create_avro_record(self, name: str, namespace: str, fields: list) -> dict:
1061
- """Create an Avro record type."""
1062
- return {
1063
- 'type': 'record',
1064
- 'name': avro_name(name),
1065
- 'namespace': namespace,
1066
- 'fields': fields
1067
- }
1068
-
1069
- def create_wrapper_record(self, wrapper_name: str, wrapper_namespace: str, wrapper_field: str, dependencies: list, avro_type: list | str | dict) -> dict:
1070
- """Create a union wrapper type in Avro."""
1071
- rec = self.create_avro_record(wrapper_name, wrapper_namespace, [
1072
- {
1073
- 'name': wrapper_field,
1074
- 'type': avro_type
1075
- }
1076
- ])
1077
- if len(dependencies) > 0:
1078
- rec['dependencies'] = dependencies
1079
- return rec
1080
-
1081
- def create_enum_type(self, name: str, namespace: str, symbols: list) -> dict:
1082
- """Create an Avro enum type."""
1083
- # the symbol list may have been merged by composition to we flatten it to have a unique list
1084
- symbols = self.flatten_union(symbols)
1085
- return {
1086
- 'type': 'enum',
1087
- 'name': name,
1088
- 'namespace': namespace,
1089
- 'symbols': [avro_name(s) for s in symbols]
1090
- }
1091
-
1092
- def create_array_type(self, items: list | dict | str) -> dict:
1093
- """Create an Avro array type."""
1094
- return {
1095
- 'type': 'array',
1096
- 'items': items
1097
- }
1098
-
1099
- def create_map_type(self, values: list | dict | str) -> dict:
1100
- """Create an Avro map type."""
1101
- return {
1102
- 'type': 'map',
1103
- 'values': values
1104
- }
1105
-
1106
- def nullable(self, avro_type: list | dict | str) -> list | dict | str:
1107
- """Wrap a type in a union with null."""
1108
- if isinstance(avro_type, list):
1109
- cp = avro_type.copy()
1110
- cp.insert(0, 'null')
1111
- return cp
1112
- return ['null', avro_type]
1113
-
1114
- def merge_description_into_doc(self, source_json: dict, target_avro: dict | list | str):
1115
- """Merge a description in JSON into Avro doc."""
1116
- if isinstance(source_json, dict) and 'description' in source_json and isinstance(target_avro, dict):
1117
- target_avro['doc'] = target_avro['doc'] + ", " + \
1118
- source_json['description'] if 'doc' in target_avro else source_json['description']
1119
-
1120
    def merge_dependencies_into_parent(self, dependencies: list, child_type: dict | list | str, parent_type: dict | list | str):
        """Merge dependencies from a child type into a parent type.

        First lifts any 'dependencies' out of child_type into the given list,
        then attaches the accumulated list to parent_type.

        NOTE(review): the two branches below are asymmetric. When the parent
        already has 'dependencies', the parent's entries are copied into the
        caller's list but the new entries are NOT written back to the parent;
        when it does not, the parent aliases the caller's list (later mutations
        of `dependencies` also appear on the parent). Presumably intentional
        given the call sites, but verify — a symmetric merge would extend the
        parent's list instead.
        """
        self.lift_dependencies_from_type(child_type, dependencies)
        if len(dependencies) > 0 and isinstance(parent_type, dict):
            if 'dependencies' in parent_type:
                dependencies.extend(parent_type['dependencies'])
            else:
                parent_type['dependencies'] = dependencies
1128
-
1129
- def lift_dependencies_from_type(self, child_type: dict | list | str, dependencies: list):
1130
- """Lift all dependencies from a type and return a new type with the dependencies lifted."""
1131
- if isinstance(child_type, dict):
1132
- if 'dependencies' in child_type:
1133
- dependencies.extend(child_type['dependencies'])
1134
- del child_type['dependencies']
1135
-
1136
- def compose_namespace(self, *names) -> str:
1137
- """Compose a namespace from a list of names."""
1138
- return '.'.join([avro_namespace(n) for n in names if n])
1139
-
1140
- def get_qualified_name(self, avro_type):
1141
- """Get the qualified name of an Avro type."""
1142
- return self.compose_namespace(avro_type.get('namespace', ''), avro_type.get('name', ''))
1143
-
1144
    def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list) -> dict | list | str | None:
        """Convert a JSON schema object declaration to an Avro record.

        Handles, in order: composition keywords (allOf/oneOf/anyOf), enums,
        standalone arrays (wrapped in a record), and plain object schemas with
        'properties', 'patternProperties', and 'additionalProperties'.
        Circular references are broken via a *_ref wrapper record.

        Args:
            name: candidate name for the record (falls back to the schema 'title').
            json_object: the JSON schema object to convert.
            namespace: Avro namespace for the record; extended with
                '<parent>_types' when nested inside another record.
            json_schema: the full JSON schema document (for $ref resolution).
            base_uri: base URI for resolving relative references.
            avro_schema: top-level list of Avro types; standalone types are
                registered here as a side effect.
            record_stack: in-progress record names, used for cycle detection.

        Returns:
            An Avro record dict, a [record, alternate-map] pair when dynamic
            properties exist alongside declared fields, a plain map/array type,
            a type-name string, or None for skipped references/primitives.

        Raises:
            ValueError: when no record name can be determined, or a field's
                Avro type resolves to None.
        """
        dependencies: List[str] = []
        avro_type: list | dict | str = {}

        # handle top-level allOf, anyOf, oneOf
        if self.has_composition_keywords(json_object):
            # we will merge allOf, oneOf, anyOf into a union record type
            type = self.json_type_to_avro_type(
                json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
            if isinstance(type, str):
                # we are skipping references and primitives
                return None
            if isinstance(type, list):
                # we should have a union type
                avro_type = self.create_wrapper_record(
                    name+"_union", self.utility_namespace, 'options', [], type)
            elif isinstance(type, dict) and 'type' in type and type['type'] != 'record':
                # merge the type into a record type if it's not a record type
                print(
                    f'INFO: Standalone type {name} is being wrapped in a record')
                avro_type = self.create_wrapper_record(avro_name(type.get(
                    'name', name)+'_wrapper'), self.utility_namespace, 'value', type.get('dependencies', []), type)
            else:
                avro_type = type
            # add external dependencies to the record
            self.merge_dependencies_into_parent(dependencies, type, avro_type)
            self.merge_description_into_doc(json_object, avro_type)
            # return the union type
            return avro_type

        if self.has_enum_keyword(json_object):
            # this is an enum
            avro_enum = self.create_enum_type(
                avro_name(name), namespace, json_object['enum'])
            self.merge_description_into_doc(json_object, avro_enum)
            return avro_enum

        if self.is_array_object(json_object):
            # this is an array, which can't be standalone in Avro, so we will wraps it into a record
            # and include the type as an inline
            print(
                f'WARN: Standalone array type {name} will be wrapped in a record')
            deps: List[str] = []
            array_type = self.json_type_to_avro_type(json_object, name, avro_name(
                name), namespace, deps, json_schema, base_uri, avro_schema, record_stack)
            avro_array = self.create_wrapper_record(
                avro_name(name+'_wrapper'), self.utility_namespace, 'items', [], array_type)
            self.merge_description_into_doc(json_object, avro_array)
            self.merge_dependencies_into_parent(deps, array_type, avro_array)
            return avro_array

        # at this point, we have to assume that we have a JSON schema object
        title = json_object.get('title')
        record_name = avro_name(name if name else title if title else None)
        if record_name is None:
            raise ValueError(
                f"Cannot determine record name for json_object {json_object}")
        if len(record_stack) > 0:
            # if we have a record stack, we need to add the current name to
            # the namespace since nested types are disambiguated by their namespace
            namespace = self.compose_namespace(
                namespace, record_stack[-1] + "_types")
        # at this point we have a record type
        avro_record = self.create_avro_record(record_name, namespace, [])
        # we need to prevent circular dependencies, so we will maintain a stack of the in-progress
        # records and will resolve the cycle as we go. if this record is already in the stack, we will
        # just return a reference to a record that contains this record
        if record_name in record_stack:
            # to break the cycle, we will use a containment type that references
            # the record that is being defined
            print(
                f'WARN: Circular dependency found for record {record_name}. Creating {record_name}_ref.')
            ref_name = avro_name(record_name + '_ref')
            return self.create_wrapper_record(ref_name, namespace, record_name, [], self.compose_namespace(namespace, record_name))
        try:
            # enter the record stack scope for this record
            record_stack.append(record_name)
            # collect the required fields so we can make those fields non-null
            required_fields = json_object.get('required', [])

            # field_refs mirrors the record's fields but with standalone types
            # replaced by their qualified-name references; used for the
            # alternate map built for pattern/additional properties below
            field_refs = []
            if 'properties' in json_object and isinstance(json_object['properties'], dict):
                # add the properties as fields
                for field_name, json_field_types in json_object['properties'].items():
                    if isinstance(json_field_types, bool):
                        # for "propertyname": true, we skip. schema bug.
                        continue
                    if not isinstance(json_field_types, list):
                        json_field_types = [json_field_types]
                    field_type_list = []
                    field_ref_type_list = []
                    const = None
                    default = None
                    description = None
                    for json_field_type in json_field_types:
                        # skip fields with an bad or empty type
                        if not isinstance(json_field_type, dict):
                            continue
                        field_name = avro_name(field_name)
                        # last const wins if there are multiple
                        const = json_field_type.get('const', const)
                        # last default wins if there are multiple
                        default_value = json_field_type.get('default')
                        if default_value and not isinstance(default_value, dict) and not isinstance(default_value, list):
                            default = default_value
                        # get the description from the field type
                        description = json_field_type.get('description', description)
                        # convert the JSON-type field to an Avro-type field
                        avro_field_ref_type = avro_field_type = self.ensure_type(self.json_type_to_avro_type(
                            json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack))
                        if isinstance(avro_field_type, list):
                            avro_field_type = self.flatten_union(
                                avro_field_type)
                            avro_field_ref_type = avro_field_type
                        elif isinstance(avro_field_type, dict):
                            self.lift_dependencies_from_type(
                                avro_field_type, dependencies)
                            # if the first call gave us a global type that got added to the schema, this call will give us a reference
                            if self.is_standalone_avro_type(avro_field_type):
                                avro_field_ref_type = self.get_qualified_name(
                                    avro_field_type)
                        if avro_field_type is None:
                            # None type is a problem
                            raise ValueError(
                                f"avro_field_type is None for field {field_name}")
                        if isinstance(avro_field_type, dict) and 'type' in avro_field_type and not self.is_avro_complex_type(avro_field_type):
                            # if the field type is a basic type, inline it
                            avro_field_type = avro_field_type['type']
                        field_type_list.append(avro_field_type)
                        field_ref_type_list.append(avro_field_ref_type)

                    # a single candidate type is used directly; multiple become a union.
                    # NOTE(review): with multiple candidates, effective_field_type
                    # ALIASES field_type_list, and the later
                    # field_type_list.append(avro_field_type) below also appends
                    # into the union already attached to avro_field — looks like
                    # a leftover/duplicate append; verify before changing.
                    effective_field_type = field_type_list[0] if len(
                        field_type_list) == 1 else field_type_list
                    effective_field_ref_type = field_ref_type_list[0] if len(
                        field_ref_type_list) == 1 else field_ref_type_list
                    # optional (non-required) fields are made nullable
                    avro_field = {
                        'name': avro_name(field_name),
                        'type': self.nullable(effective_field_type) if not field_name in required_fields and 'null' not in effective_field_type else effective_field_type
                    }
                    if field_name != avro_name(field_name):
                        # keep the original JSON name when sanitizing changed it
                        avro_field['altnames'] = { "json": field_name }
                    if const:
                        avro_field['const'] = const
                    if default:
                        avro_field['default'] = default
                    if description:
                        avro_field['doc'] = description
                    field_type_list.append(avro_field_type)
                    avro_field_ref = {
                        'name': avro_name(field_name),
                        'type': self.nullable(effective_field_ref_type) if not field_name in required_fields and 'null' not in effective_field_ref_type else effective_field_ref_type
                    }
                    if description:
                        avro_field_ref['doc'] = description
                    field_ref_type_list.append(avro_field_ref)
                    # add the field to the record
                    avro_record['fields'].append(avro_field)
                    field_refs.append(avro_field_ref)
            elif not 'additionalProperties' in json_object and not 'patternProperties' in json_object:
                if 'type' in json_object and (json_object['type'] == 'object' or 'object' in json_object['type']) and \
                        not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                    # we don't have any fields, but we have an object type, so we create a map
                    avro_record = self.create_map_type(generic_type())
                elif 'type' in json_object and (json_object['type'] == 'array' or 'array' in json_object['type']) and \
                        not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                    # we don't have any fields, but we have an array type, so we create a record with an 'items' field
                    avro_record = self.create_array_type(
                        self.json_type_to_avro_type(
                            json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
                        if 'items' in json_object
                        else generic_type())
                else:
                    return json_object['type'] if 'type' in json_object else generic_type()

            extension_types = []
            prop_docs = ''
            if 'patternProperties' in json_object and isinstance(json_object['patternProperties'], dict) and len(json_object['patternProperties']) > 0:
                # pattern properties are represented as a record with field names that are the patterns
                pattern_props = json_object['patternProperties']
                for pattern_name, props in pattern_props.items():
                    deps = []
                    prop_type = self.ensure_type(self.json_type_to_avro_type(
                        props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack))
                    if self.is_standalone_avro_type(prop_type):
                        # register the named type globally and depend on it by reference
                        self.lift_dependencies_from_type(prop_type, deps)
                        self.set_avro_type_value(
                            prop_type, 'namespace', namespace)
                        self.register_type(avro_schema, prop_type)
                        prop_type_ref = self.get_qualified_name(prop_type)
                        dependencies.append(prop_type_ref)
                    else:
                        dependencies.extend(deps)
                        if isinstance(prop_type, str) and not prop_type in primitive_types:
                            dependencies.append(prop_type)
                    if self.is_empty_type(prop_type):
                        prop_type = generic_type()
                    prop_docs += f"Name pattern '{pattern_name}': [{self.get_field_type_name({'type':prop_type})}]. "
                    extension_types.append(prop_type)

            if 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], bool):
                if True == json_object['additionalProperties']:
                    # open content model: anything extra is allowed
                    prop_type = generic_type()
                    extension_types.append(prop_type)
            elif 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], dict) and len(json_object['additionalProperties']) > 0:
                # additional properties are represented as a map of string to the type of the value
                additional_props = json_object['additionalProperties']
                deps = []
                values_type = self.json_type_to_avro_type(
                    additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
                if self.is_standalone_avro_type(values_type):
                    self.lift_dependencies_from_type(values_type, deps)
                    self.set_avro_type_value(
                        values_type, 'namespace', namespace)
                    self.register_type(avro_schema, values_type)
                    values_type_ref = self.get_qualified_name(values_type)
                    dependencies.append(values_type_ref)
                else:
                    dependencies.extend(deps)
                    if isinstance(values_type, str) and not values_type in primitive_types:
                        dependencies.append(values_type)
                if self.is_empty_type(values_type):
                    values_type = generic_type()
                prop_docs += f"Extra properties: [{self.get_field_type_name({'type':values_type})}]. "
                extension_types.append(values_type)
            self.merge_description_into_doc(json_object, avro_record)

            avro_alternate_record = None
            if extension_types:
                # Since Avro Schema does not allow fields with dynamic names
                # to appear alongside regular fields, we will union the types of all properties with the
                # type of the additionalProperties and document this in the record's description
                json_field_types = [field['type'] for field in field_refs]
                field_type_names = [
                    [field['name'], self.get_field_type_name(field)] for field in field_refs]
                field_type_name_list: str = ', '.join(
                    [f"'{field[0]}': [{field[1]}]" for field in field_type_names])
                json_field_types.extend(extension_types)
                json_field_types = self.flatten_union(json_field_types)
                if len(json_field_types) == 1:
                    json_field_types = json_field_types[0]
                doc = f"Alternate map: {field_type_name_list}. " if field_type_names else ''
                doc += prop_docs
                avro_alternate_record = self.create_map_type(json_field_types)
                if not self.is_empty_type(avro_record):
                    # link the map back to the record it stands in for
                    avro_alternate_record['alternateof'] = self.get_qualified_name(avro_record)
                    dependencies.append(
                        self.compose_namespace(namespace, record_name))
                avro_record['doc'] = doc if not 'doc' in avro_record else avro_record['doc'] + ', ' + doc

            if len(dependencies) > 0:
                # dedupe the list
                dependencies = list(set(dependencies))
                avro_record['dependencies'] = dependencies
        finally:
            # always leave the record stack scope, even on early return or error
            record_stack.pop()
        if avro_alternate_record:
            if self.is_empty_type(avro_record):
                # there's no substantive content in the record,
                # so we will just return the alternate record, which
                # is a plain map
                return avro_alternate_record
            return [avro_record, avro_alternate_record]
        return avro_record
1408
-
1409
    def postprocess_schema(self, avro_schema: list) -> None:
        """ Post-process the Avro Schema for cases where we need a second pass.

        During the first conversion pass some types are registered with an
        'unmerged_types' marker. This pass locates each such type in the
        schema graph, merges the deferred types into it, and writes the
        merged node back in place.
        """
        if len(self.types_with_unmerged_types) > 0:
            # work on a snapshot; merging below may append new entries
            types_with_unmerged_types = copy.deepcopy(
                self.types_with_unmerged_types)
            self.types_with_unmerged_types = []
            for ref_type in types_with_unmerged_types:
                # find ref_type anywhere in the avro_schema graph, matching
                # on name and namespace.
                def find_fn(
                    t): return 'name' in t and t['name'] == ref_type['name'] and 'namespace' in t and t['namespace'] == ref_type['namespace']
                type = find_schema_node(find_fn, avro_schema)
                if not type:
                    raise ValueError(
                        f"Couldn't find type {ref_type['namespace']}.{ref_type['name']} in the Avro Schema.")
                # resolve the unmerged types
                local_name = type.get('name')
                if not isinstance(type, dict):
                    continue
                unmerged_types = type.get('unmerged_types', [])
                if len(unmerged_types) == 0:
                    # nothing to merge; just drop the marker if present
                    if 'unmerged_types' in type:
                        del type['unmerged_types']
                    continue
                # the found node itself (minus the marker) is the merge base
                base_type = copy.deepcopy(type)
                if 'unmerged_types' in base_type:
                    del base_type['unmerged_types']
                mergeable_types = [base_type]
                deps: List[str] = []
                self.lift_dependencies_from_type(type, deps)
                for item in unmerged_types:
                    # items are either qualified-name references or inline dicts
                    # NOTE(review): if an item is neither str nor dict,
                    # found_avro_type keeps its value from the previous
                    # iteration (or is unbound on the first one) — confirm
                    # unmerged_types can only contain str/dict entries.
                    if isinstance(item, str):
                        found_avro_type = next(
                            (t for t in avro_schema if self.get_qualified_name(t) == item), None)
                        if not found_avro_type:
                            continue
                    elif isinstance(item, dict):
                        found_avro_type = item
                    self.lift_dependencies_from_type(found_avro_type, deps)
                    if isinstance(found_avro_type, dict):
                        candidate = found_avro_type
                        if 'unmerged_types' in candidate:
                            del candidate['unmerged_types']
                        mergeable_types.append(candidate)
                merge_result = self.merge_avro_schemas(
                    mergeable_types, avro_schema, local_name, deps)
                if isinstance(merge_result, dict):
                    merge_result['dependencies'] = deps
                    if 'unmerged_types' in merge_result:
                        del merge_result['unmerged_types']
                if isinstance(merge_result, list):
                    # the merge yielded a union; wrap it in a record since
                    # unmerged field containers have fields.
                    self.set_avro_type_value(
                        type, 'name', type['name'] + '_item')
                    self.set_avro_type_value(
                        type, 'fields', [{'name': 'value', 'type': merge_result}])
                    merge_result = copy.deepcopy(type)
                # write the merged node back into the schema graph in place
                set_schema_node(find_fn, merge_result, avro_schema)
1467
-
1468
- def process_definition_list(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema_list):
1469
- """Process a schema definition list."""
1470
- for sub_schema_name, schema in json_schema_list.items():
1471
- if not isinstance(schema, dict) and not isinstance(schema, list):
1472
- # skip items that are not schema definitions or lists
1473
- continue
1474
- if 'type' in schema or 'allOf' in schema or 'oneOf' in schema or 'anyOf' in schema or 'properties' in schema or 'enum' in schema or '$ref' in schema or 'additionalProperties' in schema or 'patternProperties' in schema:
1475
- # this is a schema definition
1476
- self.process_definition(
1477
- json_schema, namespace, base_uri, avro_schema, record_stack, sub_schema_name, schema)
1478
- continue
1479
- # it's a schema definition list
1480
- self.process_definition_list(
1481
- json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema)
1482
-
1483
    def process_definition(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema, is_root: bool = False) -> Tuple[str, str] | None:
        """ Process a schema definition.

        Converts one JSON-schema definition into Avro and registers the
        resulting type(s) in avro_schema.

        Returns:
            (namespace, name) of the registered type, or None when the
            definition could not be resolved or produced only mix-ins.
        """
        avro_schema_item = None
        avro_schema_item_list = self.json_schema_object_to_avro_record(
            schema_name, schema, namespace, json_schema, base_uri, avro_schema, record_stack)
        if not isinstance(avro_schema_item_list, list) and not isinstance(avro_schema_item_list, dict):
            # skip if the record couldn't be resolved
            return None
        # the call above usually returns a single record, but we pretend it's normally a list to handle allOf/anyOf/oneOf cases
        if isinstance(avro_schema_item_list, list) and is_root and len(avro_schema_item_list) > 1:
            # if we have multiple root-level records, we will wrap them all in a single record
            root_avro_schema_item = self.create_wrapper_record(
                schema_name+'_wrapper', namespace, 'root', [], avro_schema_item_list)
            for avro_schema_item in avro_schema_item_list:
                self.merge_dependencies_into_parent(
                    [], avro_schema_item, root_avro_schema_item)
            self.register_type(avro_schema, root_avro_schema_item)
            return root_avro_schema_item['namespace'], root_avro_schema_item['name']
        elif not isinstance(avro_schema_item_list, list):
            # is not a list, so we'll wrap it in a list
            avro_schema_item_list = [avro_schema_item_list]
        for avro_schema_item in avro_schema_item_list:
            # add the item to the schema if it's not already there
            if isinstance(avro_schema_item, str):
                # plain type references need no registration
                continue
            if isinstance(avro_schema_item, dict) and not 'name' in avro_schema_item:
                avro_schema_item['name'] = avro_name(schema_name)
            # dedupe on (name, namespace)
            existing_type = next((t for t in avro_schema if t.get('name') == avro_schema_item['name'] and t.get(
                'namespace') == avro_schema_item.get('namespace')), None)
            if not existing_type:
                if (not self.is_empty_type(avro_schema_item) or 'unmerged_types' in avro_schema_item) and \
                        self.is_standalone_avro_type(avro_schema_item):
                    # we only register record/enum as type. the other defs are mix-ins
                    self.register_type(avro_schema, avro_schema_item)
                    return avro_schema_item['namespace'], avro_schema_item['name']
                elif is_root:
                    # at the root, we will wrap the type in a record to make it top-level
                    deps: List[str] = []
                    self.lift_dependencies_from_type(avro_schema_item, deps)
                    avro_schema_wrapper = self.create_wrapper_record(schema_name, avro_schema_item.get(
                        'namespace', namespace), avro_schema_item['name'], deps, avro_schema_item)
                    if len(deps) > 0:
                        avro_schema_wrapper['dependencies'] = deps
                    avro_schema_item = avro_schema_wrapper
                    self.register_type(avro_schema, avro_schema_item)
                    return avro_schema_item['namespace'], avro_schema_item['name']
        return None
1530
-
1531
- def id_to_avro_namespace(self, id: str) -> str:
1532
- """Convert a XSD namespace to Avro Namespace."""
1533
- parsed_url = urlparse(id)
1534
- # strip the file extension
1535
- path = parsed_url.path.rsplit('.')[0]
1536
- path_segments = path.strip('/').replace('-', '_').split('/')
1537
- reversed_path_segments = reversed(path_segments)
1538
- namespace_suffix = self.compose_namespace(*reversed_path_segments)
1539
- if parsed_url.hostname:
1540
- namespace_prefix = self.compose_namespace(
1541
- *reversed(parsed_url.hostname.split('.')))
1542
- namespace = self.compose_namespace(namespace_prefix, namespace_suffix)
1543
- return namespace
1544
-
1545
    def jsons_to_avro(self, json_schema: dict | list, namespace: str, base_uri: str) -> list | dict | str:
        """Convert a JSON-schema to an Avro-schema.

        Processes 'definitions'/'$defs' blocks first, then the root schema,
        runs the post-processing pass, and finally orders/inlines the result.

        Returns:
            The full list of Avro types, or a single dict when the document
            reduces to one root type or a URL fragment selects one type.
        """
        avro_schema: List[dict] = []
        record_stack: List[str] = []

        parsed_url = urlparse(base_uri)
        schema_name = self.root_class_name

        if isinstance(json_schema, dict) and ('definitions' in json_schema or '$defs' in json_schema):
            # this is a swagger file or has a 'definitions' block
            json_schema_defs = json_schema.get(
                'definitions', json_schema.get('$defs', []))
            for def_schema_name, schema in json_schema_defs.items():
                if 'type' in schema or 'allOf' in schema or 'oneOf' in schema or 'anyOf' in schema or 'properties' in schema or 'enum' in schema or '$ref' in schema or 'additionalProperties' in schema or 'patternProperties' in schema:
                    # this is a schema definition
                    self.process_definition(
                        json_schema, namespace, base_uri, avro_schema, record_stack, def_schema_name, schema)
                else:
                    # it's a schema definition list
                    self.process_definition_list(
                        json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema.copy())
        elif isinstance(json_schema, list):
            # this is a schema definition list
            self.process_definition_list(
                json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema)

        root_namespace = None
        root_name = None
        # NOTE(review): 'and' binds tighter than 'or' here, so for a list
        # json_schema the 'allOf'/'oneOf'/... membership tests still run
        # against the list — confirm the intent was
        # isinstance(..., dict) and ('type' in ... or 'allOf' in ... or ...).
        if isinstance(json_schema, dict) and 'type' in json_schema or 'allOf' in json_schema or 'oneOf' in json_schema or 'anyOf' in json_schema or 'properties' in json_schema:
            # this is a schema definition
            if isinstance(json_schema, dict) and '$ref' in json_schema:
                # if there is a $ref at the root level, resolve the reference and merge it with the current schema
                ref = json_schema['$ref']
                if ref:
                    ref_schema, json_doc = self.resolve_reference(
                        json_schema, base_uri, json_schema)
                    json_schema = self.merge_json_schemas(
                        [json_schema, ref_schema], intersect=False)
            root_info = self.process_definition(
                json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema, is_root=True)
            if root_info:
                root_namespace, root_name = root_info

        # postprocessing pass
        self.postprocess_schema(avro_schema)

        if isinstance(avro_schema, list) and len(avro_schema) > 1 and self.split_top_level_records:
            # emit each top-level record as a self-contained schema
            new_avro_schema = []
            for item in avro_schema:
                if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                    # we need to make a copy since the inlining operation shuffles types
                    schema_copy = copy.deepcopy(avro_schema)
                    # find the item with the same name and namespace in the copy
                    found_item = next((t for t in schema_copy if t.get(
                        'name') == item['name'] and t.get('namespace') == item.get('namespace')), None)
                    if found_item:
                        # inline all dependencies of the item
                        inline_dependencies_of(schema_copy, found_item)
                        new_avro_schema.append(found_item)
            avro_schema = new_avro_schema
        else:
            # sort the records by their dependencies
            if root_name and root_namespace and not ('definitions' in json_schema or '$defs' in json_schema):
                # inline all dependencies if this is a doc with only a root level definition
                root = find_schema_node(
                    lambda t: 'name' in t and t['name'] == root_name and 'namespace' in t and t['namespace'] == root_namespace, avro_schema)
                inline_dependencies_of(avro_schema, root)
                return root
            else:
                avro_schema = sort_messages_by_dependencies(avro_schema)

        if parsed_url.fragment and isinstance(json_schema, dict):
            # if the fragment is present in the URL, it's a reference to a schema definition
            # so we will resolve that reference and return a type
            self.imported_types.clear()
            fragment_schema: List[dict] = []
            json_pointer = parsed_url.fragment
            schema_name = parsed_url.fragment.split('/')[-1]
            schema = jsonpointer.resolve_pointer(json_schema, json_pointer)
            avro_schema_item = self.json_schema_object_to_avro_record(
                schema_name, schema, namespace, json_schema, base_uri, fragment_schema, record_stack)
            if avro_schema_item:
                # we roll all the types into this record as the top level type
                inline_dependencies_of(avro_schema, avro_schema_item)
                return avro_schema_item

        return avro_schema
1632
-
1633
    def convert_jsons_to_avro(self, json_schema_file_path: str, avro_schema_path: str, namespace: str | None = None, utility_namespace: str | None = None) -> list | dict | str:
        """Convert JSON schema file to Avro schema file.

        Fetches the schema document (http(s) or local file), converts it, and
        writes either one .avsc file or one file per top-level record when
        split_top_level_records is set.

        Args:
            json_schema_file_path: Path or URL of the JSON schema document.
            avro_schema_path: Output file path (or directory when splitting).
            namespace: Avro namespace to use; derived from the file name or
                the schema's '$id' when omitted.
            utility_namespace: Namespace for utility types; defaults to
                '<root namespace>.utility'.

        Returns:
            The Avro schema that was written (list, dict, or str).
        """
        # turn the file path into a file URI if it's not a URI already
        parsed_url = urlparse(json_schema_file_path)
        if not parsed_url.hostname and not parsed_url.scheme == 'file':
            json_schema_file_path = 'file://' + json_schema_file_path
            parsed_url = urlparse(json_schema_file_path)
        content = self.fetch_content(parsed_url.geturl())
        json_schema = json.loads(content)

        if not namespace:
            # fall back to the file's base name, sanitized for Avro
            namespace = parsed_url.geturl().replace('\\', '/').replace('-',
                                                                       '_').split('/')[-1].split('.')[0]
            # get the $id if present
            if '$id' in json_schema:
                namespace = self.id_to_avro_namespace(json_schema['$id'])
        self.root_namespace = namespace
        if utility_namespace:
            self.utility_namespace = utility_namespace
        else:
            self.utility_namespace = self.root_namespace + '.utility'

        # drop the file name from the parsed URL to get the base URI
        avro_schema = self.jsons_to_avro(
            json_schema, namespace, parsed_url.geturl())
        # a single-type result is unwrapped from its list
        if len(avro_schema) == 1:
            avro_schema = avro_schema[0]

        # create the directory for the Avro schema file if it doesn't exist
        dir = os.path.dirname(
            avro_schema_path) if not self.split_top_level_records else avro_schema_path
        if dir != '' and not os.path.exists(dir):
            os.makedirs(dir, exist_ok=True)
        if self.split_top_level_records:
            # if we are splitting top level records, we will create a file for each record
            for item in avro_schema:
                if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                    schema_file_path = os.path.join(
                        dir, item['name'] + '.avsc')
                    with open(schema_file_path, 'w') as avro_file:
                        json.dump(item, avro_file, indent=4)
        else:
            with open(avro_schema_path, 'w') as avro_file:
                json.dump(avro_schema, avro_file, indent=4)
        return avro_schema
1678
-
1679
-
1680
- def convert_jsons_to_avro(json_schema_file_path: str, avro_schema_path: str, namespace: str = '', utility_namespace='', root_class_name='', split_top_level_records=False) -> list | dict | str:
1681
- """Convert JSON schema file to Avro schema file."""
1682
-
1683
- if not json_schema_file_path:
1684
- raise ValueError('JSON schema file path is required')
1685
- if not json_schema_file_path.startswith('http'):
1686
- if not os.path.exists(json_schema_file_path):
1687
- raise FileNotFoundError(f'JSON schema file {json_schema_file_path} not found')
1688
-
1689
- try:
1690
- converter = JsonToAvroConverter()
1691
- converter.split_top_level_records = split_top_level_records
1692
- if root_class_name:
1693
- converter.root_class_name = root_class_name
1694
- return converter.convert_jsons_to_avro(json_schema_file_path, avro_schema_path, namespace, utility_namespace)
1695
- except Exception as e:
1696
- print(
1697
- f'Error converting JSON {json_schema_file_path} to Avro: {e.args[0]}')
1698
- return []
1
+ """ JSON to Avro schema converter. """
2
+
3
+ # pylint: disable=too-many-lines, line-too-long, too-many-branches, too-many-statements, too-many-locals, too-many-nested-blocks, too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-boolean-expressions
4
+
5
+ import json
6
+ import os
7
+ import copy
8
+ import urllib
9
+ from urllib.parse import ParseResult, urlparse, unquote
10
+ from typing import Any, Dict, List, Tuple
11
+ import jsonpointer
12
+ from jsonpointer import JsonPointerException
13
+ import requests
14
+
15
+ from avrotize.common import avro_name, avro_namespace, find_schema_node, generic_type, set_schema_node
16
+ from avrotize.dependency_resolver import inline_dependencies_of, sort_messages_by_dependencies
17
+
18
+ primitive_types = ['null', 'string', 'int',
19
+ 'long', 'float', 'double', 'boolean', 'bytes']
20
+
21
+
22
+ class JsonToAvroConverter:
23
+ """
24
+ Converts JSON schema to Avro schema.
25
+
26
+ Attributes:
27
+ imported_types: A dictionary of imported type schemas.
28
+ root_namespace: The namespace for the root schema.
29
+ max_recursion_depth: The maximum recursion depth.
30
+ types_with_unmerged_types: A list of types with unmerged types.
31
+ content_cache: A dictionary for caching fetched URLs.
32
+ utility_namespace: The namespace for utility types.
33
+ maximize_compatiblity: A flag to maximize compatibility.
34
+
35
+ """
36
+
37
+ def __init__(self) -> None:
38
+ self.imported_types: Dict[Any, Any] = {}
39
+ self.root_namespace = 'example.com'
40
+ self.max_recursion_depth = 40
41
+ self.types_with_unmerged_types: List[dict] = []
42
+ self.content_cache: Dict[str, str] = {}
43
+ self.utility_namespace = 'utility.vasters.com'
44
+ self.split_top_level_records = False
45
+ self.root_class_name = 'document'
46
+
47
+ def is_empty_type(self, avro_type):
48
+ """
49
+ Check if the Avro type is an empty type.
50
+
51
+ Parameters:
52
+ avro_type (any): The Avro type to check.
53
+
54
+ Returns:
55
+ bool: True if the Avro type is empty, False otherwise.
56
+ """
57
+ if len(avro_type) == 0:
58
+ return True
59
+ if isinstance(avro_type, list):
60
+ return all(self.is_empty_type(t) for t in avro_type)
61
+ if isinstance(avro_type, dict):
62
+ if not 'type' in avro_type:
63
+ return True
64
+ if (avro_type['type'] == 'record' and (not 'fields' in avro_type or len(avro_type['fields']) == 0)) or \
65
+ (avro_type['type'] == 'enum' and (not 'symbols' in avro_type or len(avro_type['symbols']) == 0)) or \
66
+ (avro_type['type'] == 'array' and (not 'items' in avro_type or not avro_type['items'])) or \
67
+ (avro_type['type'] == 'map' and (not 'values' in avro_type or not avro_type['values'])):
68
+ return True
69
+ return False
70
+
71
+ def is_empty_json_type(self, json_type):
72
+ """
73
+ Check if the JSON type is an empty type.
74
+
75
+ Parameters:
76
+ json_type (any): The JSON type to check.
77
+
78
+ Returns:
79
+ bool: True if the JSON type is empty, False otherwise.
80
+ """
81
+ if len(json_type) == 0:
82
+ return True
83
+ if isinstance(json_type, list):
84
+ return all(self.is_empty_json_type(t) for t in json_type)
85
+ if isinstance(json_type, dict):
86
+ if not 'type' in json_type:
87
+ return True
88
+ return False
89
+
90
+ def flatten_union(self, type_list: list) -> list:
91
+ """
92
+ Flatten the list of types in a union into a single list.
93
+
94
+ Args:
95
+ type_list (list): The list of types in a union.
96
+
97
+ Returns:
98
+ list: The flattened list of types.
99
+
100
+ """
101
+ flat_list = []
102
+ for t in type_list:
103
+ if isinstance(t, list):
104
+ inner = self.flatten_union(t)
105
+ for u in inner:
106
+ if not u in flat_list:
107
+ flat_list.append(u)
108
+ elif not t in flat_list:
109
+ flat_list.append(t)
110
+ # consolidate array type instances
111
+ array_type = None
112
+ map_type = None
113
+ flat_list_1 = []
114
+ for t in flat_list:
115
+ if isinstance(t, dict) and 'type' in t and t['type'] == 'array' and 'items' in t:
116
+ if not array_type:
117
+ array_type = t
118
+ flat_list_1.append(t)
119
+ else:
120
+ array_type = self.merge_avro_schemas([array_type, t], [])
121
+ elif isinstance(t, dict) and 'type' in t and t['type'] == 'map' and 'values' in t:
122
+ if not map_type:
123
+ map_type = t
124
+ flat_list_1.append(t)
125
+ else:
126
+ map_type = self.merge_avro_schemas([map_type, t], [])
127
+ elif not t in flat_list_1:
128
+ flat_list_1.append(t)
129
+ return flat_list_1
130
+
131
    # pylint: disable=dangerous-default-value
    def merge_avro_schemas(self, schemas: list, avro_schemas: list, type_name: str | None = None, deps: List[str] = []) -> str | list | dict:
        """Merge multiple Avro type schemas into one.

        Args:
            schemas: The type schemas to merge (strings, lists, or dicts).
            avro_schemas: Already-registered schemas used to resolve
                string references by name.
            type_name: Optional name to force on the merged schema.
            deps: Accumulator for dependency names lifted out of the
                merged schemas (shared mutable default is deliberate here,
                see the pylint disable above).

        Returns:
            The merged schema: a dict, a union list when schemas are
            incompatible, or a plain string reference.
        """

        def split_merge(schema1, schema2, schema_list, offset):
            """ return the continuing schema merges of incompatible schemas """
            # merge each incompatible side separately with the rest of the list,
            # then union the two results
            remaining_schemas = schema_list[offset +
                                            1:] if len(schema_list) > offset else []
            if isinstance(schema2, dict) and 'dependencies' in schema2:
                deps.extend(schema2['dependencies'])
                del schema2['dependencies']
            if isinstance(schema1, dict) and 'dependencies' in schema1:
                deps.extend(schema1['dependencies'])
                del schema1['dependencies']
            schema1_merged = self.merge_avro_schemas(
                [schema2] + remaining_schemas, avro_schemas, type_name, deps)
            schema2_merged = self.merge_avro_schemas(
                [schema1] + remaining_schemas, avro_schemas, type_name, deps)
            if not self.is_empty_type(schema1_merged) and not self.is_empty_type(schema2_merged):
                return self.flatten_union([schema1_merged, schema2_merged])
            else:
                if not self.is_empty_type(schema1_merged):
                    return schema1_merged
                if not self.is_empty_type(schema2_merged):
                    return schema2_merged
                # if both are empty, we'll return an empty record
                return {'type': 'record', 'fields': []}

        merged_schema: dict = {}
        if len(schemas) == 1:
            # nothing to merge
            return schemas[0]
        if type_name:
            self.set_avro_type_value(merged_schema, 'name', type_name)
        for i, schema in enumerate(schemas):
            # work on a copy so the caller's schemas are never mutated
            schema = copy.deepcopy(schema)
            if isinstance(schema, dict) and 'dependencies' in schema:
                deps1: List[str] = merged_schema.get('dependencies', [])
                deps1.extend(schema['dependencies'])
                merged_schema['dependencies'] = deps1
            if (isinstance(schema, list) or isinstance(schema, dict)) and len(schema) == 0:
                # skip empty entries
                continue
            if isinstance(schema, str):
                # string entries are references; resolve against known schemas
                sch = next(
                    (s for s in avro_schemas if s.get('name') == schema), None)
                if sch:
                    merged_schema.update(sch)
                else:
                    merged_schema['type'] = schema
            elif isinstance(schema, list):
                # the incoming schema is a list, so it's a union
                if 'type' not in merged_schema:
                    merged_schema['type'] = schema
                else:
                    if isinstance(merged_schema['type'], list):
                        merged_schema['type'].extend(schema)
                    else:
                        if isinstance(merged_schema['type'], str):
                            if merged_schema['type'] == 'record' or merged_schema['type'] == 'enum' or merged_schema['type'] == 'fixed' \
                                    or merged_schema['type'] == 'map' or merged_schema['type'] == 'array':
                                # complex type vs. union: cannot be merged in place
                                return split_merge(merged_schema, schema, schemas, i)
                            else:
                                merged_schema['type'] = [merged_schema['type']]
                        else:
                            merged_schema['type'].extend(schema)
            elif schema and ('type' not in schema or 'type' not in merged_schema):
                # one side has no discriminator yet; shallow-merge the dicts
                merged_schema.update(schema)
            elif schema:
                if 'type' in merged_schema and schema['type'] != merged_schema['type']:
                    # incompatible discriminators: split the merge
                    return split_merge(merged_schema, schema, schemas, i)
                if not type_name:
                    # concatenate names when no explicit name was requested
                    self.set_avro_type_value(merged_schema, 'name', avro_name(
                        merged_schema.get('name', '') + schema.get('name', '')))
                if 'fields' in schema:
                    if 'fields' in merged_schema:
                        for field in schema['fields']:
                            # NOTE(review): a field with the same name but a
                            # different type is appended as a second field
                            # rather than unioned with the existing one —
                            # confirm duplicate field names are acceptable here.
                            if field not in merged_schema['fields']:
                                merged_schema['fields'].append(field)
                            else:
                                merged_schema_field = next(
                                    f for f in merged_schema['fields'] if f.get('name') == field.get('name'))
                                if merged_schema_field['type'] != field['type']:
                                    merged_schema_field['type'] = [
                                        field['type'], merged_schema_field['type']]
                                if 'doc' in field and 'doc' not in merged_schema_field:
                                    merged_schema_field['doc'] = field['doc']
                    else:
                        merged_schema['fields'] = schema['fields']
        if self.is_avro_complex_type(merged_schema) and 'namespace' in merged_schema:
            # arrays and maps are anonymous in Avro and carry no namespace
            if merged_schema['type'] in ['array', 'map']:
                del merged_schema['namespace']
        return merged_schema
222
+
223
    def merge_json_schemas(self, json_schemas: list[dict], intersect: bool = False) -> dict:
        """
        Merge multiple JSON schemas into one.

        Args:
            json_schemas (list[dict]): A list of JSON schemas to be merged.
            intersect (bool, optional): If True, only keep the intersection of the required fields. Defaults to False.

        Returns:
            dict: The merged JSON schema.
        """

        def merge_structures(schema1: dict, schema2: dict) -> dict | list:
            """ merge two JSON dicts recursively """
            if 'type' in schema1 and 'type' in schema2 and schema1['type'] != schema2['type']:
                # different discriminators cannot be merged; return a union
                return [schema1, schema2]
            schema1 = copy.deepcopy(schema1)
            for key in schema2:
                if key not in schema1:
                    schema1[key] = schema2[key]
                elif isinstance(schema1[key], dict) and isinstance(schema2[key], dict):
                    schema1[key] = merge_structures(schema1[key], schema2[key])
                elif isinstance(schema1[key], list) and isinstance(schema2[key], list):
                    schema1[key].extend(schema2[key])
                elif schema1[key] == schema2[key]:
                    # identical values need no merging
                    continue
                else:
                    # conflicting scalars become a list of alternatives
                    if isinstance(schema1[key], list):
                        if schema2[key] not in schema1[key]:
                            schema1[key].append(schema2[key])
                    else:
                        schema1[key] = [schema1[key], schema2[key]]
            return schema1

        merged_type: dict = {}

        for json_schema in json_schemas:
            if 'type' not in json_schema or 'type' not in merged_type:
                # untyped side: merge key by key
                for key in json_schema:
                    if not key in merged_type:
                        merged_type[key] = copy.deepcopy(json_schema[key])
                    else:
                        if key == 'required':
                            merged_type[key] = list(
                                set(merged_type[key]).union(set(json_schema[key])))
                        # NOTE(review): this is 'if', not 'elif', so the
                        # 'required' key also falls through into the chain
                        # below (harmless today since the union above already
                        # absorbed all entries) — confirm that is intended.
                        if key == 'name' or key == 'title' or key == 'description':
                            merged_type[key] = merged_type[key] + \
                                json_schema[key]
                        elif isinstance(merged_type[key], dict):
                            merged_type[key] = merge_structures(
                                merged_type[key], copy.deepcopy(json_schema[key]))
                        elif isinstance(merged_type[key], list) and isinstance(json_schema[key], list):
                            for item in json_schema[key]:
                                if item not in merged_type[key]:
                                    merged_type[key].append(item)
                        else:
                            if merged_type[key] is None:
                                merged_type[key] = json_schema[key]
                            else:
                                merged_type[key] = [merged_type[key],
                                                    copy.deepcopy(json_schema[key])]
            else:
                # both sides are typed: merge the well-known keywords
                if 'type' in merged_type and json_schema['type'] != merged_type['type']:
                    if isinstance(merged_type['type'], str):
                        merged_type['type'] = [merged_type['type']]
                    merged_type['type'].append(json_schema['type'])
                if 'required' in json_schema:
                    if 'required' in merged_type:
                        merged_type['required'] = list(
                            set(merged_type['required']).union(set(json_schema['required'])))
                    else:
                        merged_type['required'] = json_schema['required']
                if 'name' in json_schema:
                    if 'name' in merged_type:
                        merged_type['name'] = merged_type.get(
                            'name', '') + json_schema['name']
                    else:
                        merged_type['name'] = json_schema['name']
                if 'properties' in json_schema:
                    if 'properties' in merged_type:
                        for prop in json_schema['properties']:
                            if prop in merged_type['properties']:
                                merged_type['properties'][prop] = merge_structures(
                                    merged_type['properties'][prop], copy.deepcopy(json_schema['properties'][prop]))
                            else:
                                merged_type['properties'][prop] = json_schema['properties'][prop]
                    else:
                        merged_type['properties'] = json_schema['properties']
                if 'enum' in json_schema:
                    if 'enum' in merged_type:
                        merged_type['enum'] = list(
                            set(merged_type['enum']).union(set(json_schema['enum'])))
                    else:
                        merged_type['enum'] = json_schema['enum']
                if 'format' in json_schema:
                    if 'format' in merged_type:
                        merged_type['format'] = merged_type['format'] + \
                            json_schema['format']
                    else:
                        merged_type['format'] = json_schema['format']

        if intersect:
            # only keep the intersection of the required fields
            if 'required' in merged_type:
                new_required = merged_type['required']
                for json_schema in json_schemas:
                    new_required = list(set(new_required).intersection(
                        set(json_schema.get('required', []))))
                merged_type['required'] = new_required

        return merged_type
334
+
335
+ def ensure_type(self, type: dict | str | list) -> dict | str | list:
336
+ """
337
+ Ensures that the given type is valid by adding a 'type' field if it is missing.
338
+
339
+ Args:
340
+ type (dict | str | list): The type to ensure.
341
+
342
+ Returns:
343
+ dict | str | list: The ensured type.
344
+ """
345
+ if isinstance(type, str) or isinstance(type, list) or 'type' in type:
346
+ return type
347
+
348
+ type['type'] = generic_type()
349
+ return type
350
+
351
+ def json_schema_primitive_to_avro_type(self, json_primitive: str | list, format: str | None, enum: list | None, record_name: str, field_name: str, namespace: str, dependencies: list) -> str | dict[str, Any] | list:
352
+ """
353
+ Convert a JSON-schema primitive type to Avro primitive type.
354
+
355
+ Args:
356
+ json_primitive (str | list): The JSON-schema primitive type to be converted.
357
+ format (str | None): The format of the JSON primitive type, if applicable.
358
+ enum (list | None): The list of enum values, if applicable.
359
+ record_name (str): The name of the record.
360
+ field_name (str): The name of the field.
361
+ namespace (str): The namespace of the Avro type.
362
+ dependencies (list): The list of dependencies.
363
+
364
+ Returns:
365
+ str | dict[str,Any] | list: The converted Avro primitive type.
366
+
367
+ """
368
+ if isinstance(json_primitive, list):
369
+ if enum:
370
+ json_primitive = 'string'
371
+ else:
372
+ union = []
373
+ for item in json_primitive:
374
+ enum2 = item.get('enum') if isinstance(
375
+ item, dict) else None
376
+ format2 = item.get('format') if isinstance(
377
+ item, dict) else None
378
+ avro_primitive = self.json_schema_primitive_to_avro_type(
379
+ item, format2, enum2, record_name, field_name, self.compose_namespace(namespace, record_name, field_name), dependencies)
380
+ union.append(avro_primitive)
381
+ return union
382
+
383
+ if json_primitive == 'string':
384
+ avro_primitive = 'string'
385
+ elif json_primitive == 'integer':
386
+ avro_primitive = 'int'
387
+ if format == 'int64':
388
+ avro_primitive = 'long'
389
+ elif json_primitive == 'number':
390
+ avro_primitive = 'float'
391
+ elif json_primitive == 'boolean':
392
+ avro_primitive = 'boolean'
393
+ elif not format:
394
+ if isinstance(json_primitive, str):
395
+ dependencies.append(json_primitive)
396
+ avro_primitive = json_primitive
397
+
398
+ # if you've got { 'type': 'string', 'format': ['date-time', 'duration'] }, I'm sorry
399
+ if format and isinstance(format, str):
400
+ if format in ('date-time', 'date'):
401
+ avro_primitive = {'type': 'int', 'logicalType': 'date'}
402
+ elif format in ('time'):
403
+ avro_primitive = {'type': 'int', 'logicalType': 'time-millis'}
404
+ elif format in ('duration'):
405
+ avro_primitive = {'type': 'fixed',
406
+ 'size': 12, 'logicalType': 'duration'}
407
+ elif format in ('uuid'):
408
+ avro_primitive = {'type': 'string', 'logicalType': 'uuid'}
409
+
410
+ return avro_primitive
411
+
412
+ def fetch_content(self, url: str | ParseResult):
413
+ """
414
+ Fetches the content from the specified URL.
415
+
416
+ Args:
417
+ url (str or ParseResult): The URL to fetch the content from.
418
+
419
+ Returns:
420
+ str: The fetched content.
421
+
422
+ Raises:
423
+ requests.RequestException: If there is an error while making the HTTP request.
424
+ Exception: If there is an error while reading the file.
425
+
426
+ """
427
+ # Parse the URL to determine the scheme
428
+ if isinstance(url, str):
429
+ parsed_url = urlparse(url)
430
+ else:
431
+ parsed_url = url
432
+
433
+ if parsed_url.geturl() in self.content_cache:
434
+ return self.content_cache[parsed_url.geturl()]
435
+ scheme = parsed_url.scheme
436
+
437
+ # Handle HTTP and HTTPS URLs
438
+ if scheme in ['http', 'https']:
439
+ response = requests.get(url if isinstance(
440
+ url, str) else parsed_url.geturl(), timeout=30)
441
+ # Raises an HTTPError if the response status code is 4XX/5XX
442
+ response.raise_for_status()
443
+ self.content_cache[parsed_url.geturl()] = response.text
444
+ return response.text
445
+
446
+ # Handle file URLs
447
+ elif scheme == 'file':
448
+ # Remove the leading 'file://' from the path for compatibility
449
+ file_path = parsed_url.netloc
450
+ if not file_path:
451
+ file_path = parsed_url.path
452
+ # On Windows, a file URL might start with a '/' but it's not part of the actual path
453
+ if os.name == 'nt' and file_path.startswith('/'):
454
+ file_path = file_path[1:]
455
+ with open(file_path, 'r', encoding='utf-8') as file:
456
+ text = file.read()
457
+ self.content_cache[parsed_url.geturl()] = text
458
+ return text
459
+ else:
460
+ raise NotImplementedError(f'Unsupported URL scheme: {scheme}')
461
+
462
def resolve_reference(self, json_type: dict, base_uri: str, json_doc: dict) -> Tuple[dict, dict]:
    """
    Resolve a JSON Pointer reference or a JSON $ref reference.

    Args:
        json_type (dict): The JSON type containing the reference.
        base_uri (str): The base URI of the JSON document.
        json_doc (dict): The JSON document containing the reference.

    Returns:
        Tuple[dict, dict]: A tuple containing the resolved JSON schema and the
            original JSON schema document. Falls back to (json_type, json_doc)
            when the reference cannot be resolved to anything.

    Raises:
        Exception: If there is an error decoding JSON from the reference.
        Exception: If there is an error resolving the JSON Pointer reference.
    """
    try:
        ref = json_type['$ref']
        content = None
        url = urlparse(ref)
        if url.scheme:
            # absolute URL: fetch the referenced document directly
            content = self.fetch_content(ref)
        elif url.path:
            # relative path: resolve against the base URI before fetching
            file_uri = self.compose_uri(base_uri, url)
            content = self.fetch_content(file_uri)
        if content:
            try:
                json_schema_doc = json_schema = json.loads(content)
                # resolve the JSON Pointer reference, if any
                if url.fragment:
                    json_schema = jsonpointer.resolve_pointer(
                        json_schema, url.fragment)
                return json_schema, json_schema_doc
            except json.JSONDecodeError as decode_error:
                # FIX: chain the decode error instead of discarding it
                raise Exception(
                    f'Error decoding JSON from {ref}') from decode_error

        if url.fragment:
            # fragment-only reference: resolve within the current document
            json_pointer = unquote(url.fragment)
            ref_schema = jsonpointer.resolve_pointer(
                json_doc, json_pointer)
            if ref_schema:
                return ref_schema, json_doc
    except JsonPointerException as e:
        # FIX: chain the pointer error (previously `e` was captured but unused)
        raise Exception(
            f'Error resolving JSON Pointer reference for {base_uri}') from e
    return json_type, json_doc
509
+
510
def compose_uri(self, base_uri, url):
    """
    Combine a base URI with a (possibly relative) reference URL.

    Args:
        base_uri (str): The URI of the document the reference appears in.
        url (str | ParseResult): The reference to resolve.

    Returns:
        str: The absolute URI of the reference.
    """
    if isinstance(url, str):
        url = urlparse(url)
    if url.scheme:
        # already absolute
        return url.geturl()
    if not url.path and not url.netloc:
        # fragment-only (or empty) reference stays within the base document
        return base_uri
    if base_uri.startswith('file'):
        parsed_file_uri = urlparse(base_uri)
        # 'dir' renamed to avoid shadowing the builtin
        directory = os.path.dirname(
            parsed_file_uri.netloc if parsed_file_uri.netloc else parsed_file_uri.path)
        filename = os.path.join(directory, url.path)
        # BUG FIX: previously interpolated a literal placeholder instead of the
        # computed path, leaving `filename` unused and returning a bogus URI.
        file_uri = f'file://{filename}'
    else:
        # combine the base URI with the URL
        file_uri = urllib.parse.urljoin(base_uri, url.geturl())
    return file_uri
527
+
528
def get_field_type_name(self, field: dict) -> str:
    """Return a printable name for a field's declared Avro type."""
    declared = field['type']
    if isinstance(declared, str):
        # plain primitive or type reference
        return declared
    if isinstance(declared, list):
        # union: join the member names
        parts = []
        for member in declared:
            if isinstance(member, str):
                parts.append(member)
            elif isinstance(member, dict):
                parts.append(self.get_field_type_name(member))
            else:
                parts.append('union')
        return ', '.join(parts)
    if isinstance(declared, dict) and 'type' in declared:
        return declared['type']
    return 'union'
544
+
545
def json_type_to_avro_type(self, json_type: str | dict, record_name: str, field_name: str, namespace: str, dependencies: list, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list, recursion_depth=1) -> dict | list | str:
    """
    Convert a JSON type to Avro type.

    Handles primitives, arrays, objects, enums, const values, $ref references
    and the allOf/oneOf/anyOf composition keywords. Conditional keywords
    (if/then/else, dependentSchemas, dependentRequired) are not supported and
    are stripped with a warning.

    Args:
        json_type (str | dict): The JSON-schema type to convert.
        record_name (str): Name of the record this type belongs to.
        field_name (str): Name of the field this type belongs to (may be empty).
        namespace (str): Avro namespace for generated named types.
        dependencies (list): Output list; names of types this type depends on are appended.
        json_schema (dict): The enclosing JSON-schema document, used for $ref resolution.
        base_uri (str): Base URI against which relative $ref values are resolved.
        avro_schema (list): Top-level Avro schema list; standalone record/enum/fixed
            types are registered here.
        record_stack (list): Stack of record/field names currently being converted.
        recursion_depth (int): Recursion guard; conversion degrades to a generic
            type once self.max_recursion_depth is reached.

    Returns:
        dict | list | str: The Avro type as an inline dict, a union list, or a type-name string.
    """

    try:
        if recursion_depth >= self.max_recursion_depth:
            # degrade gracefully instead of recursing forever on deeply nested schemas
            print(
                f'WARNING: Maximum recursion depth reached for {record_name} at field {field_name}')
            return generic_type()

        avro_type: list | dict | str = {}
        local_name = avro_name(field_name if field_name else record_name)
        # remember whether this type had an anyOf before composition handling consumes it
        hasAnyOf = isinstance(json_type, dict) and 'anyOf' in json_type

        if isinstance(json_type, dict):

            json_object_type = json_type.get('type')
            if isinstance(json_object_type, list):
                # if the 'type' is a list, we map it back to a string
                # if the list has only one item or if the list has two items
                # and one of them is 'null'
                # otherwise, we will construct and inject a oneOf type
                # and split the type
                if len(json_object_type) == 1:
                    json_object_type = json_object_type[0]
                elif len(json_object_type) == 2 and 'null' in json_object_type:
                    if json_object_type[0] == 'null':
                        json_object_type = json_object_type[1]
                    else:
                        json_object_type = json_object_type[0]
                else:
                    oneof = []
                    for option in json_object_type:
                        if not option == 'null':
                            oneof.append({
                                'type': option
                            })
                    if len(oneof) > 0:
                        del json_type['type']
                        json_type['oneOf'] = oneof

            if 'if' in json_type or 'then' in json_type or 'else' in json_type or 'dependentSchemas' in json_type or 'dependentRequired' in json_type:
                # conditional composition has no Avro equivalent; strip the keywords
                print(
                    'WARNING: Conditional schema is not supported and will be ignored.')
                if 'if' in json_type:
                    del json_type['if']
                if 'then' in json_type:
                    del json_type['then']
                if 'else' in json_type:
                    del json_type['else']
                if 'dependentSchemas' in json_type:
                    del json_type['dependentSchemas']
                if 'dependentRequired' in json_type:
                    del json_type['dependentRequired']

            # base_type is this schema minus the composition keywords; the
            # keyword handlers below merge their options into copies of it
            base_type = json_type.copy()
            if 'oneOf' in base_type:
                del base_type['oneOf']
            if 'anyOf' in base_type:
                del base_type['anyOf']
            if 'allOf' in base_type:
                del base_type['allOf']
            json_types = []

            if 'allOf' in json_type:
                # if the json type is an allOf, we merge all types into one
                # this may be lossy if aspects of the types overlap but differ
                type_list = [copy.deepcopy(base_type)]
                for allof_option in json_type['allOf']:
                    # chase nested $refs until the option is fully inlined
                    while isinstance(allof_option, dict) and '$ref' in allof_option:
                        resolved_json_type, resolved_schema = self.resolve_reference(
                            allof_option, base_uri, json_schema)
                        del allof_option['$ref']
                        allof_option = self.merge_json_schemas(
                            [allof_option, resolved_json_type])
                    type_list.append(copy.deepcopy(allof_option))
                merged_type = self.merge_json_schemas(
                    type_list, intersect=False)
                json_types.append(merged_type)

            if 'oneOf' in json_type:
                # if the json type is a oneOf, we create a type union of all types
                if len(json_types) == 0:
                    type_to_process = copy.deepcopy(base_type)
                else:
                    type_to_process = copy.deepcopy(json_types.pop())
                json_types = []
                oneof = json_type['oneOf']
                if len(json_types) == 0:
                    for oneof_option in oneof:
                        if isinstance(oneof_option, dict) and 'type' in oneof_option and 'type' in type_to_process and not type_to_process.get('type') == oneof_option.get('type'):
                            # we can't merge these due to conflicting types, so we pass the option-type on as-is
                            json_types.append(oneof_option)
                        else:
                            json_types.append(self.merge_json_schemas(
                                [type_to_process, oneof_option], intersect=True))
                else:
                    # NOTE(review): json_types was reset to [] just above, so this
                    # branch appears unreachable — confirm before relying on it.
                    new_json_types = []
                    for oneof_option in oneof:
                        for json_type_option in json_types:
                            json_type_option = self.merge_json_schemas(
                                [json_type_option, oneof_option], intersect=True)
                            new_json_types.append(json_type_option)
                    json_types = new_json_types

            if 'anyOf' in json_type:
                types_to_process = json_types.copy() if len(json_types) > 0 else [
                    copy.deepcopy(base_type)]
                json_types = []
                for type_to_process in types_to_process:
                    type_list = [copy.deepcopy(type_to_process)]
                    # anyOf is a list of types where any number from 1 to all
                    # may match the data. Trouble with anyOf is that it doesn't
                    # really have a semantic interpretation in the context of Avro.
                    for anyof_option in json_type['anyOf']:
                        if isinstance(anyof_option, dict) and '$ref' in anyof_option:
                            # if we have a ref, we can't merge into the base type, so we pass it on as-is.
                            # into the JSON type list
                            json_types.append(copy.deepcopy(anyof_option))
                        else:
                            type_list.append(copy.deepcopy(anyof_option))
                    merged_type = self.merge_json_schemas(
                        type_list, intersect=False)
                    json_types.append(merged_type)

            if len(json_types) > 0:
                if len(json_types) == 1:
                    # composition collapsed into a single type: convert it directly
                    avro_type = self.json_type_to_avro_type(
                        json_types[0], record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                    if isinstance(avro_type, dict) and self.is_empty_type(avro_type) and not 'allOf' in json_type:
                        avro_type['type'] = generic_type()
                    avro_type = self.post_check_avro_type(
                        dependencies, avro_type)
                    return avro_type
                else:
                    try:
                        record_stack.append(
                            field_name if field_name else record_name)
                        subtypes = []
                        count = 1
                        type_deps: List[str] = []
                        for json_type_option in json_types:
                            if isinstance(json_type_option, dict) and '$ref' in json_type_option:
                                ref = json_type_option['$ref']
                                if ref in self.imported_types:
                                    # already resolved earlier: reuse the cached reference
                                    avro_subtype = self.imported_types[ref]
                                    subtypes.append(avro_subtype)
                                    type_deps.append(avro_subtype)
                                    continue

                            subtype_deps: List[str] = []
                            sub_field_name = avro_name(local_name + '_' + str(count)) if not isinstance(
                                json_type_option, dict) or not '$ref' in json_type_option else None
                            avro_subtype = self.json_type_to_avro_type(
                                json_type_option, record_name, sub_field_name, namespace, subtype_deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                            if not avro_subtype:
                                continue
                            if isinstance(avro_subtype, dict) and 'name' in avro_subtype and 'type' in avro_subtype and (avro_subtype['type'] == 'record' or avro_subtype['type'] == 'enum'):
                                # we have a standalone record or enum so we need to add it to the schema at the top-level
                                # and reference it as a dependency from the parent type if it's not already been added.
                                existing_type = next((t for t in avro_schema if t.get('name') == avro_subtype['name'] and t.get(
                                    'namespace') == avro_subtype.get('namespace')), None)
                                if not existing_type:
                                    if subtype_deps:
                                        if not 'dependencies' in avro_subtype:
                                            avro_subtype['dependencies'] = subtype_deps
                                        else:
                                            avro_subtype['dependencies'].extend(
                                                subtype_deps)
                                    if self.is_empty_type(avro_subtype):
                                        print(
                                            f'WARN: Standalone type {avro_subtype["name"]} is empty')
                                    if avro_subtype['type'] != 'enum' and avro_subtype['type'] != 'record' and avro_subtype['type'] != 'fixed':
                                        raise ValueError(
                                            f'WARN: Standalone type {avro_subtype["name"]} is not a record or enum or fixed type')
                                    avro_schema.append(avro_subtype)
                                # refer to the registered type by its qualified name
                                full_name = self.get_qualified_name(
                                    avro_subtype)
                                subtype_deps = [full_name]
                                avro_subtype = full_name
                            if isinstance(avro_subtype, dict) and 'dependencies' in avro_subtype:
                                subtype_deps.extend(
                                    avro_subtype['dependencies'])
                                del avro_subtype['dependencies']
                            if len(subtype_deps) > 0:
                                type_deps.extend(subtype_deps)
                            if not self.is_empty_type(avro_subtype):
                                if isinstance(avro_subtype, list):
                                    subtypes.extend(
                                        copy.deepcopy(avro_subtype))
                                else:
                                    subtypes.append(
                                        copy.deepcopy(avro_subtype))
                            count += 1
                        if len(type_deps) > 0:
                            dependencies.extend(type_deps)
                        if len(subtypes) == 1:
                            return self.post_check_avro_type(dependencies, subtypes[0])
                    finally:
                        record_stack.pop()

                    if hasAnyOf:
                        # if all subtypes are strings, they are either primitive types or type references
                        # which means there's nothing to merge, so we'll return the list of types
                        if all([isinstance(st, str) for st in subtypes]):
                            return self.post_check_avro_type(dependencies, subtypes)

                        # we now has a list of types that may match the data, but this would be
                        # an Avro union which is mutually exclusive. We will merge this list
                        # into a record type in postprocessing when all types are available
                        if not isinstance(avro_type, dict):
                            avro_type = {}
                        avro_type['unmerged_types'] = subtypes
                        avro_type['type'] = 'record'
                        avro_type['name'] = avro_name(local_name)
                        if local_name != avro_name(local_name):
                            avro_type['altnames'] = { 'json': local_name }
                        avro_type['namespace'] = namespace
                        avro_type['fields'] = []
                        if 'description' in json_type:
                            avro_type['doc'] = json_type['description']
                        json_type = {}
                    else:
                        return self.post_check_avro_type(dependencies, subtypes)

            # an object with properties but no explicit 'type' is still an object
            if 'properties' in json_type and not 'type' in json_type:
                json_type['type'] = 'object'

            if 'description' in json_type and isinstance(avro_type, dict):
                avro_type['doc'] = json_type['description']

            if 'title' in json_type and isinstance(avro_type, dict):
                self.set_avro_type_value(
                    avro_type, 'name', avro_name(json_type['title']))

            # first, pull in any referenced definitions and merge with this schema
            if '$ref' in json_type:
                # the $ref can indeed be a list as a result from a prior allOf/anyOf merge
                # if that is so, we will copy the type and process each $ref separately
                # and return the result as a list of types
                if isinstance(json_type['$ref'], list):
                    types = []
                    for ref in json_type['$ref']:
                        json_type_copy = copy.deepcopy(json_type)
                        json_type_copy['$ref'] = ref
                        types.append(self.json_type_to_avro_type(json_type_copy, record_name, field_name, namespace,
                                     dependencies, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1))
                    return self.post_check_avro_type(dependencies, types)

                ref = json_type['$ref']
                if ref in self.imported_types:
                    # reference was already resolved, so we can resolve the reference simply by returning the type
                    type_ref = copy.deepcopy(self.imported_types[ref])
                    if isinstance(type_ref, str):
                        dependencies.append(type_ref)
                    return self.post_check_avro_type(dependencies, type_ref)
                else:
                    new_base_uri = self.compose_uri(
                        base_uri, json_type['$ref'])
                    resolved_json_type, resolved_schema = self.resolve_reference(
                        json_type, base_uri, json_schema)
                    if self.is_empty_json_type(json_type):
                        # it's a standalone reference, so will import the type into the schema
                        # and reference it like it was in the same file
                        type_name = record_name
                        type_namespace = namespace
                        parsed_ref = urlparse(ref)
                        if parsed_ref.fragment:
                            type_name = avro_name(
                                parsed_ref.fragment.split('/')[-1])
                            sub_namespace = self.compose_namespace(
                                *parsed_ref.fragment.split('/')[2:-1])
                            type_namespace = self.compose_namespace(
                                self.root_namespace, sub_namespace)

                        # registering in imported_types ahead of resolving to prevent circular references.
                        # we only cache the type if it's forseeable that it is usable as a standalone type
                        # which means that it must be either a record or an enum or a fixed type when converted
                        # to Avro. That means we look for the presence of 'type', 'properties', 'allOf', 'anyOf',
                        # and 'enum' in the resolved type.
                        if resolved_json_type and (('type' in resolved_json_type and resolved_json_type['type'] == 'object') or 'properties' in resolved_json_type or 'enum' in resolved_json_type or
                                                   'allOf' in resolved_json_type or 'anyOf' in resolved_json_type):
                            self.imported_types[ref] = self.compose_namespace(
                                type_namespace, type_name)
                        # resolve type
                        deps: List[str] = []
                        resolved_avro_type: dict | list | str | None = self.json_type_to_avro_type(
                            resolved_json_type, type_name, '', type_namespace, deps, resolved_schema, new_base_uri, avro_schema, [], recursion_depth + 1)
                        if isinstance(resolved_avro_type, str):
                            dependencies.extend(deps)
                            return self.post_check_avro_type(dependencies, resolved_avro_type)
                        if isinstance(resolved_avro_type, list) or (not isinstance(resolved_avro_type, dict) or (not resolved_avro_type.get('type') == 'record' and not resolved_avro_type.get('type') == 'enum')):
                            if isinstance(resolved_avro_type, dict) and not 'type' in resolved_avro_type:
                                if isinstance(avro_type, dict):
                                    # the resolved type didn't have a type and avro_type is a dict,
                                    # so we assume it's a mixin into the type we found
                                    avro_type.update(resolved_avro_type)
                                    resolved_avro_type = None
                                else:
                                    # no 'type' definition for this field and we can't mix into the avro type,
                                    # so we fallback to a generic type
                                    print(
                                        f"WARNING: no 'type' definition for {ref} in record {record_name}: {json.dumps(resolved_avro_type)}")
                                    resolved_avro_type = generic_type()
                            elif isinstance(avro_type, str) and resolved_avro_type:
                                # this is a plain type reference
                                avro_type = resolved_avro_type
                                self.imported_types[ref] = avro_type
                                resolved_avro_type = None
                            if resolved_avro_type:
                                # this is not a record type that can stand on its own,
                                # so we remove the cached type entry
                                # and pass it on as an inline type
                                dependencies.extend(deps)
                                if ref in self.imported_types:
                                    del self.imported_types[ref]
                                avro_type = self.merge_avro_schemas(
                                    [avro_type, resolved_avro_type], avro_schema, local_name)
                                if isinstance(avro_type, dict) and 'name' in avro_type and not self.is_standalone_avro_type(avro_type):
                                    del avro_type['name']
                                return self.post_check_avro_type(dependencies, avro_type)
                        else:
                            avro_type = resolved_avro_type
                            self.imported_types[ref] = copy.deepcopy(
                                avro_type)

                        if len(deps) > 0:
                            if isinstance(avro_type, dict):
                                avro_type['dependencies'] = deps
                            else:
                                dependencies.extend(deps)

                        if self.is_standalone_avro_type(avro_type):
                            self.register_type(avro_schema, avro_type)
                            full_name = self.get_qualified_name(avro_type)
                            if ref in self.imported_types:
                                # update the import reference to the resolved type if it's cached
                                self.imported_types[ref] = full_name
                            dependencies.append(full_name)
                            avro_type = full_name
                    else:
                        del json_type['$ref']
                        # it's a reference within a definition, so we will turn this into an inline type
                        if isinstance(resolved_json_type, dict) and 'type' in resolved_json_type and json_type.get('type') and not json_type['type'] == resolved_json_type['type']:
                            # the types conflict, so we can't merge them
                            type1 = self.json_type_to_avro_type(
                                json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
                            type2 = self.json_type_to_avro_type(resolved_json_type, record_name, field_name, namespace,
                                                                dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
                            # if either of the types are empty, use just the other one
                            if not self.is_empty_type(type1) and not self.is_empty_type(type2):
                                return self.flatten_union([type1, type2])
                            if not self.is_empty_type(type1):
                                avro_type = type1
                                if isinstance(avro_type, list):
                                    return self.post_check_avro_type(dependencies, avro_type)
                            if not self.is_empty_type(type2):
                                avro_type = type2
                                if isinstance(avro_type, list):
                                    return self.post_check_avro_type(dependencies, avro_type)
                            json_type = {}
                        else:
                            json_type = self.merge_json_schemas(
                                [json_type, resolved_json_type])
                            avro_type = self.json_type_to_avro_type(
                                json_type, record_name, field_name, namespace, dependencies, resolved_schema, new_base_uri, avro_schema, record_stack, recursion_depth + 1)
                            json_type = {}
                        if ref in self.imported_types:
                            # update the import reference to the resolved type if it's cached
                            if isinstance(avro_type, dict) and 'name' in avro_type:
                                self.imported_types[ref] = avro_type['name']
                            else:
                                self.imported_types[ref] = avro_type

            # if 'const' is present, make this an enum
            if 'const' in json_type:
                const_list = json_type['const'] if isinstance(
                    json_type['const'], list) else [json_type['const']]
                avro_type = self.merge_avro_schemas([avro_type, self.create_enum_type(
                    local_name, namespace, const_list)], avro_schema, local_name)
            if json_object_type or 'enum' in json_type:
                if json_object_type == 'array':
                    if isinstance(json_type, dict) and 'items' in json_type:
                        deps = []
                        item_type = self.json_type_to_avro_type(
                            json_type['items'], record_name, field_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack, recursion_depth + 1)
                        if self.is_standalone_avro_type(item_type):
                            if isinstance(item_type, dict) and len(deps) > 0:
                                item_type['dependencies'] = deps
                            self.register_type(avro_schema, item_type)
                            dependencies.append(
                                self.get_qualified_name(item_type))
                        else:
                            dependencies.extend(deps)
                            if isinstance(item_type, dict) and not 'type' in item_type:
                                item_type = generic_type()
                            elif isinstance(item_type, str) and not item_type in primitive_types:
                                dependencies.append(item_type)
                            else:  # not a standalone type, but has a type definition, so we unwind that here
                                item_type = self.post_check_avro_type(
                                    dependencies, item_type)
                        avro_type = self.merge_avro_schemas(
                            [avro_type, self.create_array_type(item_type)], avro_schema, '')
                    else:
                        avro_type = self.merge_avro_schemas(
                            [avro_type, self.create_array_type(generic_type())], avro_schema, '')
                elif json_object_type and (json_object_type == 'object' or 'object' in json_object_type):
                    avro_record_type = self.json_schema_object_to_avro_record(
                        local_name, json_type, namespace, json_schema, base_uri, avro_schema, record_stack)
                    if isinstance(avro_record_type, list):
                        for record_entry in avro_record_type:
                            self.lift_dependencies_from_type(
                                record_entry, dependencies)
                    avro_type = self.merge_avro_schemas([avro_type, avro_record_type], avro_schema, avro_type.get(
                        'name', local_name) if isinstance(avro_type, dict) else local_name)
                    self.lift_dependencies_from_type(
                        avro_type, dependencies)
                elif 'enum' in json_type and (not 'type' in json_type or json_type['type'] == "string"):
                    # we skip all enums that are not of implicit or explicit type 'string'
                    enum = [avro_name(e) for e in json_type['enum'] if isinstance(
                        e, str) and e != '']
                    if len(enum) > 0:
                        # if the enum ends up empty (only non-strings in the enum), we will skip it
                        enum = list(set(enum))
                        if len(enum) > 0:
                            avro_type = self.create_enum_type(local_name, self.compose_namespace(
                                namespace, record_name + '_types'), enum)
                else:
                    avro_type = self.json_schema_primitive_to_avro_type(json_object_type, json_type.get(
                        'format'), json_type.get('enum'), record_name, field_name, namespace, dependencies)
        else:
            # json_type is not a dict here (a plain type-name string);
            # NOTE(review): the isinstance guard below looks defensive/for type
            # narrowing — confirm the dict branch is reachable on this path.
            if isinstance(json_type, dict):
                avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(json_type, json_type.get('format'), json_type.get(
                    'enum'), record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)
            else:
                avro_type = self.merge_avro_schemas([avro_type, self.json_schema_primitive_to_avro_type(
                    json_type, None, None, record_name, field_name, namespace, dependencies)], avro_schema, avro_type.get('name', local_name) if isinstance(avro_type, dict) else local_name)

        if isinstance(avro_type, dict) and 'name' in avro_type and 'type' in avro_type and not (avro_type['type'] in ['array', 'map']):
            if not 'namespace' in avro_type:
                avro_type['namespace'] = namespace
            # if an identically named type is already registered, reference it instead
            existing_type = next((t for t in avro_schema if t.get(
                'name') == avro_type['name'] and t.get('namespace') == avro_type.get('namespace')), None)
            if existing_type:
                existing_type_name = self.get_qualified_name(existing_type)
                if not existing_type_name in dependencies:
                    dependencies.append(existing_type_name)
                return existing_type_name
            self.set_avro_type_value(avro_type, 'name', local_name)

        # post-check on the avro type: if the type is a dict, and the 'type' is not
        # a record, enum, fixed, array, or map, we will just return the basic type
        # and push its dependencies up the stack
        avro_type = self.post_check_avro_type(dependencies, avro_type)

        if isinstance(avro_type, dict) and 'unmerged_types' in avro_type:
            # remember this type so postprocessing can merge the anyOf leftovers
            self.types_with_unmerged_types.append(avro_type)

        return avro_type
    except RecursionError as e:
        print(
            f"Recursion error while processing {namespace}:{record_name}:{field_name} with recursion depth {recursion_depth}")
        raise e
1007
+
1008
def post_check_avro_type(self, dependencies, avro_type):
    """
    Post-check the Avro type and push dependencies up the stack.

    If the type is a plain dict wrapper whose 'type' is not a named or
    container type (record, enum, fixed, array, map), unwrap it to the bare
    'type' value and move its dependencies into `dependencies`.

    Args:
        dependencies (list): Output list that receives lifted dependencies.
        avro_type (dict | list | str): The Avro type to check.

    Returns:
        dict | list | str: The (possibly unwrapped) Avro type.
    """
    # FIX: dropped the dead `isinstance(avro_type, list)` check — it could
    # never be true inside a branch already guarded by isinstance(..., dict).
    if isinstance(avro_type, dict) and 'type' in avro_type and avro_type['type'] not in ['array', 'map', 'record', 'enum', 'fixed']:
        if 'dependencies' in avro_type:
            dependencies.extend(avro_type['dependencies'])
        avro_type = avro_type['type']
    return avro_type
1015
+
1016
def register_type(self, avro_schema, avro_type) -> bool:
    """Register a type in the Avro schema; returns True when the type is (already) registered."""
    already_there = next(
        (entry for entry in avro_schema
         if entry.get('name') == avro_type['name'] and entry.get('namespace') == avro_type.get('namespace')),
        None)
    if already_there:
        # same name/namespace already present; nothing to add
        return True
    if self.is_empty_type(avro_type) and 'unmerged_types' not in avro_type:
        print(f'WARN: Standalone type {avro_type["name"]} is empty')
    if not self.is_standalone_avro_type(avro_type):
        return False
    avro_schema.append(avro_type)
    return True
1030
+
1031
def has_composition_keywords(self, json_object: dict) -> bool:
    """Check if the JSON object has any of the combining keywords: allOf, oneOf, anyOf."""
    if not isinstance(json_object, dict):
        return False
    return any(keyword in json_object for keyword in ('allOf', 'oneOf', 'anyOf'))
1034
+
1035
def has_enum_keyword(self, json_object: dict) -> bool:
    """Check if the JSON object is an enum."""
    if not isinstance(json_object, dict):
        return False
    return 'enum' in json_object
1038
+
1039
def is_array_object(self, json_object: dict) -> bool:
    """Check if the JSON object is an array object."""
    if not isinstance(json_object, dict):
        return False
    return json_object.get('type') == 'array'
1042
+
1043
+ def is_standalone_avro_type(self, avro_type: dict | list | str) -> bool:
1044
+ """Check if the Avro type is a standalone type."""
1045
+ return isinstance(avro_type, dict) and 'type' in avro_type and (avro_type['type'] in ['record', 'enum', 'fixed'])
1046
+
1047
def is_avro_complex_type(self, avro_type: dict) -> bool:
    """Check if the Avro type is a complex type (record, enum, fixed, array, or map)."""
    return avro_type.get('type') in ('record', 'enum', 'fixed', 'array', 'map')
1050
+
1051
+ def set_avro_type_value(self, avro_type: dict | list | str, name: str, value: dict | list | str):
1052
+ """Set a value in an Avro type."""
1053
+ if isinstance(avro_type, dict):
1054
+ if name == 'namespace' or name == 'name':
1055
+ if 'type' in avro_type:
1056
+ if not (avro_type['type'] in ['record', 'enum', 'fixed']):
1057
+ return
1058
+ avro_type[name] = value
1059
+
1060
def create_avro_record(self, name: str, namespace: str, fields: list) -> dict:
    """Create an Avro record type from a name, namespace and field list."""
    record = {'type': 'record'}
    record['name'] = avro_name(name)
    record['namespace'] = namespace
    record['fields'] = fields
    return record
1068
+
1069
+ def create_wrapper_record(self, wrapper_name: str, wrapper_namespace: str, wrapper_field: str, dependencies: list, avro_type: list | str | dict) -> dict:
1070
+ """Create a union wrapper type in Avro."""
1071
+ rec = self.create_avro_record(wrapper_name, wrapper_namespace, [
1072
+ {
1073
+ 'name': wrapper_field,
1074
+ 'type': avro_type
1075
+ }
1076
+ ])
1077
+ if len(dependencies) > 0:
1078
+ rec['dependencies'] = dependencies
1079
+ return rec
1080
+
1081
def create_enum_type(self, name: str, namespace: str, symbols: list) -> dict:
    """Create an Avro enum type."""
    # composition merges may have nested/duplicated the symbol list; flatten to a unique list
    unique_symbols = self.flatten_union(symbols)
    return {
        'type': 'enum',
        'name': name,
        'namespace': namespace,
        'symbols': [avro_name(symbol) for symbol in unique_symbols]
    }
1091
+
1092
+ def create_array_type(self, items: list | dict | str) -> dict:
1093
+ """Create an Avro array type."""
1094
+ return {
1095
+ 'type': 'array',
1096
+ 'items': items
1097
+ }
1098
+
1099
+ def create_map_type(self, values: list | dict | str) -> dict:
1100
+ """Create an Avro map type."""
1101
+ return {
1102
+ 'type': 'map',
1103
+ 'values': values
1104
+ }
1105
+
1106
+ def nullable(self, avro_type: list | dict | str) -> list | dict | str:
1107
+ """Wrap a type in a union with null."""
1108
+ if isinstance(avro_type, list):
1109
+ cp = avro_type.copy()
1110
+ cp.insert(0, 'null')
1111
+ return cp
1112
+ return ['null', avro_type]
1113
+
1114
+ def merge_description_into_doc(self, source_json: dict, target_avro: dict | list | str):
1115
+ """Merge a description in JSON into Avro doc."""
1116
+ if isinstance(source_json, dict) and 'description' in source_json and isinstance(target_avro, dict):
1117
+ target_avro['doc'] = target_avro['doc'] + ", " + \
1118
+ source_json['description'] if 'doc' in target_avro else source_json['description']
1119
+
1120
    def merge_dependencies_into_parent(self, dependencies: list, child_type: dict | list | str, parent_type: dict | list | str):
        """Merge dependencies from a child type into a parent type.

        Lifts any 'dependencies' entries off *child_type* into the
        *dependencies* list, then attaches that list to *parent_type*
        when the parent has no 'dependencies' key yet.
        """
        self.lift_dependencies_from_type(child_type, dependencies)
        if len(dependencies) > 0 and isinstance(parent_type, dict):
            if 'dependencies' in parent_type:
                # NOTE(review): this extends the caller's local list with the
                # parent's existing entries but never writes the merged list
                # back to parent_type — the parent keeps only its original
                # dependencies. Only correct if callers reuse `dependencies`
                # afterwards; verify against call sites.
                dependencies.extend(parent_type['dependencies'])
            else:
                # parent now shares the caller's list object (aliasing is
                # deliberate elsewhere in this converter)
                parent_type['dependencies'] = dependencies
1128
+
1129
+ def lift_dependencies_from_type(self, child_type: dict | list | str, dependencies: list):
1130
+ """Lift all dependencies from a type and return a new type with the dependencies lifted."""
1131
+ if isinstance(child_type, dict):
1132
+ if 'dependencies' in child_type:
1133
+ dependencies.extend(child_type['dependencies'])
1134
+ del child_type['dependencies']
1135
+
1136
+ def compose_namespace(self, *names) -> str:
1137
+ """Compose a namespace from a list of names."""
1138
+ return '.'.join([avro_namespace(n) for n in names if n])
1139
+
1140
+ def get_qualified_name(self, avro_type):
1141
+ """Get the qualified name of an Avro type."""
1142
+ return self.compose_namespace(avro_type.get('namespace', ''), avro_type.get('name', ''))
1143
+
1144
    def json_schema_object_to_avro_record(self, name: str, json_object: dict, namespace: str, json_schema: dict, base_uri: str, avro_schema: list, record_stack: list) -> dict | list | str | None:
        """Convert a JSON schema object declaration to an Avro record.

        Handles, in order: composition keywords (allOf/oneOf/anyOf), enums,
        standalone arrays, and plain object schemas with 'properties'.
        patternProperties/additionalProperties are folded into an alternate
        map representation alongside the record.

        Args:
            name: Preferred name of the resulting type ('title' is the fallback).
            json_object: The JSON schema node being converted.
            namespace: Avro namespace for the resulting type.
            json_schema: The full JSON schema document (for $ref resolution).
            base_uri: Base URI of the schema document.
            avro_schema: Top-level list of Avro types; standalone sub-types
                are registered into it as a side effect.
            record_stack: Names of records currently being converted; used to
                detect and break circular references.

        Returns:
            A dict (record/map/array), a list of types (record plus alternate
            map), a type-name string, or None when the node resolves to a
            skipped reference/primitive.

        Raises:
            ValueError: when no record name can be determined or a field
                resolves to no type at all.
        """
        dependencies: List[str] = []
        avro_type: list | dict | str = {}

        # handle top-level allOf, anyOf, oneOf
        if self.has_composition_keywords(json_object):
            # we will merge allOf, oneOf, anyOf into a union record type
            type = self.json_type_to_avro_type(
                json_object, name, '', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
            if isinstance(type, str):
                # we are skipping references and primitives
                return None
            if isinstance(type, list):
                # we should have a union type; wrap it so it can stand alone
                avro_type = self.create_wrapper_record(
                    name+"_union", self.utility_namespace, 'options', [], type)
            elif isinstance(type, dict) and 'type' in type and type['type'] != 'record':
                # merge the type into a record type if it's not a record type
                print(
                    f'INFO: Standalone type {name} is being wrapped in a record')
                avro_type = self.create_wrapper_record(avro_name(type.get(
                    'name', name)+'_wrapper'), self.utility_namespace, 'value', type.get('dependencies', []), type)
            else:
                avro_type = type
            # add external dependencies to the record
            self.merge_dependencies_into_parent(dependencies, type, avro_type)
            self.merge_description_into_doc(json_object, avro_type)
            # return the union type
            return avro_type

        if self.has_enum_keyword(json_object):
            # this is an enum
            avro_enum = self.create_enum_type(
                avro_name(name), namespace, json_object['enum'])
            self.merge_description_into_doc(json_object, avro_enum)
            return avro_enum

        if self.is_array_object(json_object):
            # this is an array, which can't be standalone in Avro, so we wrap it
            # into a record and include the array type inline
            print(
                f'WARN: Standalone array type {name} will be wrapped in a record')
            deps: List[str] = []
            array_type = self.json_type_to_avro_type(json_object, name, avro_name(
                name), namespace, deps, json_schema, base_uri, avro_schema, record_stack)
            avro_array = self.create_wrapper_record(
                avro_name(name+'_wrapper'), self.utility_namespace, 'items', [], array_type)
            self.merge_description_into_doc(json_object, avro_array)
            self.merge_dependencies_into_parent(deps, array_type, avro_array)
            return avro_array

        # at this point, we have to assume that we have a JSON schema object
        title = json_object.get('title')
        record_name = avro_name(name if name else title if title else None)
        if record_name is None:
            raise ValueError(
                f"Cannot determine record name for json_object {json_object}")
        if len(record_stack) > 0:
            # if we have a record stack, we need to add the current name to
            # the namespace since nested types are disambiguated by their namespace
            namespace = self.compose_namespace(
                namespace, record_stack[-1] + "_types")
        # at this point we have a record type
        avro_record = self.create_avro_record(record_name, namespace, [])
        # we need to prevent circular dependencies, so we will maintain a stack of the in-progress
        # records and will resolve the cycle as we go. if this record is already in the stack, we will
        # just return a reference to a record that contains this record
        if record_name in record_stack:
            # to break the cycle, we will use a containment type that references
            # the record that is being defined
            print(
                f'WARN: Circular dependency found for record {record_name}. Creating {record_name}_ref.')
            ref_name = avro_name(record_name + '_ref')
            return self.create_wrapper_record(ref_name, namespace, record_name, [], self.compose_namespace(namespace, record_name))
        try:
            # enter the record stack scope for this record
            record_stack.append(record_name)
            # collect the required fields so we can make those fields non-null
            required_fields = json_object.get('required', [])

            # field_refs mirrors avro_record['fields'] but uses qualified-name
            # references for standalone types; it feeds the alternate map below
            field_refs = []
            if 'properties' in json_object and isinstance(json_object['properties'], dict):
                # add the properties as fields
                for field_name, json_field_types in json_object['properties'].items():
                    if isinstance(json_field_types, bool):
                        # for "propertyname": true, we skip. schema bug.
                        continue
                    # normalize to a list so single types and unions share one path
                    if not isinstance(json_field_types, list):
                        json_field_types = [json_field_types]
                    field_type_list = []
                    field_ref_type_list = []
                    const = None
                    default = None
                    description = None
                    for json_field_type in json_field_types:
                        # skip fields with a bad or empty type
                        if not isinstance(json_field_type, dict):
                            continue
                        field_name = avro_name(field_name)
                        # last const wins if there are multiple
                        const = json_field_type.get('const', const)
                        # last default wins if there are multiple
                        default_value = json_field_type.get('default')
                        # scalar defaults only; dict/list defaults are dropped
                        if default_value and not isinstance(default_value, dict) and not isinstance(default_value, list):
                            default = default_value
                        # get the description from the field type
                        description = json_field_type.get('description', description)
                        # convert the JSON-type field to an Avro-type field
                        avro_field_ref_type = avro_field_type = self.ensure_type(self.json_type_to_avro_type(
                            json_field_type, record_name, field_name, namespace, dependencies, json_schema, base_uri, avro_schema, record_stack))
                        if isinstance(avro_field_type, list):
                            avro_field_type = self.flatten_union(
                                avro_field_type)
                            avro_field_ref_type = avro_field_type
                        elif isinstance(avro_field_type, dict):
                            self.lift_dependencies_from_type(
                                avro_field_type, dependencies)
                            # if the first call gave us a global type that got added to the schema, this call will give us a reference
                            if self.is_standalone_avro_type(avro_field_type):
                                avro_field_ref_type = self.get_qualified_name(
                                    avro_field_type)
                        if avro_field_type is None:
                            # None type is a problem
                            raise ValueError(
                                f"avro_field_type is None for field {field_name}")
                        if isinstance(avro_field_type, dict) and 'type' in avro_field_type and not self.is_avro_complex_type(avro_field_type):
                            # if the field type is a basic type, inline it
                            avro_field_type = avro_field_type['type']
                        field_type_list.append(avro_field_type)
                        field_ref_type_list.append(avro_field_ref_type)

                    # a single entry stays scalar; multiple entries form a union
                    effective_field_type = field_type_list[0] if len(
                        field_type_list) == 1 else field_type_list
                    effective_field_ref_type = field_ref_type_list[0] if len(
                        field_ref_type_list) == 1 else field_ref_type_list
                    # optional (non-required) fields get a null branch unless one exists
                    avro_field = {
                        'name': avro_name(field_name),
                        'type': self.nullable(effective_field_type) if not field_name in required_fields and 'null' not in effective_field_type else effective_field_type
                    }
                    if field_name != avro_name(field_name):
                        # preserve the original JSON property name when sanitized
                        avro_field['altnames'] = { "json": field_name }
                    if const:
                        avro_field['const'] = const
                    if default:
                        avro_field['default'] = default
                    if description:
                        avro_field['doc'] = description
                    # NOTE(review): this second append of avro_field_type looks
                    # redundant (the list was already finalized above) — verify
                    field_type_list.append(avro_field_type)
                    avro_field_ref = {
                        'name': avro_name(field_name),
                        'type': self.nullable(effective_field_ref_type) if not field_name in required_fields and 'null' not in effective_field_ref_type else effective_field_ref_type
                    }
                    if description:
                        avro_field_ref['doc'] = description
                    field_ref_type_list.append(avro_field_ref)
                    # add the field to the record
                    avro_record['fields'].append(avro_field)
                    field_refs.append(avro_field_ref)
            elif not 'additionalProperties' in json_object and not 'patternProperties' in json_object:
                if 'type' in json_object and (json_object['type'] == 'object' or 'object' in json_object['type']) and \
                        not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                    # we don't have any fields, but we have an object type, so we create a map
                    avro_record = self.create_map_type(generic_type())
                elif 'type' in json_object and (json_object['type'] == 'array' or 'array' in json_object['type']) and \
                        not 'allOf' in json_object and not 'oneOf' in json_object and not 'anyOf' in json_object:
                    # we don't have any fields, but we have an array type, so we create a record with an 'items' field
                    avro_record = self.create_array_type(
                        self.json_type_to_avro_type(
                            json_object['items'], record_name, 'values', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
                        if 'items' in json_object
                        else generic_type())
                else:
                    return json_object['type'] if 'type' in json_object else generic_type()

            extension_types = []
            prop_docs = ''
            if 'patternProperties' in json_object and isinstance(json_object['patternProperties'], dict) and len(json_object['patternProperties']) > 0:
                # pattern properties are represented as a record with field names that are the patterns
                pattern_props = json_object['patternProperties']
                for pattern_name, props in pattern_props.items():
                    deps = []
                    prop_type = self.ensure_type(self.json_type_to_avro_type(
                        props, record_name, pattern_name, namespace, deps, json_schema, base_uri, avro_schema, record_stack))
                    if self.is_standalone_avro_type(prop_type):
                        # register the standalone type globally and depend on it by name
                        self.lift_dependencies_from_type(prop_type, deps)
                        self.set_avro_type_value(
                            prop_type, 'namespace', namespace)
                        self.register_type(avro_schema, prop_type)
                        prop_type_ref = self.get_qualified_name(prop_type)
                        dependencies.append(prop_type_ref)
                    else:
                        dependencies.extend(deps)
                        if isinstance(prop_type, str) and not prop_type in primitive_types:
                            dependencies.append(prop_type)
                    if self.is_empty_type(prop_type):
                        prop_type = generic_type()
                    prop_docs += f"Name pattern '{pattern_name}': [{self.get_field_type_name({'type':prop_type})}]. "
                    extension_types.append(prop_type)

            if 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], bool):
                if True == json_object['additionalProperties']:
                    # open content model: any extra property of any type
                    prop_type = generic_type()
                    extension_types.append(prop_type)
            elif 'additionalProperties' in json_object and isinstance(json_object['additionalProperties'], dict) and len(json_object['additionalProperties']) > 0:
                # additional properties are represented as a map of string to the type of the value
                additional_props = json_object['additionalProperties']
                deps = []
                values_type = self.json_type_to_avro_type(
                    additional_props, record_name, record_name + '_extensions', namespace, dependencies, json_schema, base_uri, avro_schema, record_stack)
                if self.is_standalone_avro_type(values_type):
                    self.lift_dependencies_from_type(values_type, deps)
                    self.set_avro_type_value(
                        values_type, 'namespace', namespace)
                    self.register_type(avro_schema, values_type)
                    values_type_ref = self.get_qualified_name(values_type)
                    dependencies.append(values_type_ref)
                else:
                    dependencies.extend(deps)
                    if isinstance(values_type, str) and not values_type in primitive_types:
                        dependencies.append(values_type)
                if self.is_empty_type(values_type):
                    values_type = generic_type()
                prop_docs += f"Extra properties: [{self.get_field_type_name({'type':values_type})}]. "
                extension_types.append(values_type)
            self.merge_description_into_doc(json_object, avro_record)

            avro_alternate_record = None
            if extension_types:
                # Since Avro Schema does not allow fields with dynamic names
                # to appear alongside regular fields, we will union the types of all properties with the
                # type of the additionalProperties and document this in the record's description
                json_field_types = [field['type'] for field in field_refs]
                field_type_names = [
                    [field['name'], self.get_field_type_name(field)] for field in field_refs]
                field_type_name_list: str = ', '.join(
                    [f"'{field[0]}': [{field[1]}]" for field in field_type_names])
                json_field_types.extend(extension_types)
                json_field_types = self.flatten_union(json_field_types)
                if len(json_field_types) == 1:
                    json_field_types = json_field_types[0]
                doc = f"Alternate map: {field_type_name_list}. " if field_type_names else ''
                doc += prop_docs
                avro_alternate_record = self.create_map_type(json_field_types)
                if not self.is_empty_type(avro_record):
                    # link the map back to the record it stands in for
                    avro_alternate_record['alternateof'] = self.get_qualified_name(avro_record)
                    dependencies.append(
                        self.compose_namespace(namespace, record_name))
                avro_record['doc'] = doc if not 'doc' in avro_record else avro_record['doc'] + ', ' + doc

            if len(dependencies) > 0:
                # dedupe the list
                dependencies = list(set(dependencies))
                avro_record['dependencies'] = dependencies
        finally:
            # always leave the record stack scope, even on early return/raise
            record_stack.pop()
        if avro_alternate_record:
            if self.is_empty_type(avro_record):
                # there's no substantive content in the record,
                # so we will just return the alternate record, which
                # is a plain map
                return avro_alternate_record
            return [avro_record, avro_alternate_record]
        return avro_record
1408
+
1409
    def postprocess_schema(self, avro_schema: list) -> None:
        """Post-process the Avro schema for cases where we need a second pass.

        Resolves every type queued in self.types_with_unmerged_types: the
        queued type is located in the schema graph by name/namespace, its
        'unmerged_types' entries are merged into it, and the merged result is
        written back in place via set_schema_node.
        """
        if len(self.types_with_unmerged_types) > 0:
            # work on a snapshot; merging may queue further work via self state
            types_with_unmerged_types = copy.deepcopy(
                self.types_with_unmerged_types)
            self.types_with_unmerged_types = []
            for ref_type in types_with_unmerged_types:
                # find ref_type anywhere in the avro_schema graph, matching
                # on name and namespace.
                def find_fn(
                    t): return 'name' in t and t['name'] == ref_type['name'] and 'namespace' in t and t['namespace'] == ref_type['namespace']
                type = find_schema_node(find_fn, avro_schema)
                if not type:
                    raise ValueError(
                        f"Couldn't find type {ref_type['namespace']}.{ref_type['name']} in the Avro Schema.")
                # resolve the unmerged types
                local_name = type.get('name')
                # NOTE(review): .get('name') is called before this isinstance
                # guard; find_fn only matches dicts, so the guard looks
                # unreachable — verify against find_schema_node's contract
                if not isinstance(type, dict):
                    continue
                unmerged_types = type.get('unmerged_types', [])
                if len(unmerged_types) == 0:
                    # nothing to merge; just strip the marker key if present
                    if 'unmerged_types' in type:
                        del type['unmerged_types']
                    continue
                # the base for merging is the type itself, minus the marker
                base_type = copy.deepcopy(type)
                if 'unmerged_types' in base_type:
                    del base_type['unmerged_types']
                mergeable_types = [base_type]
                deps: List[str] = []
                self.lift_dependencies_from_type(type, deps)
                for item in unmerged_types:
                    # entries are either qualified-name strings or inline dicts
                    if isinstance(item, str):
                        found_avro_type = next(
                            (t for t in avro_schema if self.get_qualified_name(t) == item), None)
                        if not found_avro_type:
                            continue
                    elif isinstance(item, dict):
                        found_avro_type = item
                        self.lift_dependencies_from_type(found_avro_type, deps)
                    if isinstance(found_avro_type, dict):
                        candidate = found_avro_type
                        if 'unmerged_types' in candidate:
                            del candidate['unmerged_types']
                        mergeable_types.append(candidate)
                merge_result = self.merge_avro_schemas(
                    mergeable_types, avro_schema, local_name, deps)
                if isinstance(merge_result, dict):
                    merge_result['dependencies'] = deps
                    if 'unmerged_types' in merge_result:
                        del merge_result['unmerged_types']
                if isinstance(merge_result, list):
                    # the merge produced a union; Avro field containers need a
                    # record, so wrap the union in a single 'value' field
                    self.set_avro_type_value(
                        type, 'name', type['name'] + '_item')
                    self.set_avro_type_value(
                        type, 'fields', [{'name': 'value', 'type': merge_result}])
                    merge_result = copy.deepcopy(type)
                # replace the original node in the schema graph with the merge
                set_schema_node(find_fn, merge_result, avro_schema)
1467
+
1468
+ def process_definition_list(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema_list):
1469
+ """Process a schema definition list."""
1470
+ for sub_schema_name, schema in json_schema_list.items():
1471
+ if not isinstance(schema, dict) and not isinstance(schema, list):
1472
+ # skip items that are not schema definitions or lists
1473
+ continue
1474
+ if 'type' in schema or 'allOf' in schema or 'oneOf' in schema or 'anyOf' in schema or 'properties' in schema or 'enum' in schema or '$ref' in schema or 'additionalProperties' in schema or 'patternProperties' in schema:
1475
+ # this is a schema definition
1476
+ self.process_definition(
1477
+ json_schema, namespace, base_uri, avro_schema, record_stack, sub_schema_name, schema)
1478
+ continue
1479
+ # it's a schema definition list
1480
+ self.process_definition_list(
1481
+ json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema)
1482
+
1483
    def process_definition(self, json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema, is_root: bool = False) -> Tuple[str, str] | None:
        """Process a schema definition.

        Converts *schema* to Avro and registers the resulting type(s) into
        *avro_schema*. Returns a (namespace, name) tuple for the registered
        type, or None when nothing was registered (reference/primitive, or
        an already-existing type).
        """
        avro_schema_item = None
        avro_schema_item_list = self.json_schema_object_to_avro_record(
            schema_name, schema, namespace, json_schema, base_uri, avro_schema, record_stack)
        if not isinstance(avro_schema_item_list, list) and not isinstance(avro_schema_item_list, dict):
            # skip if the record couldn't be resolved
            return None
        # the call above usually returns a single record, but we pretend it's normally a list to handle allOf/anyOf/oneOf cases
        if isinstance(avro_schema_item_list, list) and is_root and len(avro_schema_item_list) > 1:
            # if we have multiple root-level records, we will wrap them all in a single record
            root_avro_schema_item = self.create_wrapper_record(
                schema_name+'_wrapper', namespace, 'root', [], avro_schema_item_list)
            for avro_schema_item in avro_schema_item_list:
                # lift each child's dependencies onto the wrapper
                self.merge_dependencies_into_parent(
                    [], avro_schema_item, root_avro_schema_item)
            self.register_type(avro_schema, root_avro_schema_item)
            return root_avro_schema_item['namespace'], root_avro_schema_item['name']
        elif not isinstance(avro_schema_item_list, list):
            # is not a list, so we'll wrap it in a list
            avro_schema_item_list = [avro_schema_item_list]
        for avro_schema_item in avro_schema_item_list:
            # add the item to the schema if it's not already there
            if isinstance(avro_schema_item, str):
                # plain type references don't need registration
                continue
            if isinstance(avro_schema_item, dict) and not 'name' in avro_schema_item:
                avro_schema_item['name'] = avro_name(schema_name)
            existing_type = next((t for t in avro_schema if t.get('name') == avro_schema_item['name'] and t.get(
                'namespace') == avro_schema_item.get('namespace')), None)
            if not existing_type:
                if (not self.is_empty_type(avro_schema_item) or 'unmerged_types' in avro_schema_item) and \
                        self.is_standalone_avro_type(avro_schema_item):
                    # we only register record/enum as type. the other defs are mix-ins
                    self.register_type(avro_schema, avro_schema_item)
                    return avro_schema_item['namespace'], avro_schema_item['name']
                elif is_root:
                    # at the root, we will wrap the type in a record to make it top-level
                    deps: List[str] = []
                    self.lift_dependencies_from_type(avro_schema_item, deps)
                    avro_schema_wrapper = self.create_wrapper_record(schema_name, avro_schema_item.get(
                        'namespace', namespace), avro_schema_item['name'], deps, avro_schema_item)
                    if len(deps) > 0:
                        avro_schema_wrapper['dependencies'] = deps
                    avro_schema_item = avro_schema_wrapper
                    self.register_type(avro_schema, avro_schema_item)
                    return avro_schema_item['namespace'], avro_schema_item['name']
        return None
1530
+
1531
+ def id_to_avro_namespace(self, id: str) -> str:
1532
+ """Convert a XSD namespace to Avro Namespace."""
1533
+ parsed_url = urlparse(id)
1534
+ # strip the file extension
1535
+ path = parsed_url.path.rsplit('.')[0]
1536
+ path_segments = path.strip('/').replace('-', '_').split('/')
1537
+ reversed_path_segments = reversed(path_segments)
1538
+ namespace_suffix = self.compose_namespace(*reversed_path_segments)
1539
+ if parsed_url.hostname:
1540
+ namespace_prefix = self.compose_namespace(
1541
+ *reversed(parsed_url.hostname.split('.')))
1542
+ namespace = self.compose_namespace(namespace_prefix, namespace_suffix)
1543
+ return namespace
1544
+
1545
    def jsons_to_avro(self, json_schema: dict | list, namespace: str, base_uri: str) -> list | dict | str:
        """Convert a JSON-schema to an Avro-schema.

        Processes 'definitions'/'$defs' blocks, then a root-level schema if
        present, runs the unmerged-types postprocessing pass, and finally
        either splits, inlines, or dependency-sorts the resulting types.

        Args:
            json_schema: The parsed JSON schema document (dict or list form).
            namespace: Default Avro namespace for generated types.
            base_uri: URI of the schema document (its fragment, if any,
                selects a single definition to return).

        Returns:
            A list of Avro types, a single type dict, or a type name string.
        """
        avro_schema: List[dict] = []
        record_stack: List[str] = []

        parsed_url = urlparse(base_uri)
        schema_name = self.root_class_name

        if isinstance(json_schema, dict) and ('definitions' in json_schema or '$defs' in json_schema):
            # this is a swagger file or has a 'definitions' block
            json_schema_defs = json_schema.get(
                'definitions', json_schema.get('$defs', []))
            for def_schema_name, schema in json_schema_defs.items():
                if 'type' in schema or 'allOf' in schema or 'oneOf' in schema or 'anyOf' in schema or 'properties' in schema or 'enum' in schema or '$ref' in schema or 'additionalProperties' in schema or 'patternProperties' in schema:
                    # this is a schema definition
                    self.process_definition(
                        json_schema, namespace, base_uri, avro_schema, record_stack, def_schema_name, schema)
                else:
                    # it's a schema definition list
                    self.process_definition_list(
                        json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, schema.copy())
        elif isinstance(json_schema, list):
            # this is a schema definition list
            self.process_definition_list(
                json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema)

        root_namespace = None
        root_name = None
        # NOTE(review): due to and/or precedence the isinstance() guard only
        # covers the 'type' test; for a list json_schema the 'allOf'/'oneOf'
        # membership checks run against the list — confirm this is intended
        if isinstance(json_schema, dict) and 'type' in json_schema or 'allOf' in json_schema or 'oneOf' in json_schema or 'anyOf' in json_schema or 'properties' in json_schema:
            # this is a schema definition
            if isinstance(json_schema, dict) and '$ref' in json_schema:
                # if there is a $ref at the root level, resolve the reference and merge it with the current schema
                ref = json_schema['$ref']
                if ref:
                    ref_schema, json_doc = self.resolve_reference(
                        json_schema, base_uri, json_schema)
                    json_schema = self.merge_json_schemas(
                        [json_schema, ref_schema], intersect=False)
            root_info = self.process_definition(
                json_schema, namespace, base_uri, avro_schema, record_stack, schema_name, json_schema, is_root=True)
            if root_info:
                root_namespace, root_name = root_info

        # postprocessing pass
        self.postprocess_schema(avro_schema)

        if isinstance(avro_schema, list) and len(avro_schema) > 1 and self.split_top_level_records:
            # emit each top-level record as a self-contained type with all
            # of its dependencies inlined
            new_avro_schema = []
            for item in avro_schema:
                if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                    # we need to make a copy since the inlining operation shuffles types
                    schema_copy = copy.deepcopy(avro_schema)
                    # find the item with the same name and namespace in the copy
                    found_item = next((t for t in schema_copy if t.get(
                        'name') == item['name'] and t.get('namespace') == item.get('namespace')), None)
                    if found_item:
                        # inline all dependencies of the item
                        inline_dependencies_of(schema_copy, found_item)
                        new_avro_schema.append(found_item)
            avro_schema = new_avro_schema
        else:
            # sort the records by their dependencies
            if root_name and root_namespace and not ('definitions' in json_schema or '$defs' in json_schema):
                # inline all dependencies if this is a doc with only a root level definition
                root = find_schema_node(
                    lambda t: 'name' in t and t['name'] == root_name and 'namespace' in t and t['namespace'] == root_namespace, avro_schema)
                inline_dependencies_of(avro_schema, root)
                return root
            else:
                avro_schema = sort_messages_by_dependencies(avro_schema)

        if parsed_url.fragment and isinstance(json_schema, dict):
            # if the fragment is present in the URL, it's a reference to a schema definition
            # so we will resolve that reference and return a type
            self.imported_types.clear()
            fragment_schema: List[dict] = []
            json_pointer = parsed_url.fragment
            schema_name = parsed_url.fragment.split('/')[-1]
            schema = jsonpointer.resolve_pointer(json_schema, json_pointer)
            avro_schema_item = self.json_schema_object_to_avro_record(
                schema_name, schema, namespace, json_schema, base_uri, fragment_schema, record_stack)
            if avro_schema_item:
                # we roll all the types into this record as the top level type
                inline_dependencies_of(avro_schema, avro_schema_item)
                return avro_schema_item

        return avro_schema
1632
+
1633
    def convert_jsons_to_avro(self, json_schema_file_path: str, avro_schema_path: str, namespace: str | None = None, utility_namespace: str | None = None) -> list | dict | str:
        """Convert JSON schema file to Avro schema file.

        Fetches and parses the schema, derives namespaces, converts via
        jsons_to_avro, and writes the result to *avro_schema_path* (one
        .avsc per record when split_top_level_records is set).

        Args:
            json_schema_file_path: Local path or URL of the JSON schema.
            avro_schema_path: Output file path (or directory when splitting).
            namespace: Avro namespace; derived from the file name / '$id'
                when not given.
            utility_namespace: Namespace for helper types; defaults to
                '<root_namespace>.utility'.

        Returns:
            The converted Avro schema (also written to disk).
        """
        # turn the file path into a file URI if it's not a URI already
        parsed_url = urlparse(json_schema_file_path)
        if not parsed_url.hostname and not parsed_url.scheme == 'file':
            json_schema_file_path = 'file://' + json_schema_file_path
            parsed_url = urlparse(json_schema_file_path)
        content = self.fetch_content(parsed_url.geturl())
        json_schema = json.loads(content)

        if not namespace:
            # fall back to the file's base name (sanitized) as the namespace
            namespace = parsed_url.geturl().replace('\\', '/').replace('-',
                                                                       '_').split('/')[-1].split('.')[0]
            # get the $id if present
            if '$id' in json_schema:
                namespace = self.id_to_avro_namespace(json_schema['$id'])
        self.root_namespace = namespace
        if utility_namespace:
            self.utility_namespace = utility_namespace
        else:
            self.utility_namespace = self.root_namespace + '.utility'

        # drop the file name from the parsed URL to get the base URI
        avro_schema = self.jsons_to_avro(
            json_schema, namespace, parsed_url.geturl())
        # unwrap a single-type result
        # NOTE(review): jsons_to_avro may also return a dict here; len() then
        # counts keys, so the unwrap only triggers for 1-element lists in
        # practice — confirm
        if len(avro_schema) == 1:
            avro_schema = avro_schema[0]

        # create the directory for the Avro schema file if it doesn't exist
        dir = os.path.dirname(
            avro_schema_path) if not self.split_top_level_records else avro_schema_path
        if dir != '' and not os.path.exists(dir):
            os.makedirs(dir, exist_ok=True)
        if self.split_top_level_records:
            # if we are splitting top level records, we will create a file for each record
            for item in avro_schema:
                if isinstance(item, dict) and 'type' in item and item['type'] == 'record':
                    schema_file_path = os.path.join(
                        dir, item['name'] + '.avsc')
                    with open(schema_file_path, 'w') as avro_file:
                        json.dump(item, avro_file, indent=4)
        else:
            with open(avro_schema_path, 'w') as avro_file:
                json.dump(avro_schema, avro_file, indent=4)
        return avro_schema
1678
+
1679
+
1680
+ def convert_jsons_to_avro(json_schema_file_path: str, avro_schema_path: str, namespace: str = '', utility_namespace='', root_class_name='', split_top_level_records=False) -> list | dict | str:
1681
+ """Convert JSON schema file to Avro schema file."""
1682
+
1683
+ if not json_schema_file_path:
1684
+ raise ValueError('JSON schema file path is required')
1685
+ if not json_schema_file_path.startswith('http'):
1686
+ if not os.path.exists(json_schema_file_path):
1687
+ raise FileNotFoundError(f'JSON schema file {json_schema_file_path} not found')
1688
+
1689
+ try:
1690
+ converter = JsonToAvroConverter()
1691
+ converter.split_top_level_records = split_top_level_records
1692
+ if root_class_name:
1693
+ converter.root_class_name = root_class_name
1694
+ return converter.convert_jsons_to_avro(json_schema_file_path, avro_schema_path, namespace, utility_namespace)
1695
+ except Exception as e:
1696
+ print(
1697
+ f'Error converting JSON {json_schema_file_path} to Avro: {e.args[0]}')
1698
+ return []