structurize 2.16.2__py3-none-any.whl → 2.16.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. avrotize/__init__.py +63 -63
  2. avrotize/__main__.py +5 -5
  3. avrotize/_version.py +34 -34
  4. avrotize/asn1toavro.py +160 -160
  5. avrotize/avrotize.py +152 -152
  6. avrotize/avrotocpp.py +483 -483
  7. avrotize/avrotocsharp.py +992 -992
  8. avrotize/avrotocsv.py +121 -121
  9. avrotize/avrotodatapackage.py +173 -173
  10. avrotize/avrotodb.py +1383 -1383
  11. avrotize/avrotogo.py +476 -476
  12. avrotize/avrotographql.py +197 -197
  13. avrotize/avrotoiceberg.py +210 -210
  14. avrotize/avrotojava.py +1023 -1023
  15. avrotize/avrotojs.py +250 -250
  16. avrotize/avrotojsons.py +481 -481
  17. avrotize/avrotojstruct.py +345 -345
  18. avrotize/avrotokusto.py +363 -363
  19. avrotize/avrotomd.py +137 -137
  20. avrotize/avrotools.py +168 -168
  21. avrotize/avrotoparquet.py +208 -208
  22. avrotize/avrotoproto.py +358 -358
  23. avrotize/avrotopython.py +622 -622
  24. avrotize/avrotorust.py +435 -435
  25. avrotize/avrotots.py +598 -598
  26. avrotize/avrotoxsd.py +344 -344
  27. avrotize/commands.json +2493 -2433
  28. avrotize/common.py +828 -828
  29. avrotize/constants.py +4 -4
  30. avrotize/csvtoavro.py +131 -131
  31. avrotize/datapackagetoavro.py +76 -76
  32. avrotize/dependency_resolver.py +348 -348
  33. avrotize/jsonstoavro.py +1698 -1698
  34. avrotize/jsonstostructure.py +2642 -2642
  35. avrotize/jstructtoavro.py +878 -878
  36. avrotize/kstructtoavro.py +93 -93
  37. avrotize/kustotoavro.py +455 -455
  38. avrotize/parquettoavro.py +157 -157
  39. avrotize/proto2parser.py +497 -497
  40. avrotize/proto3parser.py +402 -402
  41. avrotize/prototoavro.py +382 -382
  42. avrotize/structuretocsharp.py +2005 -2005
  43. avrotize/structuretojsons.py +498 -498
  44. avrotize/structuretopython.py +772 -772
  45. avrotize/structuretots.py +653 -0
  46. avrotize/xsdtoavro.py +413 -413
  47. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/METADATA +848 -805
  48. structurize-2.16.5.dist-info/RECORD +52 -0
  49. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/licenses/LICENSE +200 -200
  50. structurize-2.16.2.dist-info/RECORD +0 -51
  51. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/WHEEL +0 -0
  52. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/entry_points.txt +0 -0
  53. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/top_level.txt +0 -0
avrotize/avrotoxsd.py CHANGED
@@ -1,344 +1,344 @@
1
- from functools import reduce
2
- import json
3
- from typing import Dict, List
4
- import xml.etree.ElementTree as ET
5
- from xml.etree.ElementTree import Element, SubElement, tostring
6
- from xml.dom import minidom
7
-
8
- from avrotize.common import is_generic_avro_type
9
-
10
- class AvroToXSD:
11
- def __init__(self, target_namespace: str = ''):
12
- self.xmlns = {"xs": "http://www.w3.org/2001/XMLSchema"}
13
- self.union_types: Dict[str, str] = {}
14
- self.known_types: List[str] = []
15
- self.common_namespace = ''
16
- self.target_namespace = target_namespace
17
-
18
- def find_common_namespace(self, namespaces: List[str]) -> str:
19
- """
20
- Find the common namespace prefix from a list of namespaces.
21
- """
22
- if not namespaces:
23
- return ''
24
-
25
- def common_prefix(a, b):
26
- prefix = ''
27
- for a_char, b_char in zip(a.split('.'), b.split('.')):
28
- if a_char == b_char:
29
- prefix += a_char + '.'
30
- else:
31
- break
32
- return prefix.rstrip('.')
33
-
34
- return reduce(common_prefix, namespaces)
35
-
36
- def update_common_namespace(self, namespace: str) -> None:
37
- """
38
- Update the common namespace based on the provided namespace.
39
- """
40
- if not self.common_namespace:
41
- self.common_namespace = namespace
42
- else:
43
- self.common_namespace = self.find_common_namespace([self.common_namespace, namespace])
44
-
45
- def convert_avro_primitive(self, avro_type: str | dict) -> str:
46
- """Map Avro primitive types to XML schema (XSD) data types."""
47
-
48
- if isinstance(avro_type, dict) and 'logicalType' in avro_type:
49
- type = avro_type['type']
50
- logical_type = avro_type.get('logicalType')
51
-
52
- if logical_type == 'decimal':
53
- return f"decimal"
54
- if logical_type == 'timestamp-millis':
55
- return f"dateTime"
56
- if logical_type == 'date':
57
- return f"date"
58
- if logical_type in {'time-millis', 'time-micros'}:
59
- return f"time"
60
- if logical_type == 'uuid':
61
- return f"string"
62
- elif isinstance(avro_type, str):
63
- mapping = {
64
- 'null': 'string', # Defaulting to string for nullables
65
- 'boolean': 'boolean',
66
- 'int': 'integer',
67
- 'long': 'long',
68
- 'float': 'float',
69
- 'double': 'double',
70
- 'bytes': 'hexBinary',
71
- 'string': 'string',
72
- }
73
-
74
- type = mapping.get(avro_type, '') # Fallback to string
75
- if type:
76
- return f"xs:{type}"
77
- else:
78
- return avro_type.split('.')[-1]
79
- return f"xs:string"
80
-
81
- def is_avro_primitive(self, avro_type: str) -> bool:
82
- """Check if the Avro type is a primitive type."""
83
- if isinstance(avro_type, dict) and 'logicalType' in avro_type and 'type' in avro_type:
84
- return avro_type['type'] in {'int', 'long', 'float', 'double', 'bytes', 'string'}
85
- elif isinstance(avro_type, str):
86
- return avro_type in {'null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string'}
87
- else:
88
- return False
89
-
90
- def create_element(self, parent: Element, tag: str, **attributes) -> Element:
91
- """Create an XML element with the proper namespace."""
92
- return SubElement(parent, f"{{{self.xmlns['xs']}}}{tag}", **attributes)
93
-
94
- def create_complex_type(self, parent: Element, **attributes) -> Element:
95
- """Create an XML complexType element."""
96
- return self.create_element(parent, "complexType", **attributes)
97
-
98
- def create_fixed(self, schema_root, field_type):
99
- """ handle Avro 'fixed' type"""
100
- simple_type = self.create_element(schema_root, "simpleType", name=field_type['name'])
101
- restriction = self.create_element(simple_type, "restriction", base="xs:hexBinary")
102
- restriction.set("fixed", "true")
103
- restriction.set("value", field_type['size'])
104
-
105
- def create_map(self, schema_root: ET.Element, record_name: str, parent: ET.Element, map_schema: dict):
106
- """ handle Avro 'map' type"""
107
- complex_type = self.create_element(parent, "complexType")
108
- sequence = self.create_element(complex_type, "sequence")
109
- item = self.create_element(sequence, "element", name="item", minOccurs="0", maxOccurs="unbounded")
110
- inner_complex_type = self.create_element(item, "complexType")
111
- inner_sequence = self.create_element(inner_complex_type, "sequence")
112
- self.create_element(inner_sequence, "element", name="key", type="xs:string")
113
- map_values = map_schema['values']
114
- if isinstance(map_values, list):
115
- item_value = self.create_union(schema_root, record_name, inner_sequence, "value", map_values)
116
- else:
117
- item_value = self.create_element(inner_sequence, "element", name="value")
118
- self.set_field_type(schema_root, record_name, item_value, map_schema['values'])
119
-
120
- def create_union(self, schema_root: ET.Element, record_name: str, parent: ET.Element, field_name: str, field_type: list, insert_annotation: lambda e: None | None = None) -> ET.Element:
121
- """Create an XML element for union types."""
122
-
123
- def create_or_get_union_simple_type(self, schema_root: ET.Element, parent:Element, types: List[str], **attributes) -> str:
124
- """Create an XML simpleType element for union types."""
125
-
126
- type_key_list = types.copy()
127
- type_key_list.sort()
128
- type_key = ''.join(type_key_list)
129
- if type_key in self.union_types:
130
- return self.union_types[type_key]
131
-
132
- name = "And".join([t.capitalize() for t in types])
133
- simple_type = self.create_element(schema_root, "simpleType", **attributes)
134
- simple_type.set("name", name)
135
- union = self.create_element(simple_type, "union")
136
- union.set("memberTypes", ' '.join([self.convert_avro_primitive(t) for t in types if t != 'null']))
137
- self.union_types[type_key] = name
138
- return name
139
-
140
- if isinstance(field_type, list) and is_generic_avro_type(field_type):
141
- element = self.create_element(parent, "any", minOccurs="0", maxOccurs="unbounded")
142
- if insert_annotation:
143
- insert_annotation(element)
144
- return element
145
-
146
- non_null_types = [t for t in field_type if t != 'null']
147
- if len(non_null_types) == 1:
148
- element = self.create_element(parent, "element", name=field_name)
149
- if insert_annotation:
150
- insert_annotation(element)
151
- self.set_field_type(schema_root, record_name, element, non_null_types[0])
152
- return element
153
- else:
154
- element = self.create_element(parent, "element", name=field_name)
155
- if insert_annotation:
156
- insert_annotation(element)
157
- primitives = [t for t in non_null_types if self.is_avro_primitive(t)]
158
- for primitive in primitives:
159
- non_null_types.remove(primitive)
160
- union_type_ref = ''
161
- if len(primitives) > 0:
162
- union_type_ref = create_or_get_union_simple_type(self, schema_root, parent, primitives)
163
- if len(non_null_types) == 0:
164
- element.set('type', union_type_ref)
165
- if len(non_null_types) > 0:
166
- abstract_complex_type_name = record_name+field_name.capitalize()
167
- element.set('type', abstract_complex_type_name)
168
- if not abstract_complex_type_name in self.known_types:
169
- self.known_types.append(abstract_complex_type_name)
170
- self.create_element(schema_root, "complexType", name=abstract_complex_type_name, abstract="true")
171
- if union_type_ref:
172
- complex_content_option = self.create_element(schema_root, "complexType", name=abstract_complex_type_name+'1')
173
- complex_content = self.create_element(complex_content_option, "complexContent")
174
- complex_extension = self.create_element(complex_content, "extension", base=abstract_complex_type_name)
175
- complex_sequence = self.create_element(complex_extension, "sequence")
176
- complex_element = self.create_element(complex_sequence, "element", name='value', type=union_type_ref)
177
- for i, union_type in enumerate(non_null_types):
178
- complex_content_option = self.create_element(schema_root, "complexType", name=abstract_complex_type_name+str(i+2))
179
- complex_content = self.create_element(complex_content_option, "complexContent")
180
- complex_extension = self.create_element(complex_content, "extension", base=abstract_complex_type_name)
181
- complex_sequence = self.create_element(complex_extension, "sequence")
182
- complex_element = self.create_element(complex_sequence, "element", name=field_name)
183
- self.set_field_type(schema_root, record_name, complex_element, union_type)
184
- return element
185
-
186
- def create_array(self, schema_root: ET.Element, record_name: str, parent: ET.Element, array_schema: dict):
187
- """ handle Avro 'array' type """
188
- complex_type = self.create_element(parent, "complexType")
189
- sequence = self.create_element(complex_type, "sequence")
190
- item_type = array_schema['items']
191
- if isinstance(item_type, list):
192
- item = self.create_union(schema_root, record_name, sequence, "item", item_type)
193
- item.set('minOccurs', '0')
194
- item.set('maxOccurs', 'unbounded')
195
- else:
196
- item = self.create_element(sequence, "element", name="item", minOccurs="0", maxOccurs="unbounded")
197
- self.set_field_type(schema_root, record_name, item, item_type)
198
-
199
- def create_enum(self, schema_root: ET.Element, enum_schema: dict) -> str:
200
- """Convert an Avro enum to an XML simpleType."""
201
- name = enum_schema['name']
202
- doc = enum_schema.get('doc', '')
203
- if name in self.known_types:
204
- return name
205
- simple_type = self.create_element(schema_root, "simpleType")
206
- if doc:
207
- annotation = self.create_element(simple_type, "annotation")
208
- documentation = self.create_element(annotation, "documentation")
209
- documentation.text = doc
210
- simple_type.set('name', name)
211
- restriction = self.create_element(simple_type, "restriction", base="xs:string")
212
- for enum_symbol in enum_schema['symbols']:
213
- self.create_element(restriction, "enumeration", value=enum_symbol)
214
- self.known_types.append(name)
215
- return name
216
-
217
- def set_field_type(self, schema_root: ET.Element, record_name: str, element: ET.Element, field_type: dict|str):
218
- """ set the type or create a subtype on the element for the given avro field type"""
219
- if isinstance(field_type, dict):
220
- if 'type' in field_type:
221
- if field_type['type'] == 'record':
222
- if 'namespace' in field_type:
223
- self.update_common_namespace(field_type['namespace'])
224
- type = self.create_record(schema_root, field_type)
225
- element.set('type', type)
226
- elif field_type['type'] == 'enum':
227
- if 'namespace' in field_type:
228
- self.update_common_namespace(field_type['namespace'])
229
- type = self.create_enum(schema_root, field_type)
230
- element.set('type', type)
231
- elif field_type['type'] == 'array':
232
- self.create_array(schema_root, record_name, element, field_type)
233
- elif field_type['type'] == 'map':
234
- self.create_map(schema_root, record_name, element, field_type)
235
- elif field_type['type'] == 'fixed':
236
- self.create_fixed(schema_root, field_type)
237
- else:
238
- return self.set_field_type(schema_root, record_name, element, field_type['type'])
239
- else:
240
- raise ValueError(f"Invalid field type")
241
- else:
242
- element.set('type', self.convert_avro_primitive(field_type))
243
-
244
- def create_field(self, schema_root: Element, record_name: str, parent: Element, field: dict, attributes_parent: Element) -> ET.Element:
245
- """Convert an Avro field to an XML element."""
246
- field_name = field['name']
247
- field_type = field['type']
248
- field_doc = field.get('doc', '')
249
- xmlkind = field.get('xmlkind', 'element')
250
- if isinstance(field_type,list):
251
- def ia(e) -> None:
252
- if field_doc:
253
- annotation = self.create_element(e, "annotation")
254
- documentation = self.create_element(annotation, "documentation")
255
- documentation.text = field_doc
256
- element = self.create_union(schema_root, record_name, parent, field_name, field_type, ia)
257
- else:
258
- if xmlkind == 'attribute':
259
- element = self.create_element(attributes_parent, "attribute", name=field_name)
260
- else:
261
- element = self.create_element(parent, "element", name=field_name)
262
- if field_doc:
263
- annotation = self.create_element(element, "annotation")
264
- documentation = self.create_element(annotation, "documentation")
265
- documentation.text = field_doc
266
- self.set_field_type(schema_root, record_name, element, field_type)
267
-
268
- return element
269
-
270
- def create_record(self, schema_root: Element, record: dict) -> str:
271
- """Convert an Avro record to an XML complex type."""
272
- name = record['name']
273
- doc = record.get('doc', '')
274
- if name in self.known_types:
275
- return name
276
- complex_type = self.create_complex_type(schema_root, name=name)
277
- if doc:
278
- annotation = self.create_element(complex_type, "annotation")
279
- documentation = self.create_element(annotation, "documentation")
280
- documentation.text = doc
281
- sequence = self.create_element(complex_type, "sequence")
282
- attributes_parent = complex_type # Attributes should be direct children of the complexType, not inside the sequence
283
- for field in record['fields']:
284
- self.create_field(schema_root, name, sequence, field, attributes_parent)
285
- self.known_types.append(name)
286
- return name
287
-
288
- def xsd_namespace_from_avro_namespace(self, namespace: str):
289
- """Convert an Avro namespace to an XML schema namespace."""
290
- if not self.target_namespace:
291
- return "urn:"+namespace.replace('.', ':')
292
- else:
293
- return self.target_namespace
294
-
295
- def avro_schema_to_xsd(self, avro_schema: dict) -> Element:
296
- """Convert the top-level Avro schema to an XML schema."""
297
- ET.register_namespace('xs', self.xmlns['xs'])
298
- schema = Element(f"{{{self.xmlns['xs']}}}schema")
299
- if isinstance(avro_schema, list):
300
- for record in avro_schema:
301
- if record['type'] == 'record':
302
- if 'namespace' in record:
303
- self.update_common_namespace(record['namespace'])
304
- self.create_record(schema, record)
305
- elif record['type'] == 'enum':
306
- if 'namespace' in record:
307
- self.update_common_namespace(record['namespace'])
308
- self.create_enum(schema, record)
309
- else:
310
- if avro_schema['type'] == 'record':
311
- if 'namespace' in avro_schema:
312
- self.update_common_namespace(avro_schema['namespace'])
313
- self.create_record(schema, avro_schema)
314
- elif avro_schema['type'] == 'enum':
315
- if 'namespace' in avro_schema:
316
- self.update_common_namespace(avro_schema['namespace'])
317
- self.create_enum(schema, avro_schema)
318
- elif avro_schema['type'] == 'fixed':
319
- if 'namespace' in avro_schema:
320
- self.update_common_namespace(avro_schema['namespace'])
321
- self.create_fixed(schema, avro_schema)
322
- schema.set('targetNamespace', self.xsd_namespace_from_avro_namespace(self.common_namespace))
323
- schema.set('xmlns', self.xsd_namespace_from_avro_namespace(self.common_namespace))
324
- ET.register_namespace('', self.xsd_namespace_from_avro_namespace(self.common_namespace))
325
- return schema
326
-
327
- def save_xsd_to_file(self, schema: Element, xml_path: str) -> None:
328
- """Save the XML schema to a file."""
329
- tree_str = tostring(schema, 'utf-8')
330
- pretty_tree = minidom.parseString(tree_str).toprettyxml(indent=" ")
331
- with open(xml_path, 'w', encoding='utf-8') as xml_file:
332
- xml_file.write(pretty_tree)
333
-
334
- def convert_avro_to_xsd(self, avro_schema_path: str, xml_file_path: str) -> None:
335
- """Convert Avro schema file to XML schema file."""
336
- with open(avro_schema_path, 'r', encoding='utf-8') as avro_file:
337
- avro_schema = json.load(avro_file)
338
-
339
- xml_schema = self.avro_schema_to_xsd(avro_schema)
340
- self.save_xsd_to_file(xml_schema, xml_file_path)
341
-
342
- def convert_avro_to_xsd(avro_schema_path: str, xml_file_path: str, target_namespace: str = '') -> None:
343
- avrotoxml = AvroToXSD(target_namespace)
344
- avrotoxml.convert_avro_to_xsd(avro_schema_path, xml_file_path)
1
+ from functools import reduce
2
+ import json
3
+ from typing import Dict, List
4
+ import xml.etree.ElementTree as ET
5
+ from xml.etree.ElementTree import Element, SubElement, tostring
6
+ from xml.dom import minidom
7
+
8
+ from avrotize.common import is_generic_avro_type
9
+
10
+ class AvroToXSD:
11
+ def __init__(self, target_namespace: str = ''):
12
+ self.xmlns = {"xs": "http://www.w3.org/2001/XMLSchema"}
13
+ self.union_types: Dict[str, str] = {}
14
+ self.known_types: List[str] = []
15
+ self.common_namespace = ''
16
+ self.target_namespace = target_namespace
17
+
18
+ def find_common_namespace(self, namespaces: List[str]) -> str:
19
+ """
20
+ Find the common namespace prefix from a list of namespaces.
21
+ """
22
+ if not namespaces:
23
+ return ''
24
+
25
+ def common_prefix(a, b):
26
+ prefix = ''
27
+ for a_char, b_char in zip(a.split('.'), b.split('.')):
28
+ if a_char == b_char:
29
+ prefix += a_char + '.'
30
+ else:
31
+ break
32
+ return prefix.rstrip('.')
33
+
34
+ return reduce(common_prefix, namespaces)
35
+
36
+ def update_common_namespace(self, namespace: str) -> None:
37
+ """
38
+ Update the common namespace based on the provided namespace.
39
+ """
40
+ if not self.common_namespace:
41
+ self.common_namespace = namespace
42
+ else:
43
+ self.common_namespace = self.find_common_namespace([self.common_namespace, namespace])
44
+
45
+ def convert_avro_primitive(self, avro_type: str | dict) -> str:
46
+ """Map Avro primitive types to XML schema (XSD) data types."""
47
+
48
+ if isinstance(avro_type, dict) and 'logicalType' in avro_type:
49
+ type = avro_type['type']
50
+ logical_type = avro_type.get('logicalType')
51
+
52
+ if logical_type == 'decimal':
53
+ return f"decimal"
54
+ if logical_type == 'timestamp-millis':
55
+ return f"dateTime"
56
+ if logical_type == 'date':
57
+ return f"date"
58
+ if logical_type in {'time-millis', 'time-micros'}:
59
+ return f"time"
60
+ if logical_type == 'uuid':
61
+ return f"string"
62
+ elif isinstance(avro_type, str):
63
+ mapping = {
64
+ 'null': 'string', # Defaulting to string for nullables
65
+ 'boolean': 'boolean',
66
+ 'int': 'integer',
67
+ 'long': 'long',
68
+ 'float': 'float',
69
+ 'double': 'double',
70
+ 'bytes': 'hexBinary',
71
+ 'string': 'string',
72
+ }
73
+
74
+ type = mapping.get(avro_type, '') # Fallback to string
75
+ if type:
76
+ return f"xs:{type}"
77
+ else:
78
+ return avro_type.split('.')[-1]
79
+ return f"xs:string"
80
+
81
+ def is_avro_primitive(self, avro_type: str) -> bool:
82
+ """Check if the Avro type is a primitive type."""
83
+ if isinstance(avro_type, dict) and 'logicalType' in avro_type and 'type' in avro_type:
84
+ return avro_type['type'] in {'int', 'long', 'float', 'double', 'bytes', 'string'}
85
+ elif isinstance(avro_type, str):
86
+ return avro_type in {'null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string'}
87
+ else:
88
+ return False
89
+
90
+ def create_element(self, parent: Element, tag: str, **attributes) -> Element:
91
+ """Create an XML element with the proper namespace."""
92
+ return SubElement(parent, f"{{{self.xmlns['xs']}}}{tag}", **attributes)
93
+
94
+ def create_complex_type(self, parent: Element, **attributes) -> Element:
95
+ """Create an XML complexType element."""
96
+ return self.create_element(parent, "complexType", **attributes)
97
+
98
+ def create_fixed(self, schema_root, field_type):
99
+ """ handle Avro 'fixed' type"""
100
+ simple_type = self.create_element(schema_root, "simpleType", name=field_type['name'])
101
+ restriction = self.create_element(simple_type, "restriction", base="xs:hexBinary")
102
+ restriction.set("fixed", "true")
103
+ restriction.set("value", field_type['size'])
104
+
105
+ def create_map(self, schema_root: ET.Element, record_name: str, parent: ET.Element, map_schema: dict):
106
+ """ handle Avro 'map' type"""
107
+ complex_type = self.create_element(parent, "complexType")
108
+ sequence = self.create_element(complex_type, "sequence")
109
+ item = self.create_element(sequence, "element", name="item", minOccurs="0", maxOccurs="unbounded")
110
+ inner_complex_type = self.create_element(item, "complexType")
111
+ inner_sequence = self.create_element(inner_complex_type, "sequence")
112
+ self.create_element(inner_sequence, "element", name="key", type="xs:string")
113
+ map_values = map_schema['values']
114
+ if isinstance(map_values, list):
115
+ item_value = self.create_union(schema_root, record_name, inner_sequence, "value", map_values)
116
+ else:
117
+ item_value = self.create_element(inner_sequence, "element", name="value")
118
+ self.set_field_type(schema_root, record_name, item_value, map_schema['values'])
119
+
120
+ def create_union(self, schema_root: ET.Element, record_name: str, parent: ET.Element, field_name: str, field_type: list, insert_annotation: lambda e: None | None = None) -> ET.Element:
121
+ """Create an XML element for union types."""
122
+
123
+ def create_or_get_union_simple_type(self, schema_root: ET.Element, parent:Element, types: List[str], **attributes) -> str:
124
+ """Create an XML simpleType element for union types."""
125
+
126
+ type_key_list = types.copy()
127
+ type_key_list.sort()
128
+ type_key = ''.join(type_key_list)
129
+ if type_key in self.union_types:
130
+ return self.union_types[type_key]
131
+
132
+ name = "And".join([t.capitalize() for t in types])
133
+ simple_type = self.create_element(schema_root, "simpleType", **attributes)
134
+ simple_type.set("name", name)
135
+ union = self.create_element(simple_type, "union")
136
+ union.set("memberTypes", ' '.join([self.convert_avro_primitive(t) for t in types if t != 'null']))
137
+ self.union_types[type_key] = name
138
+ return name
139
+
140
+ if isinstance(field_type, list) and is_generic_avro_type(field_type):
141
+ element = self.create_element(parent, "any", minOccurs="0", maxOccurs="unbounded")
142
+ if insert_annotation:
143
+ insert_annotation(element)
144
+ return element
145
+
146
+ non_null_types = [t for t in field_type if t != 'null']
147
+ if len(non_null_types) == 1:
148
+ element = self.create_element(parent, "element", name=field_name)
149
+ if insert_annotation:
150
+ insert_annotation(element)
151
+ self.set_field_type(schema_root, record_name, element, non_null_types[0])
152
+ return element
153
+ else:
154
+ element = self.create_element(parent, "element", name=field_name)
155
+ if insert_annotation:
156
+ insert_annotation(element)
157
+ primitives = [t for t in non_null_types if self.is_avro_primitive(t)]
158
+ for primitive in primitives:
159
+ non_null_types.remove(primitive)
160
+ union_type_ref = ''
161
+ if len(primitives) > 0:
162
+ union_type_ref = create_or_get_union_simple_type(self, schema_root, parent, primitives)
163
+ if len(non_null_types) == 0:
164
+ element.set('type', union_type_ref)
165
+ if len(non_null_types) > 0:
166
+ abstract_complex_type_name = record_name+field_name.capitalize()
167
+ element.set('type', abstract_complex_type_name)
168
+ if not abstract_complex_type_name in self.known_types:
169
+ self.known_types.append(abstract_complex_type_name)
170
+ self.create_element(schema_root, "complexType", name=abstract_complex_type_name, abstract="true")
171
+ if union_type_ref:
172
+ complex_content_option = self.create_element(schema_root, "complexType", name=abstract_complex_type_name+'1')
173
+ complex_content = self.create_element(complex_content_option, "complexContent")
174
+ complex_extension = self.create_element(complex_content, "extension", base=abstract_complex_type_name)
175
+ complex_sequence = self.create_element(complex_extension, "sequence")
176
+ complex_element = self.create_element(complex_sequence, "element", name='value', type=union_type_ref)
177
+ for i, union_type in enumerate(non_null_types):
178
+ complex_content_option = self.create_element(schema_root, "complexType", name=abstract_complex_type_name+str(i+2))
179
+ complex_content = self.create_element(complex_content_option, "complexContent")
180
+ complex_extension = self.create_element(complex_content, "extension", base=abstract_complex_type_name)
181
+ complex_sequence = self.create_element(complex_extension, "sequence")
182
+ complex_element = self.create_element(complex_sequence, "element", name=field_name)
183
+ self.set_field_type(schema_root, record_name, complex_element, union_type)
184
+ return element
185
+
186
+ def create_array(self, schema_root: ET.Element, record_name: str, parent: ET.Element, array_schema: dict):
187
+ """ handle Avro 'array' type """
188
+ complex_type = self.create_element(parent, "complexType")
189
+ sequence = self.create_element(complex_type, "sequence")
190
+ item_type = array_schema['items']
191
+ if isinstance(item_type, list):
192
+ item = self.create_union(schema_root, record_name, sequence, "item", item_type)
193
+ item.set('minOccurs', '0')
194
+ item.set('maxOccurs', 'unbounded')
195
+ else:
196
+ item = self.create_element(sequence, "element", name="item", minOccurs="0", maxOccurs="unbounded")
197
+ self.set_field_type(schema_root, record_name, item, item_type)
198
+
199
+ def create_enum(self, schema_root: ET.Element, enum_schema: dict) -> str:
200
+ """Convert an Avro enum to an XML simpleType."""
201
+ name = enum_schema['name']
202
+ doc = enum_schema.get('doc', '')
203
+ if name in self.known_types:
204
+ return name
205
+ simple_type = self.create_element(schema_root, "simpleType")
206
+ if doc:
207
+ annotation = self.create_element(simple_type, "annotation")
208
+ documentation = self.create_element(annotation, "documentation")
209
+ documentation.text = doc
210
+ simple_type.set('name', name)
211
+ restriction = self.create_element(simple_type, "restriction", base="xs:string")
212
+ for enum_symbol in enum_schema['symbols']:
213
+ self.create_element(restriction, "enumeration", value=enum_symbol)
214
+ self.known_types.append(name)
215
+ return name
216
+
217
+ def set_field_type(self, schema_root: ET.Element, record_name: str, element: ET.Element, field_type: dict|str):
218
+ """ set the type or create a subtype on the element for the given avro field type"""
219
+ if isinstance(field_type, dict):
220
+ if 'type' in field_type:
221
+ if field_type['type'] == 'record':
222
+ if 'namespace' in field_type:
223
+ self.update_common_namespace(field_type['namespace'])
224
+ type = self.create_record(schema_root, field_type)
225
+ element.set('type', type)
226
+ elif field_type['type'] == 'enum':
227
+ if 'namespace' in field_type:
228
+ self.update_common_namespace(field_type['namespace'])
229
+ type = self.create_enum(schema_root, field_type)
230
+ element.set('type', type)
231
+ elif field_type['type'] == 'array':
232
+ self.create_array(schema_root, record_name, element, field_type)
233
+ elif field_type['type'] == 'map':
234
+ self.create_map(schema_root, record_name, element, field_type)
235
+ elif field_type['type'] == 'fixed':
236
+ self.create_fixed(schema_root, field_type)
237
+ else:
238
+ return self.set_field_type(schema_root, record_name, element, field_type['type'])
239
+ else:
240
+ raise ValueError(f"Invalid field type")
241
+ else:
242
+ element.set('type', self.convert_avro_primitive(field_type))
243
+
244
+ def create_field(self, schema_root: Element, record_name: str, parent: Element, field: dict, attributes_parent: Element) -> ET.Element:
245
+ """Convert an Avro field to an XML element."""
246
+ field_name = field['name']
247
+ field_type = field['type']
248
+ field_doc = field.get('doc', '')
249
+ xmlkind = field.get('xmlkind', 'element')
250
+ if isinstance(field_type,list):
251
+ def ia(e) -> None:
252
+ if field_doc:
253
+ annotation = self.create_element(e, "annotation")
254
+ documentation = self.create_element(annotation, "documentation")
255
+ documentation.text = field_doc
256
+ element = self.create_union(schema_root, record_name, parent, field_name, field_type, ia)
257
+ else:
258
+ if xmlkind == 'attribute':
259
+ element = self.create_element(attributes_parent, "attribute", name=field_name)
260
+ else:
261
+ element = self.create_element(parent, "element", name=field_name)
262
+ if field_doc:
263
+ annotation = self.create_element(element, "annotation")
264
+ documentation = self.create_element(annotation, "documentation")
265
+ documentation.text = field_doc
266
+ self.set_field_type(schema_root, record_name, element, field_type)
267
+
268
+ return element
269
+
270
+ def create_record(self, schema_root: Element, record: dict) -> str:
271
+ """Convert an Avro record to an XML complex type."""
272
+ name = record['name']
273
+ doc = record.get('doc', '')
274
+ if name in self.known_types:
275
+ return name
276
+ complex_type = self.create_complex_type(schema_root, name=name)
277
+ if doc:
278
+ annotation = self.create_element(complex_type, "annotation")
279
+ documentation = self.create_element(annotation, "documentation")
280
+ documentation.text = doc
281
+ sequence = self.create_element(complex_type, "sequence")
282
+ attributes_parent = complex_type # Attributes should be direct children of the complexType, not inside the sequence
283
+ for field in record['fields']:
284
+ self.create_field(schema_root, name, sequence, field, attributes_parent)
285
+ self.known_types.append(name)
286
+ return name
287
+
288
+ def xsd_namespace_from_avro_namespace(self, namespace: str):
289
+ """Convert an Avro namespace to an XML schema namespace."""
290
+ if not self.target_namespace:
291
+ return "urn:"+namespace.replace('.', ':')
292
+ else:
293
+ return self.target_namespace
294
+
295
+ def avro_schema_to_xsd(self, avro_schema: dict) -> Element:
296
+ """Convert the top-level Avro schema to an XML schema."""
297
+ ET.register_namespace('xs', self.xmlns['xs'])
298
+ schema = Element(f"{{{self.xmlns['xs']}}}schema")
299
+ if isinstance(avro_schema, list):
300
+ for record in avro_schema:
301
+ if record['type'] == 'record':
302
+ if 'namespace' in record:
303
+ self.update_common_namespace(record['namespace'])
304
+ self.create_record(schema, record)
305
+ elif record['type'] == 'enum':
306
+ if 'namespace' in record:
307
+ self.update_common_namespace(record['namespace'])
308
+ self.create_enum(schema, record)
309
+ else:
310
+ if avro_schema['type'] == 'record':
311
+ if 'namespace' in avro_schema:
312
+ self.update_common_namespace(avro_schema['namespace'])
313
+ self.create_record(schema, avro_schema)
314
+ elif avro_schema['type'] == 'enum':
315
+ if 'namespace' in avro_schema:
316
+ self.update_common_namespace(avro_schema['namespace'])
317
+ self.create_enum(schema, avro_schema)
318
+ elif avro_schema['type'] == 'fixed':
319
+ if 'namespace' in avro_schema:
320
+ self.update_common_namespace(avro_schema['namespace'])
321
+ self.create_fixed(schema, avro_schema)
322
+ schema.set('targetNamespace', self.xsd_namespace_from_avro_namespace(self.common_namespace))
323
+ schema.set('xmlns', self.xsd_namespace_from_avro_namespace(self.common_namespace))
324
+ ET.register_namespace('', self.xsd_namespace_from_avro_namespace(self.common_namespace))
325
+ return schema
326
+
327
+ def save_xsd_to_file(self, schema: Element, xml_path: str) -> None:
328
+ """Save the XML schema to a file."""
329
+ tree_str = tostring(schema, 'utf-8')
330
+ pretty_tree = minidom.parseString(tree_str).toprettyxml(indent=" ")
331
+ with open(xml_path, 'w', encoding='utf-8') as xml_file:
332
+ xml_file.write(pretty_tree)
333
+
334
+ def convert_avro_to_xsd(self, avro_schema_path: str, xml_file_path: str) -> None:
335
+ """Convert Avro schema file to XML schema file."""
336
+ with open(avro_schema_path, 'r', encoding='utf-8') as avro_file:
337
+ avro_schema = json.load(avro_file)
338
+
339
+ xml_schema = self.avro_schema_to_xsd(avro_schema)
340
+ self.save_xsd_to_file(xml_schema, xml_file_path)
341
+
342
+ def convert_avro_to_xsd(avro_schema_path: str, xml_file_path: str, target_namespace: str = '') -> None:
343
+ avrotoxml = AvroToXSD(target_namespace)
344
+ avrotoxml.convert_avro_to_xsd(avro_schema_path, xml_file_path)