structurize 2.16.2__py3-none-any.whl → 2.16.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54):
  1. avrotize/__init__.py +63 -63
  2. avrotize/__main__.py +5 -5
  3. avrotize/_version.py +34 -34
  4. avrotize/asn1toavro.py +160 -160
  5. avrotize/avrotize.py +152 -152
  6. avrotize/avrotocpp.py +483 -483
  7. avrotize/avrotocsharp.py +992 -992
  8. avrotize/avrotocsv.py +121 -121
  9. avrotize/avrotodatapackage.py +173 -173
  10. avrotize/avrotodb.py +1383 -1383
  11. avrotize/avrotogo.py +476 -476
  12. avrotize/avrotographql.py +197 -197
  13. avrotize/avrotoiceberg.py +210 -210
  14. avrotize/avrotojava.py +1023 -1023
  15. avrotize/avrotojs.py +250 -250
  16. avrotize/avrotojsons.py +481 -481
  17. avrotize/avrotojstruct.py +345 -345
  18. avrotize/avrotokusto.py +363 -363
  19. avrotize/avrotomd.py +137 -137
  20. avrotize/avrotools.py +168 -168
  21. avrotize/avrotoparquet.py +208 -208
  22. avrotize/avrotoproto.py +358 -358
  23. avrotize/avrotopython.py +622 -622
  24. avrotize/avrotorust.py +435 -435
  25. avrotize/avrotots.py +598 -598
  26. avrotize/avrotoxsd.py +344 -344
  27. avrotize/commands.json +2493 -2433
  28. avrotize/common.py +828 -828
  29. avrotize/constants.py +4 -4
  30. avrotize/csvtoavro.py +131 -131
  31. avrotize/datapackagetoavro.py +76 -76
  32. avrotize/dependency_resolver.py +348 -348
  33. avrotize/jsonstoavro.py +1698 -1698
  34. avrotize/jsonstostructure.py +2642 -2642
  35. avrotize/jstructtoavro.py +878 -878
  36. avrotize/kstructtoavro.py +93 -93
  37. avrotize/kustotoavro.py +455 -455
  38. avrotize/parquettoavro.py +157 -157
  39. avrotize/proto2parser.py +497 -497
  40. avrotize/proto3parser.py +402 -402
  41. avrotize/prototoavro.py +382 -382
  42. avrotize/structuretocsharp.py +2005 -2005
  43. avrotize/structuretojsons.py +498 -498
  44. avrotize/structuretopython.py +772 -772
  45. avrotize/structuretots.py +653 -0
  46. avrotize/xsdtoavro.py +413 -413
  47. structurize-2.16.6.dist-info/METADATA +107 -0
  48. structurize-2.16.6.dist-info/RECORD +52 -0
  49. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/licenses/LICENSE +200 -200
  50. structurize-2.16.2.dist-info/METADATA +0 -805
  51. structurize-2.16.2.dist-info/RECORD +0 -51
  52. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/WHEEL +0 -0
  53. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/entry_points.txt +0 -0
  54. {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/top_level.txt +0 -0
avrotize/xsdtoavro.py CHANGED
@@ -1,413 +1,413 @@
1
- # pylint: disable=line-too-long, consider-iterating-dictionary, too-many-locals, too-many-branches
2
-
3
- """Converts XSD to Avro schema."""
4
-
5
- import os
6
- import re
7
- from typing import Dict, List, Tuple
8
- import xml.etree.ElementTree as ET
9
- import json
10
- from urllib.parse import urlparse
11
- from avrotize.common import avro_namespace, generic_type
12
-
13
- from avrotize.dependency_resolver import inline_dependencies_of, sort_messages_by_dependencies
14
-
15
- XSD_NAMESPACE = 'http://www.w3.org/2001/XMLSchema'
16
-
17
-
18
- class XSDToAvro:
19
- """ Convert XSD to Avro schema."""
20
-
21
- def __init__(self) -> None:
22
- """ Initialize the class. """
23
- self.simple_type_map: Dict[str, str | dict] = {}
24
- self.avro_namespace = ''
25
- self.xml_namespace = ''
26
-
27
- def xsd_targetnamespace_to_avro_namespace(self, targetnamespace: str) -> str:
28
- """Convert a XSD namespace to Avro Namespace."""
29
- parsed_url = urlparse(targetnamespace)
30
- if parsed_url.scheme == 'urn':
31
- path_segments = parsed_url.path.strip(
32
- ':').replace('.', '-').split(':')
33
- # join all path segments that start with a number with the previous one
34
- new_path_segments: List[str] = []
35
- n = len(path_segments)
36
- for i in range(n):
37
- if path_segments[i][0].isdigit():
38
- if i == 0:
39
- new_path_segments.append('_'+path_segments[i])
40
- else:
41
- new_path_segments[-1] = f"{new_path_segments[-1]}-{path_segments[i]}"
42
- else:
43
- new_path_segments.append(path_segments[i])
44
- path_segments = new_path_segments
45
- else:
46
- path_segments = parsed_url.path.strip('/').split('/')
47
- path_segments = list(reversed(path_segments))
48
- namespace_prefix = '.'.join(path_segments)
49
- if parsed_url.hostname:
50
- namespace_suffix = parsed_url.hostname
51
- namespace = f"{namespace_prefix}.{namespace_suffix}"
52
- else:
53
- namespace = namespace_prefix
54
- return avro_namespace(namespace)
55
-
56
- def xsd_to_avro_type(self, xsd_type: str, namespaces: dict):
57
- """Convert a XSD type to an Avro type."""
58
- if xsd_type in self.simple_type_map:
59
- return self.simple_type_map[xsd_type]
60
-
61
- # split the type on the first colon
62
- if ':' not in xsd_type:
63
- type_name = xsd_type
64
- prefix = ''
65
- else:
66
- prefix, type_name = xsd_type.split(':', 1)
67
- if not type_name:
68
- type_name = prefix
69
- prefix = ''
70
- # find the namespace for the prefix
71
- ns = namespaces.get(XSD_NAMESPACE, '')
72
- if ns == prefix:
73
- base_type_map = {
74
- 'string': 'string',
75
- 'int': 'int',
76
- 'integer': 'int',
77
- 'long': 'long',
78
- 'short': 'int',
79
- 'decimal': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 32, 'scale': 6},
80
- 'float': 'float',
81
- 'double': 'double',
82
- 'boolean': 'boolean',
83
- 'byte': 'int',
84
- 'date': {'type': 'int', 'logicalType': 'date'},
85
- 'dateTime': {'type': 'long', 'logicalType': 'timestamp-millis'},
86
- 'time': {'type': 'int', 'logicalType': 'time-millis'},
87
- 'duration': {'type': 'int', 'logicalType': 'duration'},
88
- 'gYear': {'type': 'string'},
89
- 'gYearMonth': {'type': 'string'},
90
- 'gMonth': {'type': 'string'},
91
- 'gMonthDay': {'type': 'string'},
92
- 'gDay': {'type': 'string'},
93
- 'nonNegativeInteger': 'int',
94
- 'positiveInteger': 'int',
95
- 'unsignedInt': 'int',
96
- 'unsignedShort': 'int',
97
- 'unsignedByte': 'int',
98
- 'unsignedLong': 'long',
99
- 'yearMonthDuration': {'type': 'string', 'logicalType': 'duration'},
100
- 'dayTimeDuration': {'type': 'string', 'logicalType': 'duration'},
101
- 'dateTimeStamp': {'type': 'long', 'logicalType': 'timestamp-millis'},
102
- 'hexBinary': 'bytes',
103
- 'base64Binary': 'bytes',
104
- 'anyURI': 'string',
105
- 'normalizedString': 'string',
106
- 'token': 'string',
107
- 'language': 'string',
108
- 'Name': 'string',
109
- 'NCName': 'string',
110
- 'ENTITY': 'string',
111
- 'ENTITIES': 'string',
112
- 'ID': 'string',
113
- 'IDREF': 'string',
114
- 'IDREFS': 'string',
115
- 'NMTOKEN': 'string',
116
- 'NMTOKENS': 'string',
117
- 'QName': 'string',
118
- 'NOTATION': 'string'
119
- }
120
- return base_type_map.get(type_name, self.avro_namespace+'.'+type_name)
121
- else:
122
- return self.avro_namespace+'.'+type_name
123
-
124
- def process_element(self, element: ET.Element, namespaces: dict, dependencies: list):
125
- """Process an element in the XSD schema."""
126
- name = element.get('name')
127
- type_value = element.get('type', '')
128
- if type_value:
129
- avro_type = self.xsd_to_avro_type(type_value, namespaces)
130
- if not type_value.startswith(f'{namespaces[XSD_NAMESPACE]}:') and type_value not in self.simple_type_map.keys():
131
- dependencies.append(avro_type if isinstance(
132
- avro_type, str) else avro_type.get('namespace')+'.'+avro_type.get('name'))
133
- dependencies = list(set(dependencies))
134
- else:
135
- complex_type = element.find(
136
- f'{{{XSD_NAMESPACE}}}complexType', namespaces)
137
- if complex_type is not None:
138
- complex_type.set('name', name)
139
- avro_type = self.process_complex_type(complex_type, namespaces)
140
- else:
141
- simple_type = element.find(
142
- f'{{{XSD_NAMESPACE}}}simpleType', namespaces)
143
- if simple_type is not None:
144
- add_to_schema, simple_type_type = self.process_simple_type(
145
- simple_type, namespaces)
146
- if add_to_schema:
147
- avro_type = simple_type_type
148
- else:
149
- avro_type = self.simple_type_map[name]
150
- else:
151
- raise ValueError('element must have a type or complexType')
152
-
153
- max_occurs = element.get('maxOccurs')
154
- if max_occurs is not None and max_occurs != '1':
155
- avro_type = {'type': 'array', 'items': avro_type}
156
- min_occurs = element.get('minOccurs')
157
- if min_occurs is not None and min_occurs == '0':
158
- avro_type = ['null', avro_type]
159
- avro_field = {'name': name, 'type': avro_type}
160
- annotation = element.find(f'{{{XSD_NAMESPACE}}}annotation', namespaces)
161
- if annotation is not None:
162
- documentation = annotation.find(
163
- f'{{{XSD_NAMESPACE}}}documentation', namespaces)
164
- if documentation is not None and documentation.text is not None:
165
- avro_field['doc'] = documentation.text.strip()
166
- return avro_field
167
-
168
- def process_complex_type(self, complex_type: ET.Element, namespaces: dict) -> dict | str:
169
- """ Process a complex type in the XSD schema."""
170
- dependencies: List[str] = []
171
- avro_type: dict = {
172
- 'type': 'record',
173
- 'name': complex_type.attrib.get('name'),
174
- 'namespace': self.avro_namespace,
175
- 'fields': []
176
- }
177
- avro_doc = ''
178
- annotation = complex_type.find(
179
- f'{{{XSD_NAMESPACE}}}annotation', namespaces)
180
- if annotation is not None:
181
- documentation = annotation.find(
182
- f'{{{XSD_NAMESPACE}}}documentation', namespaces)
183
- if documentation is not None and documentation.text is not None:
184
- avro_doc = documentation.text.strip()
185
- avro_type['doc'] = avro_doc
186
- fields = []
187
- for sequence in complex_type.findall(f'{{{XSD_NAMESPACE}}}sequence', namespaces):
188
- for el in sequence.findall(f'{{{XSD_NAMESPACE}}}element', namespaces):
189
- field = self.process_element(el, namespaces, dependencies)
190
- field['xmlkind'] = 'element'
191
- fields.append(field)
192
- if sequence.findall(f'{{{XSD_NAMESPACE}}}any', namespaces):
193
- fields.append({"name": "any", "xmlkind": "any", "type": generic_type()})
194
- for all_types in complex_type.findall(f'{{{XSD_NAMESPACE}}}all', namespaces):
195
- for el in all_types.findall(f'{{{XSD_NAMESPACE}}}element', namespaces):
196
- field = self.process_element(el, namespaces, dependencies)
197
- field['xmlkind'] = 'element'
198
- fields.append(field)
199
- for choice in complex_type.findall(f'{{{XSD_NAMESPACE}}}choice', namespaces):
200
- choices: list = []
201
- for el in choice.findall(f'{{{XSD_NAMESPACE}}}element', namespaces):
202
- deps: List[str] = []
203
- choice_field = self.process_element(el, namespaces, deps)
204
- choice_field['xmlkind'] = 'element'
205
- choice_record = {
206
- 'type': 'record',
207
- 'name': f'{complex_type.attrib.get("name")}_{choice_field["name"]}',
208
- 'fields': [choice_field],
209
- 'namespace': self.avro_namespace
210
- }
211
- if avro_doc:
212
- choice_record['doc'] = avro_doc
213
- choices.append(choice_record)
214
- dependencies.extend(deps)
215
- dependencies = list(set(dependencies))
216
- choices_field = {
217
- 'name': f'{complex_type.attrib.get("name")}',
218
- 'type': choices
219
- }
220
- fields.append(choices_field)
221
- for attribute in complex_type.findall(f'.{{{XSD_NAMESPACE}}}attribute', namespaces):
222
- field = self.process_element(attribute, namespaces, dependencies)
223
- field['xmlkind'] = 'attribute'
224
- fields.append(field)
225
- for el in complex_type.findall(f'{{{XSD_NAMESPACE}}}simpleContent', namespaces):
226
- simple_content = el.find(
227
- f'{{{XSD_NAMESPACE}}}extension', namespaces)
228
- if simple_content is not None:
229
- base_type = simple_content.attrib.get('base')
230
- if base_type:
231
- fields.append(
232
- {"name": "value", "type": self.xsd_to_avro_type(base_type, namespaces)})
233
- for se in simple_content.findall(f'{{{XSD_NAMESPACE}}}attribute', namespaces):
234
- field = self.process_element(se, namespaces, dependencies)
235
- field['xmlkind'] = 'attribute'
236
- fields.append(field)
237
- else:
238
- raise ValueError("No base found in simpleContent")
239
-
240
- avro_type['fields'] = fields
241
- if dependencies:
242
- avro_type['dependencies'] = dependencies
243
- return avro_type
244
-
245
- def process_simple_type(self, simple_type: ET.Element, namespaces: dict) -> Tuple[bool, dict | str]:
246
- """ Process a simple type in the XSD schema. """
247
- type_name = simple_type.attrib.get('name')
248
- if not type_name:
249
- raise ValueError("SimpleType must have a name")
250
- avro_doc = ''
251
- annotation = simple_type.find(
252
- f'{{{XSD_NAMESPACE}}}annotation', namespaces)
253
- if annotation is not None:
254
- documentation = annotation.find(
255
- f'{{{XSD_NAMESPACE}}}documentation', namespaces)
256
- if documentation is not None and documentation.text is not None:
257
- avro_doc = documentation.text.strip()
258
-
259
- for restriction in simple_type.findall(f'{{{XSD_NAMESPACE}}}restriction', namespaces):
260
- base_type = restriction.get('base')
261
- enums: List[str] = [el.attrib.get('value', 'Empty') for el in restriction.findall(
262
- f'{{{XSD_NAMESPACE}}}enumeration', namespaces)]
263
- # if any of the enum entries start with a digit, we need to prefix the entry with _
264
- if enums:
265
- for i, enum in enumerate(enums):
266
- if enums[i][0].isdigit():
267
- enums[i] = '_'+enum
268
- enum_type = {
269
- 'type': 'enum',
270
- 'name': simple_type.attrib.get('name'),
271
- 'namespace': self.avro_namespace,
272
- 'symbols': enums
273
- }
274
- if avro_doc:
275
- enum_type['doc'] = avro_doc
276
- return True, enum_type
277
- elif base_type:
278
- # if the baseType is a decimal, get the precision and scale sub-element value attributes to set the logicalType
279
- if base_type == namespaces[XSD_NAMESPACE]+':'+'decimal':
280
- precision = restriction.find(
281
- f'{{{XSD_NAMESPACE}}}totalDigits', namespaces)
282
- scale = restriction.find(
283
- f'{{{XSD_NAMESPACE}}}fractionDigits', namespaces)
284
- logical_type = {
285
- 'type': 'bytes',
286
- 'logicalType': 'decimal',
287
- 'precision': int(precision.attrib.get('value', 32)) if isinstance(precision, ET.Element) else 32,
288
- 'scale': int(scale.attrib.get('value', 6)) if isinstance(scale, ET.Element) else 6,
289
- }
290
- if avro_doc:
291
- logical_type['doc'] = avro_doc
292
- self.simple_type_map[type_name] = logical_type
293
- return False, logical_type
294
- else:
295
- self.simple_type_map[type_name] = self.xsd_to_avro_type(
296
- base_type, namespaces)
297
- return False, self.simple_type_map[type_name]
298
- raise ValueError("No content found in simple type")
299
-
300
- def process_top_level_element(self, element: ET.Element, namespaces: dict):
301
- """ Process a top level element in the XSD schema. """
302
- dependencies: List[str] = []
303
- avro_type: dict = {
304
- 'type': 'record',
305
- 'name': 'Root',
306
- 'namespace': self.avro_namespace,
307
- 'xmlns': self.xml_namespace,
308
- 'fields': []
309
- }
310
- annotation = element.find(f'{{{XSD_NAMESPACE}}}annotation', namespaces)
311
- if annotation is not None:
312
- documentation = annotation.find(
313
- f'{{{XSD_NAMESPACE}}}documentation', namespaces)
314
- if documentation is not None and documentation.text is not None:
315
- avro_type['doc'] = documentation.text.strip()
316
-
317
- if 'type' in element.attrib:
318
- field = self.process_element(element, namespaces, dependencies)
319
- field['xmlkind'] = 'element'
320
- avro_type['fields'].append(field)
321
- if dependencies:
322
- avro_type['dependencies'] = dependencies
323
- return avro_type
324
- else:
325
- complex_type = element.find(
326
- f'{{{XSD_NAMESPACE}}}complexType', namespaces)
327
- if complex_type is None:
328
- raise ValueError(
329
- 'top level element must have a type or be complexType')
330
- complex_type.set('name', element.get('name', ''))
331
- avro_complex_type = self.process_complex_type(
332
- complex_type, namespaces)
333
- return avro_complex_type
334
-
335
- def extract_xml_namespaces(self, xml_str: str):
336
- """ Extract XML namespaces from an XML string."""
337
- # This regex finds all xmlns:prefix="uri" declarations
338
- pattern = re.compile(r'xmlns:([\w]+)="([^"]+)"')
339
- namespaces = {m.group(2): m.group(1)
340
- for m in pattern.finditer(xml_str)}
341
- return namespaces
342
-
343
- def xsd_to_avro(self, xsd_path: str, code_namespace: str | None = None):
344
- """ Convert XSD to Avro schema. """
345
- # load the XSD file into a string
346
- with open(xsd_path, 'r', encoding='utf-8') as f:
347
- xsd = f.read()
348
-
349
- namespaces = self.extract_xml_namespaces(xsd)
350
- root = ET.fromstring(xsd)
351
- target_namespace = root.get('targetNamespace')
352
- if target_namespace is None:
353
- raise ValueError('targetNamespace not found')
354
- self.xml_namespace = target_namespace
355
- if not code_namespace:
356
- self.avro_namespace = self.xsd_targetnamespace_to_avro_namespace(target_namespace)
357
- else:
358
- self.avro_namespace = code_namespace
359
- ET.register_namespace(namespaces[XSD_NAMESPACE], XSD_NAMESPACE)
360
- avro_schema: List[dict | list | str] = []
361
-
362
- for simple_type in root.findall(f'{{{XSD_NAMESPACE}}}simpleType', namespaces):
363
- add_to_schema, simple_type_type = self.process_simple_type(
364
- simple_type, namespaces)
365
- # we only want to append simple types if they are not resolved to one of the base types
366
- if add_to_schema:
367
- avro_schema.append(simple_type_type)
368
- for complex_type in root.findall(f'{{{XSD_NAMESPACE}}}complexType', namespaces):
369
- avro_schema.append(self.process_complex_type(
370
- complex_type, namespaces))
371
-
372
- top_level_elements = root.findall(
373
- f'{{{XSD_NAMESPACE}}}element', namespaces)
374
- if len(top_level_elements) == 1:
375
- record = self.process_top_level_element(
376
- top_level_elements[0], namespaces)
377
- inline_dependencies_of(avro_schema, record)
378
- return record
379
- for element in top_level_elements:
380
- avro_schema.append(self.process_top_level_element(
381
- element, namespaces))
382
-
383
- avro_schema = sort_messages_by_dependencies(avro_schema)
384
- if len(avro_schema) == 1:
385
- return avro_schema[0]
386
- else:
387
- return avro_schema
388
-
389
- def convert_xsd_to_avro(self, xsd_path: str, avro_path: str, namespace: str | None = None):
390
- """Convert XSD to Avro schema and write to a file."""
391
-
392
-
393
- avro_schema = self.xsd_to_avro(xsd_path, code_namespace=namespace)
394
- with open(avro_path, 'w', encoding='utf-8') as f:
395
- json.dump(avro_schema, f, indent=4)
396
-
397
-
398
- def convert_xsd_to_avro(xsd_path: str, avro_path: str, namespace: str | None = None):
399
- """
400
- Convert XSD to Avro schema and write to a file.
401
-
402
- Params:
403
- xsd_path: str - Path to the XSD file.
404
- avro_path: str - Path to the Avro file.
405
- namespace: str | None - Namespace of the Avro schema.
406
- """
407
-
408
- if not os.path.exists(xsd_path):
409
- raise FileNotFoundError(f"XSD file not found at {xsd_path}")
410
- if not namespace:
411
- namespace = os.path.splitext(os.path.basename(xsd_path))[0].lower().replace('-', '_')
412
- xsd_to_avro = XSDToAvro()
413
- xsd_to_avro.convert_xsd_to_avro(xsd_path, avro_path, namespace)
1
+ # pylint: disable=line-too-long, consider-iterating-dictionary, too-many-locals, too-many-branches
2
+
3
+ """Converts XSD to Avro schema."""
4
+
5
+ import os
6
+ import re
7
+ from typing import Dict, List, Tuple
8
+ import xml.etree.ElementTree as ET
9
+ import json
10
+ from urllib.parse import urlparse
11
+ from avrotize.common import avro_namespace, generic_type
12
+
13
+ from avrotize.dependency_resolver import inline_dependencies_of, sort_messages_by_dependencies
14
+
15
+ XSD_NAMESPACE = 'http://www.w3.org/2001/XMLSchema'
16
+
17
+
18
+ class XSDToAvro:
19
+ """ Convert XSD to Avro schema."""
20
+
21
+ def __init__(self) -> None:
22
+ """ Initialize the class. """
23
+ self.simple_type_map: Dict[str, str | dict] = {}
24
+ self.avro_namespace = ''
25
+ self.xml_namespace = ''
26
+
27
def xsd_targetnamespace_to_avro_namespace(self, targetnamespace: str) -> str:
    """Derive an Avro namespace from an XSD ``targetNamespace`` URI.

    URNs (``urn:a:b:1.0``) are split on ``:``; dots inside a segment become
    dashes, and a segment that starts with a digit is appended to the
    previous segment with ``-`` (or prefixed with ``_`` when there is no
    previous segment). Any other URI contributes its ``/``-separated path
    segments in reverse order, followed by the host name when one is present.
    The dotted result is passed through ``avro_namespace``.

    Empty segments (``urn:a::b``, ``http://host/a//b``, an empty path) are
    skipped so they can neither raise ``IndexError`` nor produce a leading
    or doubled dot in the namespace.

    Args:
        targetnamespace: The ``targetNamespace`` attribute of the schema.

    Returns:
        The value returned by ``avro_namespace`` for the dotted name.
    """
    parsed_url = urlparse(targetnamespace)
    if parsed_url.scheme == 'urn':
        path_segments: List[str] = []
        for segment in parsed_url.path.strip(':').replace('.', '-').split(':'):
            if not segment:
                # Previously crashed on segment[0]; nothing to contribute.
                continue
            if not segment[0].isdigit():
                path_segments.append(segment)
            elif path_segments:
                # A name part may not start with a digit: glue it to its predecessor.
                path_segments[-1] = f"{path_segments[-1]}-{segment}"
            else:
                path_segments.append('_' + segment)
    else:
        # NOTE(review): the reviewed text had lost its indentation; the reversal
        # is assumed to apply to this (non-URN) branch only -- confirm.
        raw_segments = parsed_url.path.strip('/').split('/')
        path_segments = [segment for segment in reversed(raw_segments) if segment]

    name_parts = list(path_segments)
    if parsed_url.hostname:
        name_parts.append(parsed_url.hostname)
    return avro_namespace('.'.join(name_parts))
55
+
56
def xsd_to_avro_type(self, xsd_type: str, namespaces: dict):
    """Resolve an XSD type reference to an Avro type.

    Resolution order:
      1. a type already registered in ``self.simple_type_map`` (looked up by
         the reference exactly as written);
      2. an XSD built-in, when the reference's prefix equals the prefix bound
         to the XML Schema namespace in ``namespaces`` (``''`` if unbound);
      3. otherwise a named reference ``<self.avro_namespace>.<local name>``.

    Args:
        xsd_type: Type reference such as ``xs:string`` or ``MyType``.
        namespaces: Mapping of namespace URI -> prefix.

    Returns:
        An Avro primitive name, a logical-type dict, or a qualified type name.
    """
    if xsd_type in self.simple_type_map:
        return self.simple_type_map[xsd_type]

    # Split on the first colon; a reference without a local part after the
    # colon (or without any colon) is treated as an unprefixed name.
    prefix, _, type_name = xsd_type.partition(':')
    if not type_name:
        type_name, prefix = prefix, ''

    qualified_name = self.avro_namespace + '.' + type_name
    if namespaces.get(XSD_NAMESPACE, '') != prefix:
        return qualified_name

    # Built fresh on every call so callers never share the nested dicts.
    base_type_map = {
        'string': 'string',
        'int': 'int',
        # NOTE(review): xs:integer / nonNegativeInteger / positiveInteger are
        # unbounded in XSD; 'int' is kept for compatibility but is lossy.
        'integer': 'int',
        'long': 'long',
        'short': 'int',
        'decimal': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 32, 'scale': 6},
        'float': 'float',
        'double': 'double',
        'boolean': 'boolean',
        'byte': 'int',
        'date': {'type': 'int', 'logicalType': 'date'},
        'dateTime': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'time': {'type': 'int', 'logicalType': 'time-millis'},
        # NOTE(review): the Avro spec defines 'duration' on fixed(12); these
        # int/string annotations are kept unchanged -- confirm intent.
        'duration': {'type': 'int', 'logicalType': 'duration'},
        'gYear': {'type': 'string'},
        'gYearMonth': {'type': 'string'},
        'gMonth': {'type': 'string'},
        'gMonthDay': {'type': 'string'},
        'gDay': {'type': 'string'},
        'nonNegativeInteger': 'int',
        'positiveInteger': 'int',
        # xs:unsignedInt goes up to 4294967295, which does not fit Avro's
        # 32-bit signed 'int'; 'long' holds the full range.
        'unsignedInt': 'long',
        'unsignedShort': 'int',
        'unsignedByte': 'int',
        'unsignedLong': 'long',
        'yearMonthDuration': {'type': 'string', 'logicalType': 'duration'},
        'dayTimeDuration': {'type': 'string', 'logicalType': 'duration'},
        'dateTimeStamp': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'hexBinary': 'bytes',
        'base64Binary': 'bytes',
        'anyURI': 'string',
        'normalizedString': 'string',
        'token': 'string',
        'language': 'string',
        'Name': 'string',
        'NCName': 'string',
        'ENTITY': 'string',
        'ENTITIES': 'string',
        'ID': 'string',
        'IDREF': 'string',
        'IDREFS': 'string',
        'NMTOKEN': 'string',
        'NMTOKENS': 'string',
        'QName': 'string',
        'NOTATION': 'string',
    }
    # Unknown names under the XSD prefix fall back to a named reference.
    return base_type_map.get(type_name, qualified_name)
123
+
124
def process_element(self, element: ET.Element, namespaces: dict, dependencies: list):
    """Translate an ``xs:element`` / ``xs:attribute`` node into an Avro field.

    The field type comes from the ``type`` attribute when present, otherwise
    from an inline ``complexType`` or ``simpleType`` child. ``maxOccurs``
    other than ``1`` wraps the type in an array; ``minOccurs="0"`` wraps it
    in a ``['null', ...]`` union.

    Args:
        element: The node to convert.
        namespaces: Mapping of namespace URI -> prefix.
        dependencies: Mutated in place; receives the qualified name of every
            referenced non-built-in type (each name at most once).

    Returns:
        ``{'name': ..., 'type': ...}`` plus ``'doc'`` when documented.

    Raises:
        ValueError: If the node has neither a ``type`` attribute nor an
            inline ``complexType`` / ``simpleType``.
    """
    name = element.get('name')
    type_value = element.get('type', '')
    if type_value:
        avro_type = self.xsd_to_avro_type(type_value, namespaces)
        # Tolerate an unbound XSD prefix the same way xsd_to_avro_type does.
        xsd_prefix = namespaces.get(XSD_NAMESPACE, '')
        if xsd_prefix:
            is_builtin = type_value.startswith(f'{xsd_prefix}:')
        else:
            is_builtin = ':' not in type_value
        if not is_builtin and type_value not in self.simple_type_map:
            dependency = avro_type if isinstance(
                avro_type, str) else avro_type.get('namespace')+'.'+avro_type.get('name')
            # Dedupe on the caller's list; rebinding a local would be a no-op.
            if dependency not in dependencies:
                dependencies.append(dependency)
    else:
        complex_type = element.find(
            f'{{{XSD_NAMESPACE}}}complexType', namespaces)
        if complex_type is not None:
            # Inline types are anonymous; name them after the element.
            complex_type.set('name', name)
            avro_type = self.process_complex_type(complex_type, namespaces)
        else:
            simple_type = element.find(
                f'{{{XSD_NAMESPACE}}}simpleType', namespaces)
            if simple_type is None:
                raise ValueError('element must have a type or complexType')
            # Same naming rule as for complexType; without it an inline
            # simpleType is always rejected for having no name.
            if simple_type.get('name') is None and name is not None:
                simple_type.set('name', name)
            _, avro_type = self.process_simple_type(simple_type, namespaces)

    max_occurs = element.get('maxOccurs')
    if max_occurs is not None and max_occurs != '1':
        avro_type = {'type': 'array', 'items': avro_type}
    if element.get('minOccurs') == '0':
        avro_type = ['null', avro_type]

    avro_field = {'name': name, 'type': avro_type}
    annotation = element.find(f'{{{XSD_NAMESPACE}}}annotation', namespaces)
    if annotation is not None:
        documentation = annotation.find(
            f'{{{XSD_NAMESPACE}}}documentation', namespaces)
        if documentation is not None and documentation.text is not None:
            avro_field['doc'] = documentation.text.strip()
    return avro_field
167
+
168
def process_complex_type(self, complex_type: ET.Element, namespaces: dict) -> dict | str:
    """Translate an ``xs:complexType`` into an Avro record.

    Direct ``sequence`` / ``all`` children contribute one field per element
    (plus a generic ``any`` field when a sequence contains ``xs:any``). Each
    ``choice`` becomes a single field, named after the type, whose type is a
    union of one-field records. Attributes and ``simpleContent`` extensions
    are appended afterwards.

    Args:
        complex_type: The ``complexType`` node; its ``name`` attribute names
            the record.
        namespaces: Mapping of namespace URI -> prefix.

    Returns:
        The record dict. ``'dependencies'`` is present only when non-empty
        and lists each referenced type once, in first-seen order.

    Raises:
        ValueError: If a ``simpleContent`` extension has no ``base``.
    """
    def tag(local_name: str) -> str:
        return f'{{{XSD_NAMESPACE}}}{local_name}'

    record_name = complex_type.attrib.get('name')
    dependencies: List[str] = []
    avro_type: dict = {
        'type': 'record',
        'name': record_name,
        'namespace': self.avro_namespace,
        'fields': []
    }
    avro_doc = ''
    annotation = complex_type.find(tag('annotation'), namespaces)
    if annotation is not None:
        documentation = annotation.find(tag('documentation'), namespaces)
        if documentation is not None and documentation.text is not None:
            avro_doc = documentation.text.strip()
            # NOTE(review): nesting inferred (indentation was lost in the
            # reviewed text); 'doc' is assumed to be set only when documented.
            avro_type['doc'] = avro_doc

    fields = []
    for sequence in complex_type.findall(tag('sequence'), namespaces):
        for el in sequence.findall(tag('element'), namespaces):
            field = self.process_element(el, namespaces, dependencies)
            field['xmlkind'] = 'element'
            fields.append(field)
        if sequence.findall(tag('any'), namespaces):
            fields.append({"name": "any", "xmlkind": "any", "type": generic_type()})

    for all_types in complex_type.findall(tag('all'), namespaces):
        for el in all_types.findall(tag('element'), namespaces):
            field = self.process_element(el, namespaces, dependencies)
            field['xmlkind'] = 'element'
            fields.append(field)

    for choice in complex_type.findall(tag('choice'), namespaces):
        choices: list = []
        for el in choice.findall(tag('element'), namespaces):
            deps: List[str] = []
            choice_field = self.process_element(el, namespaces, deps)
            choice_field['xmlkind'] = 'element'
            choice_record = {
                'type': 'record',
                'name': f'{record_name}_{choice_field["name"]}',
                'fields': [choice_field],
                'namespace': self.avro_namespace
            }
            if avro_doc:
                choice_record['doc'] = avro_doc
            choices.append(choice_record)
            dependencies.extend(deps)
        fields.append({
            'name': f'{record_name}',
            'type': choices
        })

    # './' form used for clarity; selects the direct attribute children.
    for attribute in complex_type.findall(f"./{tag('attribute')}", namespaces):
        field = self.process_element(attribute, namespaces, dependencies)
        field['xmlkind'] = 'attribute'
        fields.append(field)

    for el in complex_type.findall(tag('simpleContent'), namespaces):
        simple_content = el.find(tag('extension'), namespaces)
        if simple_content is None:
            continue
        base_type = simple_content.attrib.get('base')
        # NOTE(review): the 'else: raise' is assumed to pair with the base
        # check, as its message suggests -- confirm against the real file.
        if not base_type:
            raise ValueError("No base found in simpleContent")
        fields.append(
            {"name": "value", "type": self.xsd_to_avro_type(base_type, namespaces)})
        for se in simple_content.findall(tag('attribute'), namespaces):
            field = self.process_element(se, namespaces, dependencies)
            field['xmlkind'] = 'attribute'
            fields.append(field)

    avro_type['fields'] = fields
    if dependencies:
        # dict.fromkeys dedupes while keeping first-seen order, so the output
        # does not depend on string hash order as list(set(...)) did.
        avro_type['dependencies'] = list(dict.fromkeys(dependencies))
    return avro_type
244
+
245
def process_simple_type(self, simple_type: ET.Element, namespaces: dict) -> Tuple[bool, dict | str]:
    """Translate an ``xs:simpleType`` restriction into an Avro type.

    * A restriction with ``enumeration`` facets becomes an Avro enum and is
      returned as ``(True, enum_dict)``.
    * Any other restriction with a ``base`` is resolved to that base's Avro
      type (decimal honours ``totalDigits`` / ``fractionDigits``), recorded
      in ``self.simple_type_map`` under the type name, and returned as
      ``(False, avro_type)``.

    Args:
        simple_type: The ``simpleType`` node; must carry a ``name``.
        namespaces: Mapping of namespace URI -> prefix.

    Returns:
        ``(add_to_schema, avro_type)``.

    Raises:
        ValueError: If the type has no name, or no restriction yields
            either enumeration values or a base type.
    """
    type_name = simple_type.attrib.get('name')
    if not type_name:
        raise ValueError("SimpleType must have a name")

    avro_doc = ''
    annotation = simple_type.find(
        f'{{{XSD_NAMESPACE}}}annotation', namespaces)
    if annotation is not None:
        documentation = annotation.find(
            f'{{{XSD_NAMESPACE}}}documentation', namespaces)
        if documentation is not None and documentation.text is not None:
            avro_doc = documentation.text.strip()

    # Tolerate an unbound XSD prefix the same way xsd_to_avro_type does.
    xsd_prefix = namespaces.get(XSD_NAMESPACE, '')
    decimal_reference = f'{xsd_prefix}:decimal' if xsd_prefix else 'decimal'

    for restriction in simple_type.findall(f'{{{XSD_NAMESPACE}}}restriction', namespaces):
        base_type = restriction.get('base')

        enums: List[str] = []
        for facet in restriction.findall(f'{{{XSD_NAMESPACE}}}enumeration', namespaces):
            # value="" is legal XSD; reuse the placeholder already used for
            # a missing value instead of crashing on symbol[0].
            symbol = facet.attrib.get('value', 'Empty') or 'Empty'
            if symbol[0].isdigit():
                # Symbols starting with a digit are prefixed with '_'.
                symbol = '_' + symbol
            enums.append(symbol)

        if enums:
            enum_type = {
                'type': 'enum',
                'name': type_name,
                'namespace': self.avro_namespace,
                'symbols': enums
            }
            if avro_doc:
                enum_type['doc'] = avro_doc
            return True, enum_type

        if base_type:
            if base_type == decimal_reference:
                # Facets supply precision/scale; fall back to 32/6 otherwise.
                precision = restriction.find(
                    f'{{{XSD_NAMESPACE}}}totalDigits', namespaces)
                scale = restriction.find(
                    f'{{{XSD_NAMESPACE}}}fractionDigits', namespaces)
                logical_type = {
                    'type': 'bytes',
                    'logicalType': 'decimal',
                    'precision': int(precision.attrib.get('value', 32)) if precision is not None else 32,
                    'scale': int(scale.attrib.get('value', 6)) if scale is not None else 6,
                }
                if avro_doc:
                    logical_type['doc'] = avro_doc
                self.simple_type_map[type_name] = logical_type
                return False, logical_type

            resolved = self.xsd_to_avro_type(base_type, namespaces)
            self.simple_type_map[type_name] = resolved
            return False, resolved

    raise ValueError("No content found in simple type")
299
+
300
def process_top_level_element(self, element: ET.Element, namespaces: dict):
    """Convert a schema-level ``xs:element`` into an Avro record.

    An element with a ``type`` attribute yields a ``Root`` record wrapping
    that single field; otherwise its inline ``complexType`` (named after the
    element) is converted and returned directly.

    Raises:
        ValueError: If the element has neither a ``type`` attribute nor an
            inline ``complexType``.
    """
    if 'type' not in element.attrib:
        inline_type = element.find(
            f'{{{XSD_NAMESPACE}}}complexType', namespaces)
        if inline_type is None:
            raise ValueError(
                'top level element must have a type or be complexType')
        inline_type.set('name', element.get('name', ''))
        return self.process_complex_type(inline_type, namespaces)

    root_record: dict = {
        'type': 'record',
        'name': 'Root',
        'namespace': self.avro_namespace,
        'xmlns': self.xml_namespace,
        'fields': []
    }
    note = element.find(f'{{{XSD_NAMESPACE}}}annotation', namespaces)
    doc_node = None
    if note is not None:
        doc_node = note.find(f'{{{XSD_NAMESPACE}}}documentation', namespaces)
    if doc_node is not None and doc_node.text is not None:
        root_record['doc'] = doc_node.text.strip()

    referenced: List[str] = []
    root_field = self.process_element(element, namespaces, referenced)
    root_field['xmlkind'] = 'element'
    root_record['fields'].append(root_field)
    if referenced:
        root_record['dependencies'] = referenced
    return root_record
334
+
335
def extract_xml_namespaces(self, xml_str: str):
    """Extract prefixed XML namespace declarations from an XML string.

    Scans the raw text for ``xmlns:prefix="uri"`` declarations. Both
    double- and single-quoted values are accepted, whitespace around ``=``
    is tolerated, and prefixes may contain ``-`` and ``.`` in addition to
    word characters. Default-namespace declarations (``xmlns="..."``) are
    not included because they have no prefix.

    Args:
        xml_str: The XML document text to scan.

    Returns:
        A dict mapping each namespace URI to its prefix. If the same URI is
        declared more than once, the last declaration wins.
    """
    # Group 1: prefix; group 2: double-quoted URI; group 3: single-quoted URI.
    pattern = re.compile(
        r"""xmlns:([\w.\-]+)\s*=\s*(?:"([^"]+)"|'([^']+)')""")
    namespaces = {}
    for match in pattern.finditer(xml_str):
        uri = match.group(2) if match.group(2) is not None else match.group(3)
        namespaces[uri] = match.group(1)
    return namespaces
342
+
343
def xsd_to_avro(self, xsd_path: str, code_namespace: str | None = None):
    """Read an XSD file and convert it to an Avro schema.

    Top-level simple types, complex types and elements are processed in
    that order. If the schema has exactly one top-level element, its record
    is returned after ``inline_dependencies_of`` has been applied to it
    with the collected types. Otherwise all results are passed through
    ``sort_messages_by_dependencies``; a single resulting entry is returned
    bare, several are returned as a list.

    Args:
        xsd_path: Path of the XSD file, read as UTF-8.
        code_namespace: Avro namespace to use. When empty or ``None``, the
            namespace is derived from the schema's ``targetNamespace``.

    Returns:
        A single Avro schema entry or a list of entries.

    Raises:
        ValueError: If the schema root has no ``targetNamespace``.
    """
    with open(xsd_path, 'r', encoding='utf-8') as xsd_file:
        xsd_text = xsd_file.read()

    namespaces = self.extract_xml_namespaces(xsd_text)
    schema_root = ET.fromstring(xsd_text)

    target_namespace = schema_root.get('targetNamespace')
    if target_namespace is None:
        raise ValueError('targetNamespace not found')
    self.xml_namespace = target_namespace
    self.avro_namespace = (
        code_namespace
        if code_namespace
        else self.xsd_targetnamespace_to_avro_namespace(target_namespace)
    )

    # Register the prefix the document itself uses for the XSD namespace.
    ET.register_namespace(namespaces[XSD_NAMESPACE], XSD_NAMESPACE)
    avro_schema: List[dict | list | str] = []

    for simple_node in schema_root.findall(
            f'{{{XSD_NAMESPACE}}}simpleType', namespaces):
        include_in_schema, converted_simple = self.process_simple_type(
            simple_node, namespaces)
        # Simple types that resolve to a base type are not emitted themselves.
        if include_in_schema:
            avro_schema.append(converted_simple)

    avro_schema.extend(
        self.process_complex_type(complex_node, namespaces)
        for complex_node in schema_root.findall(
            f'{{{XSD_NAMESPACE}}}complexType', namespaces)
    )

    top_level_elements = schema_root.findall(
        f'{{{XSD_NAMESPACE}}}element', namespaces)
    if len(top_level_elements) == 1:
        # Exactly one root element: return its record on its own.
        record = self.process_top_level_element(
            top_level_elements[0], namespaces)
        inline_dependencies_of(avro_schema, record)
        return record

    avro_schema.extend(
        self.process_top_level_element(element_node, namespaces)
        for element_node in top_level_elements
    )

    avro_schema = sort_messages_by_dependencies(avro_schema)
    return avro_schema[0] if len(avro_schema) == 1 else avro_schema
388
+
389
def convert_xsd_to_avro(self, xsd_path: str, avro_path: str, namespace: str | None = None):
    """Convert the XSD at ``xsd_path`` and write the Avro schema as JSON.

    Args:
        xsd_path: Path of the XSD file to convert.
        avro_path: Path of the output file, written as UTF-8 JSON with a
            four-space indent.
        namespace: Avro namespace, passed to ``xsd_to_avro`` as
            ``code_namespace``.
    """
    converted_schema = self.xsd_to_avro(xsd_path, code_namespace=namespace)
    with open(avro_path, 'w', encoding='utf-8') as out_file:
        json.dump(converted_schema, out_file, indent=4)
396
+
397
+
398
+ def convert_xsd_to_avro(xsd_path: str, avro_path: str, namespace: str | None = None):
399
+ """
400
+ Convert XSD to Avro schema and write to a file.
401
+
402
+ Params:
403
+ xsd_path: str - Path to the XSD file.
404
+ avro_path: str - Path to the Avro file.
405
+ namespace: str | None - Namespace of the Avro schema.
406
+ """
407
+
408
+ if not os.path.exists(xsd_path):
409
+ raise FileNotFoundError(f"XSD file not found at {xsd_path}")
410
+ if not namespace:
411
+ namespace = os.path.splitext(os.path.basename(xsd_path))[0].lower().replace('-', '_')
412
+ xsd_to_avro = XSDToAvro()
413
+ xsd_to_avro.convert_xsd_to_avro(xsd_path, avro_path, namespace)