structurize 2.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. avrotize/__init__.py +63 -0
  2. avrotize/__main__.py +6 -0
  3. avrotize/_version.py +34 -0
  4. avrotize/asn1toavro.py +160 -0
  5. avrotize/avrotize.py +152 -0
  6. avrotize/avrotocpp.py +483 -0
  7. avrotize/avrotocsharp.py +992 -0
  8. avrotize/avrotocsv.py +121 -0
  9. avrotize/avrotodatapackage.py +173 -0
  10. avrotize/avrotodb.py +1383 -0
  11. avrotize/avrotogo.py +476 -0
  12. avrotize/avrotographql.py +197 -0
  13. avrotize/avrotoiceberg.py +210 -0
  14. avrotize/avrotojava.py +1023 -0
  15. avrotize/avrotojs.py +250 -0
  16. avrotize/avrotojsons.py +481 -0
  17. avrotize/avrotojstruct.py +345 -0
  18. avrotize/avrotokusto.py +364 -0
  19. avrotize/avrotomd.py +137 -0
  20. avrotize/avrotools.py +168 -0
  21. avrotize/avrotoparquet.py +208 -0
  22. avrotize/avrotoproto.py +359 -0
  23. avrotize/avrotopython.py +622 -0
  24. avrotize/avrotorust.py +435 -0
  25. avrotize/avrotots.py +598 -0
  26. avrotize/avrotoxsd.py +344 -0
  27. avrotize/commands.json +2433 -0
  28. avrotize/common.py +829 -0
  29. avrotize/constants.py +5 -0
  30. avrotize/csvtoavro.py +132 -0
  31. avrotize/datapackagetoavro.py +76 -0
  32. avrotize/dependency_resolver.py +348 -0
  33. avrotize/jsonstoavro.py +1698 -0
  34. avrotize/jsonstostructure.py +2642 -0
  35. avrotize/jstructtoavro.py +878 -0
  36. avrotize/kstructtoavro.py +93 -0
  37. avrotize/kustotoavro.py +455 -0
  38. avrotize/parquettoavro.py +157 -0
  39. avrotize/proto2parser.py +498 -0
  40. avrotize/proto3parser.py +403 -0
  41. avrotize/prototoavro.py +382 -0
  42. avrotize/structuretocsharp.py +2005 -0
  43. avrotize/structuretojsons.py +498 -0
  44. avrotize/structuretopython.py +772 -0
  45. avrotize/xsdtoavro.py +413 -0
  46. structurize-2.16.2.dist-info/METADATA +805 -0
  47. structurize-2.16.2.dist-info/RECORD +51 -0
  48. structurize-2.16.2.dist-info/WHEEL +5 -0
  49. structurize-2.16.2.dist-info/entry_points.txt +2 -0
  50. structurize-2.16.2.dist-info/licenses/LICENSE +201 -0
  51. structurize-2.16.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,481 @@
1
+ import copy
2
+ import json
3
+ from typing import Dict, Any, Union, List
4
+ from avrotize.common import build_tree_hash_list, group_by_hash, is_generic_json_type, NodeHashReference
5
+ from functools import reduce
6
+ import jsonpath_ng
7
+
8
+ class AvroToJsonSchemaConverter:
9
+
10
+ def __init__(self, naming_mode: str = 'snake') -> None:
11
+ self.naming_mode = naming_mode
12
+ self.defined_types: Dict[str, Any] = {}
13
+ self.common_namespace = ''
14
+
15
+ def find_common_namespace(self, namespaces: List[str]) -> str:
16
+ """
17
+ Find the common namespace prefix from a list of namespaces.
18
+ """
19
+ if not namespaces:
20
+ return ''
21
+
22
+ def common_prefix(a, b):
23
+ prefix = ''
24
+ for a_char, b_char in zip(a.split('.'), b.split('.')):
25
+ if a_char == b_char:
26
+ prefix += a_char + '.'
27
+ else:
28
+ break
29
+ return prefix.rstrip('.')
30
+
31
+ return reduce(common_prefix, namespaces)
32
+
33
+ def update_common_namespace(self, namespace: str) -> None:
34
+ """
35
+ Update the common namespace based on the provided namespace.
36
+ """
37
+ if not self.common_namespace:
38
+ self.common_namespace = namespace
39
+ else:
40
+ self.common_namespace = self.find_common_namespace([self.common_namespace, namespace])
41
+
42
+ def get_definition_ref(self, name: str) -> str:
43
+ """
44
+ Construct the reference string based on the namespace and name.
45
+ """
46
+
47
+ if '.' in name:
48
+ namespace, name = name.rsplit('.', 1)
49
+ else:
50
+ namespace = self.common_namespace
51
+
52
+ if not self.common_namespace:
53
+ return f"#/definitions/{name}"
54
+
55
+ # Remove the common namespace and replace '.' with '/'
56
+ namespace_suffix = namespace[len(self.common_namespace):].lstrip('.')
57
+ path = namespace_suffix.replace('.', '/') if namespace_suffix else ''
58
+ ref = f"#/definitions/{path}/{name}" if path else f"#/definitions/{name}"
59
+ return ref
60
+
61
+ def get_qualified_name(self, avro_type: Dict[str, Any]) -> str:
62
+ """
63
+ Construct the qualified name based on the namespace and name.
64
+ """
65
+ return avro_type['name'] if 'namespace' not in avro_type else f"{avro_type['namespace']}.{avro_type['name']}"
66
+
67
+ def avro_primitive_to_json_type(self, avro_type: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
68
+ """
69
+ Map Avro primitive types to JSON types with appropriate format annotations.
70
+ Handles both standard Avro logical types and Avrotize schema extensions.
71
+ """
72
+ json_type = {}
73
+ if isinstance(avro_type, dict):
74
+ # Check for logical type before unwrapping the base type
75
+ logical_type = avro_type.get('logicalType')
76
+ base_type = avro_type.get('type', avro_type)
77
+
78
+ # Handle logical types based on their base type
79
+ if logical_type and isinstance(base_type, str):
80
+ # Standard Avro logical types on int/long
81
+ if base_type == 'int' and logical_type == 'date':
82
+ # Standard Avro: int with date logicalType represents days since epoch
83
+ json_type['type'] = 'integer'
84
+ json_type['format'] = 'int32'
85
+ return json_type
86
+ elif base_type == 'int' and logical_type in ['time-millis']:
87
+ # Standard Avro: int with time-millis represents milliseconds since midnight
88
+ json_type['type'] = 'integer'
89
+ json_type['format'] = 'int32'
90
+ return json_type
91
+ elif base_type == 'long' and logical_type in ['time-micros']:
92
+ # Standard Avro: long with time-micros represents microseconds since midnight
93
+ json_type['type'] = 'integer'
94
+ json_type['format'] = 'int64'
95
+ return json_type
96
+ elif base_type == 'long' and logical_type in ['timestamp-millis', 'timestamp-micros']:
97
+ # Standard Avro: long with timestamp represents milliseconds/microseconds since epoch
98
+ json_type['type'] = 'integer'
99
+ json_type['format'] = 'int64'
100
+ return json_type
101
+ # Avrotize schema extensions: string-based logical types
102
+ elif base_type == 'string' and logical_type == 'date':
103
+ # Avrotize extension: string with date logicalType
104
+ json_type['type'] = 'string'
105
+ json_type['format'] = 'date'
106
+ return json_type
107
+ elif base_type == 'string' and logical_type in ['timestamp-millis', 'timestamp-micros', 'datetime']:
108
+ # Avrotize extension: string with datetime logicalType
109
+ json_type['type'] = 'string'
110
+ json_type['format'] = 'date-time'
111
+ return json_type
112
+ elif base_type == 'string' and logical_type in ['time-millis', 'time-micros', 'time']:
113
+ # Avrotize extension: string with time logicalType
114
+ json_type['type'] = 'string'
115
+ json_type['format'] = 'time'
116
+ return json_type
117
+ elif logical_type == 'decimal':
118
+ json_type['type'] = 'number'
119
+ return json_type
120
+ elif logical_type == 'uuid':
121
+ json_type['type'] = 'string'
122
+ json_type['format'] = 'uuid'
123
+ return json_type
124
+
125
+ # If base_type is still a dict, recurse
126
+ if isinstance(base_type, dict):
127
+ if 'logicalType' in base_type:
128
+ return self.avro_primitive_to_json_type(base_type)
129
+ else:
130
+ raise ValueError(f"Avro schema contains unexpected construct {avro_type}")
131
+
132
+ # No logical type or unhandled combination, process base type
133
+ return self.avro_primitive_to_json_type(base_type)
134
+
135
+ mapping = {
136
+ 'null': {'type': 'null'},
137
+ 'boolean': {'type': 'boolean'},
138
+ 'int': {'type': 'integer', 'format': 'int32'},
139
+ 'long': {'type': 'integer', 'format': 'int64'},
140
+ 'float': {'type': 'number', 'format': 'float'},
141
+ 'double': {'type': 'number', 'format': 'double'},
142
+ 'bytes': {'type': 'string', 'contentEncoding': 'base64'},
143
+ 'string': {'type': 'string'},
144
+ 'fixed': {'type': 'string'} # Could specify length in a format or a separate attribute
145
+ }
146
+ type_ref = mapping.get(avro_type, '') # Defaulting to string type for any unknown types
147
+ if not type_ref:
148
+ raise ValueError(f"Avro schema contains unexpected type {avro_type}")
149
+ return type_ref
150
+
151
+
152
+ def convert_name(self, name: str) -> str:
153
+ """
154
+ Convert names according to the specified naming mode.
155
+ """
156
+ if self.naming_mode == 'snake':
157
+ return self.to_snake_case(name)
158
+ elif self.naming_mode == 'camel':
159
+ return self.to_camel_case(name)
160
+ elif self.naming_mode == 'pascal':
161
+ return self.to_pascal_case(name)
162
+ return name
163
+
164
+ @staticmethod
165
+ def to_snake_case(name: str) -> str:
166
+ return ''.join(['_'+c.lower() if c.isupper() else c for c in name]).lstrip('_')
167
+
168
+ @staticmethod
169
+ def to_camel_case(name: str) -> str:
170
+ return ''.join(word.capitalize() if i else word for i, word in enumerate(name.split('_')))
171
+
172
+ @staticmethod
173
+ def to_pascal_case(name: str) -> str:
174
+ return ''.join(word.capitalize() for word in name.split('_'))
175
+
176
+ def is_nullable(self, avro_type: Union[str, Dict[str, Any]]) -> bool:
177
+ """
178
+ Check if a given Avro type is nullable.
179
+ """
180
+ if isinstance(avro_type, list):
181
+ return 'null' in avro_type
182
+ return avro_type == 'null'
183
+
184
+ def handle_type_union(self, types: List[Union[str, Dict[str, Any]]]) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
185
+ """
186
+ Handle Avro type unions, returning a JSON schema that validates against any of the types.
187
+ """
188
+ non_null_types = [t for t in types if t != 'null']
189
+ if len(non_null_types) == 1:
190
+ # Single non-null type
191
+ return self.parse_avro_schema(non_null_types[0])
192
+ else:
193
+ # Multiple non-null types
194
+ union_types = [self.convert_reference(t) if isinstance(t,str) and t in self.defined_types else self.avro_primitive_to_json_type(t)
195
+ if isinstance(t, str) else self.parse_avro_schema(t)
196
+ for t in non_null_types]
197
+ return {
198
+ 'oneOf': union_types
199
+ }
200
+
201
+ def parse_avro_schema(self, avro_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str, is_root = False) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
202
+ """
203
+ Parse an Avro schema structure and return the corresponding JSON schema.
204
+ """
205
+ if isinstance(avro_schema, list):
206
+ # Type union
207
+ union = self.handle_type_union(avro_schema)
208
+ if is_root:
209
+ # all the definitions go into 'definitions'
210
+ return {
211
+ "$schema": "http://json-schema.org/draft-07/schema#"
212
+ }
213
+ if is_generic_json_type(union):
214
+ return { "type": "object" }
215
+ else:
216
+ return union
217
+ elif isinstance(avro_schema, dict):
218
+ if 'namespace' in avro_schema:
219
+ namespace = avro_schema['namespace']
220
+ self.update_common_namespace(namespace)
221
+ if avro_schema['type'] == 'record':
222
+ return self.convert_record(avro_schema, is_root)
223
+ elif avro_schema['type'] == 'enum':
224
+ return self.convert_enum(avro_schema, is_root)
225
+ elif avro_schema['type'] == 'fixed':
226
+ return self.convert_fixed(avro_schema, is_root)
227
+ elif avro_schema['type'] == 'array':
228
+ return self.convert_array(avro_schema)
229
+ elif avro_schema['type'] == 'map':
230
+ return self.convert_map(avro_schema)
231
+ elif avro_schema['type'] in self.defined_types:
232
+ # Type reference
233
+ return self.convert_reference(avro_schema)
234
+ else:
235
+ # Nested type or a direct type definition
236
+ return self.parse_avro_schema(avro_schema['type'])
237
+ elif isinstance(avro_schema, str):
238
+ # Primitive type or a reference to a defined type
239
+ if avro_schema in self.defined_types:
240
+ return self.convert_reference(avro_schema)
241
+ elif '.' in avro_schema:
242
+ raise ValueError(f"Unknown type reference {avro_schema}")
243
+ else:
244
+ return self.avro_primitive_to_json_type(avro_schema)
245
+
246
+ def convert_reference(self, avro_schema: Dict[str, Any] | str) -> Dict[str, Any]:
247
+ """
248
+ Convert a reference to a defined type to a JSON schema object with a reference to the definition.
249
+ """
250
+ key = avro_schema['type'] if isinstance(avro_schema, dict) else avro_schema
251
+ json_type = self.defined_types[key]
252
+ if 'enum' in json_type:
253
+ return copy.deepcopy(json_type)
254
+ else:
255
+ return {"$ref": self.get_definition_ref(key)}
256
+
257
+ def convert_record(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
258
+ """
259
+ Convert an Avro record type to a JSON schema object, handling nested types and type definitions.
260
+ """
261
+ record_name = self.convert_name(avro_schema['name'])
262
+ properties = {}
263
+ required = []
264
+
265
+ json_schema: Dict[str, Any] = {
266
+ "type": "object",
267
+ "title": record_name
268
+ }
269
+ if not is_root:
270
+ self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
271
+
272
+ for field in avro_schema['fields']:
273
+ field_name = self.convert_name(field['name'])
274
+ prop = self.parse_avro_schema(field['type'])
275
+ if 'doc' in field:
276
+ if isinstance(prop, dict):
277
+ prop['description'] = field['doc']
278
+ elif isinstance(prop, list) or isinstance(prop, str):
279
+ prop = {
280
+ 'allOf': [
281
+ prop,
282
+ {'description': field['doc']}
283
+ ]}
284
+ properties[field_name] = prop
285
+ if not self.is_nullable(field['type']):
286
+ required.append(field_name)
287
+
288
+ if 'doc' in avro_schema:
289
+ json_schema['description'] = avro_schema['doc']
290
+ if properties:
291
+ json_schema['properties'] = properties
292
+
293
+ if required:
294
+ json_schema['required'] = required
295
+
296
+ if not is_root:
297
+ return {"$ref": self.get_definition_ref(self.get_qualified_name(avro_schema))}
298
+ return json_schema
299
+
300
+ def convert_enum(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
301
+ """
302
+ Convert an Avro enum type to a JSON schema enum, adding the definition to the schema.
303
+ """
304
+ enum_name = self.convert_name(avro_schema['name'])
305
+ json_schema = {
306
+ "type": "string",
307
+ "enum": avro_schema['symbols'],
308
+ "title": enum_name
309
+ }
310
+
311
+ if 'doc' in avro_schema:
312
+ json_schema['description'] = avro_schema['doc']
313
+
314
+ # Add to defined types
315
+ if not is_root:
316
+ self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
317
+ return json_schema
318
+
319
+ def convert_fixed(self, avro_schema: Dict[str, Any], is_root=False) -> Dict[str, Any]:
320
+ """
321
+ Convert an Avro fixed type to a JSON schema string with length constraints.
322
+ Fixed types are represented as strings with base16 (hex) encoding and
323
+ minLength and maxLength constraints based on the size.
324
+ """
325
+ fixed_name = self.convert_name(avro_schema['name'])
326
+ size = avro_schema['size']
327
+ # Fixed types in JSON are represented as hex strings, so length is 2 * size
328
+ hex_length = size * 2
329
+
330
+ json_schema = {
331
+ "type": "string",
332
+ "contentEncoding": "base16",
333
+ "minLength": hex_length,
334
+ "maxLength": hex_length,
335
+ "title": fixed_name
336
+ }
337
+
338
+ if 'doc' in avro_schema:
339
+ json_schema['description'] = avro_schema['doc']
340
+
341
+ # Add to defined types
342
+ if not is_root:
343
+ self.defined_types[self.get_qualified_name(avro_schema)] = json_schema
344
+ return {"$ref": self.get_definition_ref(self.get_qualified_name(avro_schema))}
345
+ return json_schema
346
+
347
+ def convert_array(self, avro_schema: Dict[str, Any]) -> Dict[str, Any]:
348
+ """
349
+ Convert an Avro array type to a JSON schema array.
350
+ """
351
+ return {
352
+ "type": "array",
353
+ "items": self.parse_avro_schema(avro_schema['items'])
354
+ }
355
+
356
+ def convert_map(self, avro_schema: Dict[str, Any]) -> Dict[str, Any]:
357
+ """
358
+ Convert an Avro map type to a JSON schema object with additionalProperties.
359
+ """
360
+ return {
361
+ "type": "object",
362
+ "additionalProperties": self.parse_avro_schema(avro_schema['values'])
363
+ }
364
+
365
+ def convert(self, avro_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str) -> Dict[str, Any] | List[Dict[str, Any]| str] | str:
366
+ """
367
+ Convert the root Avro schema to a JSON schema.
368
+ """
369
+ json_schema: Dict[str, Any] | List[Dict[str, Any]| str] | str = self.parse_avro_schema(avro_schema, is_root = True)
370
+
371
+ if self.defined_types and isinstance(json_schema, dict):
372
+ for name, definition in self.defined_types.items():
373
+ if isinstance(definition, dict) and 'enum' in definition:
374
+ # enums are inlined
375
+ continue
376
+ current_level = json_schema.setdefault('definitions', {})
377
+ if '.' in name:
378
+ definition_namespace, definition_name = name.rsplit('.',1)
379
+ if not self.common_namespace or (self.common_namespace and definition_namespace == self.common_namespace):
380
+ definition_namespace = ''
381
+ else:
382
+ definition_namespace = definition_namespace[len(self.common_namespace):].lstrip('.')
383
+ # Split the definition_namespace into path segments
384
+ path_segments = definition_namespace.split('.')
385
+ if definition_namespace and len(path_segments) > 0:
386
+ # Traverse through all but the last segment, creating nested dictionaries as needed
387
+ for segment in path_segments:
388
+ # If the segment does not exist, create a new dictionary at that level
389
+ if segment not in current_level:
390
+ current_level[segment] = {}
391
+ # Move deeper into the nested structure
392
+ current_level = current_level[segment]
393
+ else:
394
+ definition_name = name
395
+ current_level[definition_name] = copy.deepcopy(definition)
396
+
397
+ return json_schema
398
+
399
def compact_tree(json_schema):
    """
    Replace repeated subtrees of `json_schema` with '$ref' pointers, in place.

    The schema is hashed with build_tree_hash_list and grouped with
    group_by_hash (both from avrotize.common). For the first usable group,
    the entry with the shortest path is kept as the canonical occurrence and
    every other dict-valued entry of the group is emptied and turned into a
    '$ref' to it. Returns None.

    NOTE(review): the block nesting below was reconstructed from a rendering
    that lost indentation; in particular the final `break` is assumed to
    belong to the outer `while True` loop - confirm against the real file.
    """
    # NOTE(review): never read anywhere in this function.
    shared_def_counter = 1
    # Hash keys of groups that were rejected as merge candidates
    ignored_hashes = []
    while True:
        thl = build_tree_hash_list(json_schema)
        ghl = group_by_hash(thl)
        if len(ghl) == 0:
            return
        # Order the groups by descending `count` of the first entry of each group
        ghl = dict(sorted(ghl.items(), key=lambda item: -item[1][0].count))
        repeat = True
        while repeat:
            repeat = False
            # First group that has not been rejected yet
            first_group_key = next((key for key in ghl.keys() if key not in ignored_hashes), None)
            if first_group_key is None:
                return
            ghl_top_item_entries = ghl[first_group_key]
            # Order the entries by path depth, shallowest first
            ghl_top_item_entries = sorted(ghl_top_item_entries, key=lambda item: len(item.path.split('.')))
            top_item_entry: NodeHashReference = ghl_top_item_entries[0]
            # presumably paths look like '$.definitions.Name' (index 0 being
            # a root marker) - verify against build_tree_hash_list
            top_item_path_segments = top_item_entry.path.split('.')
            if top_item_path_segments[1] == 'definitions' and len(top_item_path_segments) == 3:
                # The shallowest entry sits right under 'definitions'; the
                # other entries are merged into that one.
                # NOTE(review): def_key is only the definition name here, so
                # the reference written below is '#/<name>' without the
                # 'definitions/' part that the other branches include - confirm.
                def_key = top_item_path_segments[2]
                ghl_top_item_entries.remove(top_item_entry)
            elif ((top_item_path_segments[-1] == 'options' and top_item_path_segments[-2] == 'properties' and len(top_item_path_segments) > 4) and 'oneOf' in top_item_entry.value):
                # An 'options' property holding a 'oneOf' is likely a union for
                # which j2a had to create a wrapper item; undo that by replacing
                # the wrapper (two levels up) with a copy of the 'oneOf' schema.
                json_item = json_schema
                def_key = ''
                for seg in top_item_path_segments[1:-2]:
                    def_key += '/' + seg if def_key else seg
                    json_item = json_item[seg]
                json_item.clear()
                json_item.update(copy.deepcopy(top_item_entry.value))
                ghl_top_item_entries.remove(top_item_entry)
            elif top_item_path_segments[-2] == 'properties' or top_item_path_segments[-1] == 'properties':
                # The shallowest entry is a property of an object; references
                # would point directly into that object, so this hash is
                # dropped and the next group is tried.
                ignored_hashes.append(first_group_key)
                repeat = True
                continue
            else:
                # A proper type declaration: the shallowest entry becomes the
                # one that all other occurrences refer to.
                # NOTE(review): json_item is assigned but not used in this branch.
                json_item = json_schema
                def_key = ''
                for seg in top_item_path_segments[1:]:
                    def_key += '/' + seg if def_key else seg
                ghl_top_item_entries.remove(top_item_entry)

        # Turn every remaining occurrence into a reference to the kept one
        for ghl_item in ghl_top_item_entries:
            node = ghl_item.value
            if isinstance(node,dict):
                node.clear()
                node.update({
                    '$ref': f"#/{def_key}"
                })
        # Only one group is processed per call
        break
457
+
458
+
459
+
460
def convert_avro_to_json_schema(avro_schema_file: str, json_schema_file: str, naming_mode: str = 'default') -> None:
    """
    Convert an Avro schema file to a JSON schema file.

    Both files are read and written as UTF-8 regardless of the platform's
    locale encoding.

    :param avro_schema_file: The path to the input Avro schema file.
    :param json_schema_file: The path to the output JSON schema file.
    :param naming_mode: The naming mode for converting names ('snake', 'camel', 'pascal');
        any other value, including the default, keeps the names unchanged.
    """
    converter = AvroToJsonSchemaConverter(naming_mode)

    # Read the Avro schema file
    with open(avro_schema_file, 'r', encoding='utf-8') as file:
        avro_schema = json.load(file)

    # Convert the Avro schema to JSON schema
    json_schema = converter.convert(avro_schema)

    # Replace repeated subtrees with references (modifies json_schema in place)
    compact_tree(json_schema)

    # Write the JSON schema to the output file
    with open(json_schema_file, 'w', encoding='utf-8') as file:
        json.dump(json_schema, file, indent=4)
481
+