structurize 2.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. avrotize/__init__.py +63 -0
  2. avrotize/__main__.py +6 -0
  3. avrotize/_version.py +34 -0
  4. avrotize/asn1toavro.py +160 -0
  5. avrotize/avrotize.py +152 -0
  6. avrotize/avrotocpp.py +483 -0
  7. avrotize/avrotocsharp.py +992 -0
  8. avrotize/avrotocsv.py +121 -0
  9. avrotize/avrotodatapackage.py +173 -0
  10. avrotize/avrotodb.py +1383 -0
  11. avrotize/avrotogo.py +476 -0
  12. avrotize/avrotographql.py +197 -0
  13. avrotize/avrotoiceberg.py +210 -0
  14. avrotize/avrotojava.py +1023 -0
  15. avrotize/avrotojs.py +250 -0
  16. avrotize/avrotojsons.py +481 -0
  17. avrotize/avrotojstruct.py +345 -0
  18. avrotize/avrotokusto.py +364 -0
  19. avrotize/avrotomd.py +137 -0
  20. avrotize/avrotools.py +168 -0
  21. avrotize/avrotoparquet.py +208 -0
  22. avrotize/avrotoproto.py +359 -0
  23. avrotize/avrotopython.py +622 -0
  24. avrotize/avrotorust.py +435 -0
  25. avrotize/avrotots.py +598 -0
  26. avrotize/avrotoxsd.py +344 -0
  27. avrotize/commands.json +2433 -0
  28. avrotize/common.py +829 -0
  29. avrotize/constants.py +5 -0
  30. avrotize/csvtoavro.py +132 -0
  31. avrotize/datapackagetoavro.py +76 -0
  32. avrotize/dependency_resolver.py +348 -0
  33. avrotize/jsonstoavro.py +1698 -0
  34. avrotize/jsonstostructure.py +2642 -0
  35. avrotize/jstructtoavro.py +878 -0
  36. avrotize/kstructtoavro.py +93 -0
  37. avrotize/kustotoavro.py +455 -0
  38. avrotize/parquettoavro.py +157 -0
  39. avrotize/proto2parser.py +498 -0
  40. avrotize/proto3parser.py +403 -0
  41. avrotize/prototoavro.py +382 -0
  42. avrotize/structuretocsharp.py +2005 -0
  43. avrotize/structuretojsons.py +498 -0
  44. avrotize/structuretopython.py +772 -0
  45. avrotize/xsdtoavro.py +413 -0
  46. structurize-2.16.2.dist-info/METADATA +805 -0
  47. structurize-2.16.2.dist-info/RECORD +51 -0
  48. structurize-2.16.2.dist-info/WHEEL +5 -0
  49. structurize-2.16.2.dist-info/entry_points.txt +2 -0
  50. structurize-2.16.2.dist-info/licenses/LICENSE +201 -0
  51. structurize-2.16.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,382 @@
1
+ """
2
+ Module to convert Protobuf .proto files to Avro schema.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ from typing import Dict, List, Tuple
9
+ from avrotize.common import pascal
10
+ from avrotize.dependency_resolver import sort_messages_by_dependencies, inline_dependencies_of
11
+ from . import proto2parser
12
+ from . import proto3parser
13
+
14
+ AvroSchema = Dict[str, 'AvroSchema'] | List['AvroSchema'] | str | None
15
+
16
class ProtoToAvroConverter:
    """Convert Protobuf .proto files to Avro schema.

    An instance accumulates state across the (possibly recursive) conversion of
    a proto file and its imports:

    * ``imported_types``: Avro types loaded from the bundled ``prototypes``
      ``.avsc`` files for ``google/protobuf/*`` imports, keyed by
      ``namespace.name``.
    * ``generated_types``: qualified names of the enums/records emitted so far.
    * ``forward_references``: qualified names of every enum/record declared in
      a parsed file, collected before conversion so that fields can refer to
      types that are declared later.
    """

    # Protobuf scalar type names that are returned unchanged as the Avro type.
    isomorphic_types = ['float', 'double', 'bytes', 'string']

    def __init__(self, proto_root: str = None):
        """Initialize ProtoToAvroConverter.

        Args:
            proto_root (str): Optional root directory for resolving proto imports.
                When provided, imports are resolved relative to this directory
                first, before falling back to the importing file's directory.
        """
        # Values are the type definitions (dicts) read from the bundled .avsc files.
        self.imported_types: Dict[str, 'AvroSchema'] = {}
        self.generated_types: Dict[str, str] = {}
        self.forward_references: Dict[str, str] = {}  # table for resolving forward references
        self.proto_root: str = proto_root

    def proto_type_to_avro_primitive(self, proto_type: str) -> Tuple[bool, 'AvroSchema']:
        """
        Map Protobuf types to Avro primitive types.

        Args:
            proto_type (str): Protobuf type to convert.

        Returns:
            tuple: ``(True, avro_type)`` when the type has a direct mapping, where
                ``avro_type`` is a string or, for ``google.protobuf.Timestamp``,
                a logical-type dict; ``(False, proto_type)`` otherwise.
        """
        mapping = {
            'google.protobuf.Empty': 'null',  # Special handling may be required
            'bool': 'boolean',
            'int32': 'int',
            'uint32': 'int',
            'sint32': 'int',
            'int64': 'long',
            'uint64': 'long',
            'sint64': 'long',
            'fixed32': 'int',
            'fixed64': 'long',
            'sfixed32': 'int',
            'sfixed64': 'long',
            'google.protobuf.Timestamp': {
                "type": "long",
                "logicalType": "timestamp-micros"
            }
        }
        if proto_type in self.isomorphic_types:
            return True, proto_type
        mapped = mapping.get(proto_type)
        if mapped:
            return True, mapped
        return False, proto_type

    def build_forward_references_from_message(self, proto_message_type: proto2parser.Message | proto3parser.Message, avro_namespace: str):
        """
        Build forward references from a Protobuf message.

        Registers the message itself as a "record" and, recursively, everything
        nested in it under the ``<namespace>.<MessageName>_types`` namespace.

        Args:
            proto_message_type: The message type from the parsed proto file.
            avro_namespace (str): The namespace for the message.
        """
        # Nested declarations live in a namespace derived from the enclosing message.
        nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
        for nested_message in proto_message_type.messages.values():
            self.build_forward_references_from_message(nested_message, nested_namespace)
        for enum_type in proto_message_type.enums.values():
            self.forward_references[nested_namespace + '.' + enum_type.name] = "enum"
        self.forward_references[avro_namespace + '.' + proto_message_type.name] = "record"

    def build_forward_references_from_file(self, proto_file: proto3parser.ProtoFile | proto2parser.ProtoFile, avro_namespace: str):
        """
        Build forward references from a Protobuf file.

        Args:
            proto_file: The parsed proto file.
            avro_namespace (str): The namespace for the top-level declarations.
        """
        for enum_type in proto_file.enums.values():
            self.forward_references[avro_namespace + '.' + enum_type.name] = "enum"
        for message in proto_file.messages.values():
            self.build_forward_references_from_message(message, avro_namespace)

    def _load_bundled_google_types(self, import_: str):
        """Register the types of a ``google/protobuf/*`` import from the bundled .avsc file."""
        script_path = os.path.dirname(os.path.abspath(__file__))
        avsc_dir = os.path.join(script_path, 'prototypes')
        # The .avsc file sits in ./prototypes next to this module and carries
        # the same base name as the imported .proto file.
        avsc = f'{avsc_dir}/{import_.replace("google/protobuf/", "").replace(".proto", ".avsc")}'
        with open(avsc, 'r', encoding='utf-8') as avsc_file:
            types = json.load(avsc_file)
        for t in types:
            qualified_name = t["namespace"] + "." + t["name"]
            self.imported_types[qualified_name] = t

    def _resolve_import_path(self, import_: str, proto_file_path: str) -> str:
        """Return the path of an imported proto file or raise FileNotFoundError."""
        if self.proto_root:
            # Try resolving relative to proto_root first.
            candidate_path = os.path.join(self.proto_root, import_)
            if os.path.exists(candidate_path):
                return candidate_path

        # Fall back to resolving relative to the directory of the current proto file.
        cwd = os.path.join(os.getcwd(), os.path.dirname(proto_file_path))
        candidate_path = os.path.join(cwd, import_)
        if os.path.exists(candidate_path):
            return candidate_path

        raise FileNotFoundError(f'Import file {import_} does not exist. Searched in proto_root: {self.proto_root}, and relative to: {os.path.dirname(proto_file_path)}')

    def convert_proto_to_avro_schema(self, proto_file_path: str, avro_namespace: str, message_type: str) -> list:
        """
        Convert .proto file to Avro schema.

        Args:
            proto_file_path (str): Path to the Protobuf .proto file.
            avro_namespace (str): Namespace for the top-level types of this file.
            message_type (str): Optional name of a single record to extract. When
                falsy, all converted types are returned.

        Returns:
            list: All Avro types, sorted by dependencies, when ``message_type``
                is falsy. Otherwise the single record schema (a dict) with that
                name, after its dependencies have been inlined.

        Raises:
            FileNotFoundError: If a non-google import cannot be located.
            ValueError: If ``message_type`` is given but no such record exists.
        """
        with open(proto_file_path, 'r', encoding='utf-8') as proto_file:
            proto_schema = proto_file.read()

        # Determine whether we have proto3 or proto2 and parse the data
        if re.search(r'syntax\s*=\s*"proto3"', proto_schema):
            data = proto3parser.parse(proto_schema)
        else:
            data = proto2parser.parse(proto_schema)

        # Build forward references
        self.build_forward_references_from_file(data, avro_namespace)

        avro_schema = []
        for import_ in data.imports:
            if import_.startswith('google/protobuf/'):
                self._load_bundled_google_types(import_)
                continue

            import_path = self._resolve_import_path(import_, proto_file_path)
            package_name = pascal(import_.replace('.proto', ''))
            import_namespace = (avro_namespace + '.' + package_name) if avro_namespace else package_name
            # Imports are always converted in full: the requested message_type is
            # selected below from the combined schema. Forwarding message_type
            # here would make the recursive call either raise (message not in the
            # import) or return a single dict that extend() would spread as keys.
            avro_schema.extend(self.convert_proto_to_avro_schema(import_path, import_namespace, None))

        # Convert enum fields
        for enum_type in data.enums.values():
            self.handle_enum(enum_type, avro_schema, avro_namespace)

        # Convert message fields
        for m in data.messages.values():
            self.handle_message(m, avro_schema, avro_namespace)

        if not message_type:
            # Sort the messages in avro_schema by dependencies
            return sort_messages_by_dependencies(avro_schema)

        message_schema = next(
            (message for message in avro_schema if message['type'] == "record" and message['name'] == message_type), None)
        if not message_schema:
            raise ValueError(f'Message type {message_type} not found in the Avro schema.')
        inline_dependencies_of(avro_schema, message_schema)
        return message_schema

    @staticmethod
    def clean_comment(comment: str):
        """
        Clean comments by removing '//' markers and newlines and trimming whitespace.

        Args:
            comment (str): The comment to clean.

        Returns:
            str: Cleaned comment, or None when ``comment`` is empty or None.
        """
        if comment:
            return comment.replace('//', '').replace('\n', '').strip()
        return None

    def _doc_of(self, node):
        """Return the cleaned comment text attached to a parsed node, or None."""
        return self.clean_comment(node.comment.content if node.comment and node.comment.content else None)

    def handle_enum(self, enum_type: proto2parser.Enum | proto3parser.Enum, avro_schema: AvroSchema, avro_namespace: str) -> AvroSchema:
        """
        Convert a Protobuf enum to an Avro enum and append it to the schema list.

        Args:
            enum_type: The enum type from the parsed proto file.
            avro_schema (list): The list to append the converted enum schema.
            avro_namespace (str): The namespace for the enum.

        Returns:
            dict: The Avro enum. In addition to ``symbols`` it carries an
                ``ordinals`` map from symbol name to the proto field number.
        """
        comment = self._doc_of(enum_type)

        # Create avro schema
        avro_enum: AvroSchema = {
            'name': enum_type.name,
            'type': 'enum',
            'namespace': avro_namespace,
            'symbols': [],
            'ordinals': {}
        }

        if comment:
            avro_enum['doc'] = comment
        for value in enum_type.fields:
            avro_enum['symbols'].append(value.name)
            avro_enum['ordinals'][value.name] = int(value.number)
        avro_schema.append(avro_enum)
        self.generated_types[avro_enum['namespace'] + '.' + avro_enum['name']] = "enum"
        return avro_enum

    def handle_message(self, proto_message_type: proto2parser.Message | proto3parser.Message, avro_schema: AvroSchema, avro_namespace: str) -> AvroSchema:
        """
        Convert a Protobuf message to an Avro record and append it to the schema list.

        Regular fields become record fields. Each oneof becomes one field whose
        type is the union (list) of its member types. Nested messages and enums
        that were not already inlined while resolving field types are converted
        afterwards into the ``<namespace>.<MessageName>_types`` namespace.

        Args:
            proto_message_type: The message type from the parsed proto file.
            avro_schema (list): The list to append the converted message schema.
            avro_namespace (str): The namespace for the message.

        Returns:
            dict: The Avro record. A ``dependencies`` key is added when any field
                refers to a forward-declared type.
        """
        dependencies = []

        comment = self._doc_of(proto_message_type)
        avro_record: AvroSchema = {
            'type': 'record',
            'name': proto_message_type.name,
            'namespace': avro_namespace,
            'fields': []
        }
        if comment:
            avro_record['doc'] = comment

        for proto_field in proto_message_type.fields:
            avro_type = self.get_avro_type_for_field(proto_message_type, avro_namespace, avro_schema, dependencies, proto_field)
            avro_field = {
                'name': proto_field.name,
                'type': avro_type,
            }
            comment = self._doc_of(proto_field)
            if comment:
                avro_field['doc'] = comment
            avro_record['fields'].append(avro_field)

        for proto_oneof in proto_message_type.oneofs:
            avro_oneof: AvroSchema = {
                'name': proto_oneof.name,
                'type': []
            }
            comment = self._doc_of(proto_oneof)
            if comment:
                avro_oneof['doc'] = comment
            for oneof_field in proto_oneof.fields:
                # Member comments are not emitted: a union branch has nowhere to
                # carry a doc string. The previous code tried to store the
                # comment with item assignment on the parsed field object, which
                # is only ever read via attributes here; nothing read the value
                # back, and the write fails on any non-mapping object.
                avro_oneof['type'].append(
                    self.get_avro_type_for_field(proto_message_type, avro_namespace, avro_schema, dependencies, oneof_field))
            avro_record['fields'].append(avro_oneof)

        if dependencies:
            avro_record['dependencies'] = dependencies
        avro_schema.append(avro_record)

        # Whatever is still listed as nested was not inlined by a field above
        # (get_avro_type_for_field removes the entries it inlines).
        nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
        for nested_message in proto_message_type.messages.values():
            self.handle_message(nested_message, avro_schema, nested_namespace)
        # Convert enum fields
        for enum_type in proto_message_type.enums.values():
            self.handle_enum(enum_type, avro_schema, nested_namespace)
        self.generated_types[avro_record['namespace'] + '.' + avro_record['name']] = "record"
        return avro_record

    def get_avro_type_for_field(self, proto_message_type: proto2parser.Message | proto3parser.Message, avro_namespace: str, avro_schema: AvroSchema, dependencies: List[str], proto_field: proto2parser.Field | proto3parser.Field):
        """
        Get Avro type for a Protobuf field.

        Args:
            proto_message_type: The message that declares the field. Nested
                messages/enums that get inlined are removed from it.
            avro_namespace (str): The namespace for the message.
            avro_schema (list): The schema list that inlined nested types are
                appended to (via handle_message / handle_enum).
            dependencies (list): The list to append the dependencies.
            proto_field: The field from the parsed proto file.

        Returns:
            str, dict or list: Corresponding Avro type. ``repeated`` fields are
                wrapped in an array, ``map`` fields in a map, and ``optional``
                fields in a ``["null", type]`` union.
        """
        # For repeated and map fields the element/value type is what gets mapped.
        proto_field_type = proto_field.val_type if proto_field.label == 'repeated' or proto_field.type == 'map' else proto_field.type
        is_primitive, avro_field_type = self.proto_type_to_avro_primitive(proto_field_type)

        if not is_primitive:
            # NOTE(review): the lookup uses proto_field.type, not the resolved
            # proto_field_type used elsewhere -- confirm against the parsers.
            if proto_field.type in self.imported_types:
                avro_field_type = self.imported_types[proto_field.type]
            else:
                avro_field_type = avro_namespace + '.' + avro_field_type
                nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
                found_in_nested_definitions = False

                for k, nested_proto_message_type in proto_message_type.messages.items():
                    if nested_proto_message_type.name == proto_field_type:
                        avro_field_type = self.handle_message(nested_proto_message_type, avro_schema, nested_namespace)
                        # Safe to delete while iterating: the loop is left right after.
                        del proto_message_type.messages[k]
                        if 'dependencies' in avro_field_type:
                            # The inlined record's dependencies move up to the enclosing record.
                            dependencies.extend(avro_field_type['dependencies'])
                            del avro_field_type['dependencies']
                        found_in_nested_definitions = True
                        break

                if not found_in_nested_definitions:
                    for k, nested_proto_enum_type in proto_message_type.enums.items():
                        if nested_proto_enum_type.name == proto_field_type:
                            avro_field_type = self.handle_enum(nested_proto_enum_type, avro_schema, nested_namespace)
                            del proto_message_type.enums[k]
                            found_in_nested_definitions = True
                            break

                if not found_in_nested_definitions:
                    # Search outward for a forward-declared type: each step drops
                    # the innermost namespace segment in front of the type name.
                    dependency_avro_field_type = avro_field_type
                    while '.' in dependency_avro_field_type:
                        if dependency_avro_field_type in self.forward_references:
                            dependencies.append(dependency_avro_field_type)
                            break
                        n = dependency_avro_field_type.split('.')
                        dependency_avro_field_type = '.'.join(n[:-2] + [n[-1]])

        if proto_field.label == 'optional':
            avro_field_type = ["null", avro_field_type]
        if proto_field.label == 'repeated':
            avro_type: AvroSchema = {
                "type": "array",
                "items": avro_field_type
            }
        elif proto_field.type == 'map':
            avro_type: AvroSchema = {
                "type": "map",
                "values": avro_field_type,
            }
        else:
            avro_type = avro_field_type
        return avro_type
353
+
354
+
355
def convert_proto_to_avro(proto_file_path: str, avro_schema_path: str, namespace: str = None, message_type: str = None, proto_root: str = None):
    """
    Read a Protobuf .proto file and write the equivalent Avro schema as JSON.

    Args:
        proto_file_path (str): Path to the Protobuf .proto file.
        avro_schema_path (str): Path the Avro schema (.avsc) is written to.
        namespace (str): Optional namespace for the Avro schema. When omitted,
            it is derived from the proto file's base name.
        message_type (str): Optional specific message type to extract.
        proto_root (str): Optional root directory for resolving proto imports.
            When provided, imports are resolved relative to this directory.

    Raises:
        FileNotFoundError: If the proto file does not exist.
    """
    proto_is_missing = not os.path.exists(proto_file_path)
    if proto_is_missing:
        raise FileNotFoundError(f'Proto file {proto_file_path} does not exist.')

    proto_converter = ProtoToAvroConverter(proto_root=proto_root)
    # No explicit namespace: fall back to the file name without its extension.
    effective_namespace = namespace or pascal(os.path.basename(proto_file_path).replace('.proto', ''))
    schema = proto_converter.convert_proto_to_avro_schema(proto_file_path, effective_namespace, message_type)

    # Serialize the Avro schema to indented JSON and store it at the target path.
    with open(avro_schema_path, 'w', encoding='utf-8') as out:
        out.write(json.dumps(schema, indent=2))
382
+