avrotize 2.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. avrotize/__init__.py +66 -0
  2. avrotize/__main__.py +6 -0
  3. avrotize/_version.py +34 -0
  4. avrotize/asn1toavro.py +160 -0
  5. avrotize/avrotize.py +152 -0
  6. avrotize/avrotocpp/CMakeLists.txt.jinja +77 -0
  7. avrotize/avrotocpp/build.bat.jinja +7 -0
  8. avrotize/avrotocpp/build.sh.jinja +7 -0
  9. avrotize/avrotocpp/dataclass_body.jinja +108 -0
  10. avrotize/avrotocpp/vcpkg.json.jinja +21 -0
  11. avrotize/avrotocpp.py +483 -0
  12. avrotize/avrotocsharp/README.md.jinja +166 -0
  13. avrotize/avrotocsharp/class_test.cs.jinja +266 -0
  14. avrotize/avrotocsharp/dataclass_core.jinja +293 -0
  15. avrotize/avrotocsharp/enum_test.cs.jinja +20 -0
  16. avrotize/avrotocsharp/project.csproj.jinja +30 -0
  17. avrotize/avrotocsharp/project.sln.jinja +34 -0
  18. avrotize/avrotocsharp/run_coverage.ps1.jinja +98 -0
  19. avrotize/avrotocsharp/run_coverage.sh.jinja +149 -0
  20. avrotize/avrotocsharp/testproject.csproj.jinja +19 -0
  21. avrotize/avrotocsharp.py +1180 -0
  22. avrotize/avrotocsv.py +121 -0
  23. avrotize/avrotodatapackage.py +173 -0
  24. avrotize/avrotodb.py +1383 -0
  25. avrotize/avrotogo/go_enum.jinja +12 -0
  26. avrotize/avrotogo/go_helpers.jinja +31 -0
  27. avrotize/avrotogo/go_struct.jinja +151 -0
  28. avrotize/avrotogo/go_test.jinja +47 -0
  29. avrotize/avrotogo/go_union.jinja +38 -0
  30. avrotize/avrotogo.py +476 -0
  31. avrotize/avrotographql.py +197 -0
  32. avrotize/avrotoiceberg.py +210 -0
  33. avrotize/avrotojava/class_test.java.jinja +212 -0
  34. avrotize/avrotojava/enum_test.java.jinja +21 -0
  35. avrotize/avrotojava/testproject.pom.jinja +54 -0
  36. avrotize/avrotojava.py +2156 -0
  37. avrotize/avrotojs.py +250 -0
  38. avrotize/avrotojsons.py +481 -0
  39. avrotize/avrotojstruct.py +345 -0
  40. avrotize/avrotokusto.py +364 -0
  41. avrotize/avrotomd/README.md.jinja +49 -0
  42. avrotize/avrotomd.py +137 -0
  43. avrotize/avrotools.py +168 -0
  44. avrotize/avrotoparquet.py +208 -0
  45. avrotize/avrotoproto.py +359 -0
  46. avrotize/avrotopython/dataclass_core.jinja +241 -0
  47. avrotize/avrotopython/enum_core.jinja +87 -0
  48. avrotize/avrotopython/pyproject_toml.jinja +18 -0
  49. avrotize/avrotopython/test_class.jinja +97 -0
  50. avrotize/avrotopython/test_enum.jinja +23 -0
  51. avrotize/avrotopython.py +626 -0
  52. avrotize/avrotorust/dataclass_enum.rs.jinja +74 -0
  53. avrotize/avrotorust/dataclass_struct.rs.jinja +204 -0
  54. avrotize/avrotorust/dataclass_union.rs.jinja +105 -0
  55. avrotize/avrotorust.py +435 -0
  56. avrotize/avrotots/class_core.ts.jinja +140 -0
  57. avrotize/avrotots/class_test.ts.jinja +77 -0
  58. avrotize/avrotots/enum_core.ts.jinja +46 -0
  59. avrotize/avrotots/gitignore.jinja +34 -0
  60. avrotize/avrotots/index.ts.jinja +0 -0
  61. avrotize/avrotots/package.json.jinja +23 -0
  62. avrotize/avrotots/tsconfig.json.jinja +21 -0
  63. avrotize/avrotots.py +687 -0
  64. avrotize/avrotoxsd.py +344 -0
  65. avrotize/cddltostructure.py +1841 -0
  66. avrotize/commands.json +3496 -0
  67. avrotize/common.py +834 -0
  68. avrotize/constants.py +87 -0
  69. avrotize/csvtoavro.py +132 -0
  70. avrotize/datapackagetoavro.py +76 -0
  71. avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
  72. avrotize/dependencies/cs/net90/dependencies.csproj +29 -0
  73. avrotize/dependencies/go/go121/go.mod +6 -0
  74. avrotize/dependencies/java/jdk21/pom.xml +91 -0
  75. avrotize/dependencies/python/py312/requirements.txt +13 -0
  76. avrotize/dependencies/rust/stable/Cargo.toml +17 -0
  77. avrotize/dependencies/typescript/node22/package.json +16 -0
  78. avrotize/dependency_resolver.py +348 -0
  79. avrotize/dependency_version.py +432 -0
  80. avrotize/generic/generic.avsc +57 -0
  81. avrotize/jsonstoavro.py +2167 -0
  82. avrotize/jsonstostructure.py +2864 -0
  83. avrotize/jstructtoavro.py +878 -0
  84. avrotize/kstructtoavro.py +93 -0
  85. avrotize/kustotoavro.py +455 -0
  86. avrotize/openapitostructure.py +717 -0
  87. avrotize/parquettoavro.py +157 -0
  88. avrotize/proto2parser.py +498 -0
  89. avrotize/proto3parser.py +403 -0
  90. avrotize/prototoavro.py +382 -0
  91. avrotize/prototypes/any.avsc +19 -0
  92. avrotize/prototypes/api.avsc +106 -0
  93. avrotize/prototypes/duration.avsc +20 -0
  94. avrotize/prototypes/field_mask.avsc +18 -0
  95. avrotize/prototypes/struct.avsc +60 -0
  96. avrotize/prototypes/timestamp.avsc +20 -0
  97. avrotize/prototypes/type.avsc +253 -0
  98. avrotize/prototypes/wrappers.avsc +117 -0
  99. avrotize/structuretocddl.py +597 -0
  100. avrotize/structuretocpp/CMakeLists.txt.jinja +76 -0
  101. avrotize/structuretocpp/build.bat.jinja +3 -0
  102. avrotize/structuretocpp/build.sh.jinja +3 -0
  103. avrotize/structuretocpp/dataclass_body.jinja +50 -0
  104. avrotize/structuretocpp/vcpkg.json.jinja +11 -0
  105. avrotize/structuretocpp.py +697 -0
  106. avrotize/structuretocsharp/class_test.cs.jinja +180 -0
  107. avrotize/structuretocsharp/dataclass_core.jinja +156 -0
  108. avrotize/structuretocsharp/enum_test.cs.jinja +36 -0
  109. avrotize/structuretocsharp/json_structure_converters.cs.jinja +399 -0
  110. avrotize/structuretocsharp/program.cs.jinja +49 -0
  111. avrotize/structuretocsharp/project.csproj.jinja +17 -0
  112. avrotize/structuretocsharp/project.sln.jinja +34 -0
  113. avrotize/structuretocsharp/testproject.csproj.jinja +18 -0
  114. avrotize/structuretocsharp/tuple_converter.cs.jinja +121 -0
  115. avrotize/structuretocsharp.py +2295 -0
  116. avrotize/structuretocsv.py +365 -0
  117. avrotize/structuretodatapackage.py +659 -0
  118. avrotize/structuretodb.py +1125 -0
  119. avrotize/structuretogo/go_enum.jinja +12 -0
  120. avrotize/structuretogo/go_helpers.jinja +26 -0
  121. avrotize/structuretogo/go_interface.jinja +18 -0
  122. avrotize/structuretogo/go_struct.jinja +187 -0
  123. avrotize/structuretogo/go_test.jinja +70 -0
  124. avrotize/structuretogo.py +729 -0
  125. avrotize/structuretographql.py +502 -0
  126. avrotize/structuretoiceberg.py +355 -0
  127. avrotize/structuretojava/choice_core.jinja +34 -0
  128. avrotize/structuretojava/class_core.jinja +23 -0
  129. avrotize/structuretojava/enum_core.jinja +18 -0
  130. avrotize/structuretojava/equals_hashcode.jinja +30 -0
  131. avrotize/structuretojava/pom.xml.jinja +26 -0
  132. avrotize/structuretojava/tuple_core.jinja +49 -0
  133. avrotize/structuretojava.py +938 -0
  134. avrotize/structuretojs/class_core.js.jinja +33 -0
  135. avrotize/structuretojs/enum_core.js.jinja +10 -0
  136. avrotize/structuretojs/package.json.jinja +12 -0
  137. avrotize/structuretojs/test_class.js.jinja +84 -0
  138. avrotize/structuretojs/test_enum.js.jinja +58 -0
  139. avrotize/structuretojs/test_runner.js.jinja +45 -0
  140. avrotize/structuretojs.py +657 -0
  141. avrotize/structuretojsons.py +498 -0
  142. avrotize/structuretokusto.py +639 -0
  143. avrotize/structuretomd/README.md.jinja +204 -0
  144. avrotize/structuretomd.py +322 -0
  145. avrotize/structuretoproto.py +764 -0
  146. avrotize/structuretopython/dataclass_core.jinja +363 -0
  147. avrotize/structuretopython/enum_core.jinja +45 -0
  148. avrotize/structuretopython/map_alias.jinja +21 -0
  149. avrotize/structuretopython/pyproject_toml.jinja +23 -0
  150. avrotize/structuretopython/test_class.jinja +103 -0
  151. avrotize/structuretopython/test_enum.jinja +34 -0
  152. avrotize/structuretopython.py +799 -0
  153. avrotize/structuretorust/dataclass_enum.rs.jinja +63 -0
  154. avrotize/structuretorust/dataclass_struct.rs.jinja +121 -0
  155. avrotize/structuretorust/dataclass_union.rs.jinja +81 -0
  156. avrotize/structuretorust.py +714 -0
  157. avrotize/structuretots/class_core.ts.jinja +78 -0
  158. avrotize/structuretots/enum_core.ts.jinja +6 -0
  159. avrotize/structuretots/gitignore.jinja +8 -0
  160. avrotize/structuretots/index.ts.jinja +1 -0
  161. avrotize/structuretots/package.json.jinja +39 -0
  162. avrotize/structuretots/test_class.ts.jinja +35 -0
  163. avrotize/structuretots/tsconfig.json.jinja +21 -0
  164. avrotize/structuretots.py +740 -0
  165. avrotize/structuretoxsd.py +679 -0
  166. avrotize/xsdtoavro.py +413 -0
  167. avrotize-2.21.1.dist-info/METADATA +1319 -0
  168. avrotize-2.21.1.dist-info/RECORD +171 -0
  169. avrotize-2.21.1.dist-info/WHEEL +4 -0
  170. avrotize-2.21.1.dist-info/entry_points.txt +3 -0
  171. avrotize-2.21.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,382 @@
1
+ """
2
+ Module to convert Protobuf .proto files to Avro schema.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import re
8
+ from typing import Dict, List, Tuple
9
+ from avrotize.common import pascal
10
+ from avrotize.dependency_resolver import sort_messages_by_dependencies, inline_dependencies_of
11
+ from . import proto2parser
12
+ from . import proto3parser
13
+
14
# Recursive type alias for an Avro schema node: a mapping (record/enum/array
# definition), a list of nodes (a union or a top-level schema list), a
# type-name string, or None.
AvroSchema = Dict[str, 'AvroSchema'] | List['AvroSchema'] | str | None
15
+
16
+ class ProtoToAvroConverter:
17
+ """Class to convert Protobuf .proto files to Avro schema."""
18
+
19
+ isomorphic_types = ['float', 'double', 'bytes', 'string']
20
+
21
+ def __init__(self, proto_root: str = None):
22
+ """Initialize ProtoToAvroConverter.
23
+
24
+ Args:
25
+ proto_root (str): Optional root directory for resolving proto imports.
26
+ When provided, imports are resolved relative to this directory.
27
+ """
28
+ self.imported_types: Dict[str, str] = {}
29
+ self.generated_types: Dict[str, str] = {}
30
+ self.forward_references: Dict[str, str] = {} # table for resolvbing forward references
31
+ self.proto_root: str = proto_root
32
+
33
+ def proto_type_to_avro_primitive(self, proto_type: str)-> Tuple[bool, str]:
34
+ """
35
+ Map Protobuf types to Avro primitive types.
36
+
37
+ Args:
38
+ proto_type (str): Protobuf type to convert.
39
+
40
+ Returns:
41
+ str or dict: Corresponding Avro type.
42
+ """
43
+ mapping = {
44
+ 'google.protobuf.Empty': 'null', # Special handling may be required
45
+ 'bool': 'boolean',
46
+ 'int32': 'int',
47
+ 'uint32': 'int',
48
+ 'sint32': 'int',
49
+ 'int64': 'long',
50
+ 'uint64': 'long',
51
+ 'sint64': 'long',
52
+ 'fixed32': 'int',
53
+ 'fixed64': 'long',
54
+ 'sfixed32': 'int',
55
+ 'sfixed64': 'long',
56
+ 'google.protobuf.Timestamp': {
57
+ "type": "long",
58
+ "logicalType": "timestamp-micros"
59
+ }
60
+ }
61
+ if proto_type in self.isomorphic_types:
62
+ return True, proto_type
63
+ mapped = mapping.get(proto_type, None)
64
+ if mapped:
65
+ return True, mapped
66
+ return False, proto_type
67
+
68
+ def build_forward_references_from_message(self, proto_message_type: proto2parser.Message | proto3parser.Message, avro_namespace: str):
69
+ """
70
+ Build forward references from a Protobuf message.
71
+
72
+ Args:
73
+ proto_message_type: The message type from the parsed proto file.
74
+ avro_namespace (str): The namespace for the message.
75
+ """
76
+ for _, nested_message in proto_message_type.messages.items():
77
+ nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
78
+ self.build_forward_references_from_message(nested_message, nested_namespace)
79
+ for _, enum_type in proto_message_type.enums.items():
80
+ nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
81
+ self.forward_references[nested_namespace+'.'+enum_type.name] = "enum"
82
+ self.forward_references[avro_namespace+'.'+proto_message_type.name] = "record"
83
+
84
+ def build_forward_references_from_file(self, proto_file: proto3parser.ProtoFile| proto2parser.ProtoFile, avro_namespace: str):
85
+ """
86
+ Build forward references from a Protobuf file.
87
+
88
+ Args:
89
+ proto_file: The parsed proto file.
90
+ avro_namespace (str): The namespace for the message.
91
+ """
92
+ for _, enum_type in proto_file.enums.items():
93
+ self.forward_references[avro_namespace+'.'+enum_type.name] = "enum"
94
+ for _, message in proto_file.messages.items():
95
+ self.build_forward_references_from_message(message, avro_namespace)
96
+
97
+ def convert_proto_to_avro_schema(self, proto_file_path: str, avro_namespace: str, message_type: str) -> list:
98
+ """
99
+ Convert .proto file to Avro schema.
100
+
101
+ Args:
102
+ proto_file_path (str): Path to the Protobuf .proto file.
103
+
104
+ Returns:
105
+ list: Avro schema as a list of dictionaries.
106
+ """
107
+ with open(proto_file_path, 'r', encoding='utf-8') as proto_file:
108
+ proto_schema = proto_file.read()
109
+
110
+ # Determine whether we have proto3 or proto2 and parse the data
111
+ if re.search(r'syntax\s*=\s*"proto3"', proto_schema):
112
+ data: proto3parser.ProtoFile = proto3parser.parse(proto_schema)
113
+ else:
114
+ data: proto2parser.ProtoFile = proto2parser.parse(proto_schema)
115
+
116
+ # Build forward references
117
+ self.build_forward_references_from_file(data, avro_namespace)
118
+ # Avro schema header
119
+ avro_schema = []
120
+ for import_ in data.imports:
121
+ # Handle protobuf imports
122
+ if import_.startswith('google/protobuf/'):
123
+ script_path = os.path.dirname(os.path.abspath(__file__))
124
+ avsc_dir = os.path.join(script_path, 'prototypes')
125
+ # Load the corresponding avsc file from ./prototypes at this script's path into avro_schema
126
+ avsc = f'{avsc_dir}/{import_.replace("google/protobuf/", "").replace(".proto", ".avsc")}'
127
+ with open(avsc, 'r', encoding='utf-8') as avsc_file:
128
+ types = json.load(avsc_file)
129
+ for t in types:
130
+ qualified_name = t["namespace"] + "." + t["name"]
131
+ self.imported_types[qualified_name] = t
132
+ else:
133
+ # Resolve import path: try proto_root first, then fall back to file-relative path
134
+ import_path = None
135
+
136
+ if self.proto_root:
137
+ # Try resolving relative to proto_root
138
+ candidate_path = os.path.join(self.proto_root, import_)
139
+ if os.path.exists(candidate_path):
140
+ import_path = candidate_path
141
+
142
+ if not import_path:
143
+ # Fall back to resolving relative to the directory of the current proto file
144
+ cwd = os.path.join(os.getcwd(), os.path.dirname(proto_file_path))
145
+ candidate_path = os.path.join(cwd, import_)
146
+ if os.path.exists(candidate_path):
147
+ import_path = candidate_path
148
+
149
+ # Raise an exception if the imported file does not exist
150
+ if not import_path:
151
+ raise FileNotFoundError(f'Import file {import_} does not exist. Searched in proto_root: {self.proto_root}, and relative to: {os.path.dirname(proto_file_path)}')
152
+
153
+ package_name = pascal(import_.replace('.proto', ''))
154
+ import_namespace = (avro_namespace + '.' + package_name) if avro_namespace else package_name
155
+ avro_schema.extend(self.convert_proto_to_avro_schema(import_path, import_namespace, message_type))
156
+
157
+
158
+ # Convert enum fields
159
+ for _, enum_type in data.enums.items():
160
+ self.handle_enum(enum_type, avro_schema, avro_namespace)
161
+
162
+ # Convert message fields
163
+ for _, m in data.messages.items():
164
+ self.handle_message(m, avro_schema, avro_namespace)
165
+
166
+
167
+ # Sort the messages in avro_schema by dependencies
168
+ if message_type:
169
+ message_schema = next(
170
+ (message for message in avro_schema if message['type'] == "record" and message['name'] == message_type), None)
171
+ if not message_schema:
172
+ raise ValueError(f'Message type {message_type} not found in the Avro schema.')
173
+ else:
174
+ inline_dependencies_of(avro_schema, message_schema)
175
+ return message_schema
176
+ else:
177
+ avro_schema = sort_messages_by_dependencies(avro_schema)
178
+ return avro_schema
179
+
180
+ @staticmethod
181
+ def clean_comment(comment: str):
182
+ """
183
+ Clean comments by stripping slashes, newlines, linefeeds, and extra whitespace.
184
+
185
+ Args:
186
+ comment (str): The comment to clean.
187
+
188
+ Returns:
189
+ str: Cleaned comment.
190
+ """
191
+ if comment:
192
+ return comment.replace('//', '').replace('\n', '').lstrip().rstrip()
193
+ return None
194
+
195
+ def handle_enum(self, enum_type: proto2parser.Enum | proto3parser.Enum, avro_schema: AvroSchema, avro_namespace: str) -> AvroSchema:
196
+ """
197
+ Convert enum fields to avro schema.
198
+
199
+ Args:
200
+ enum_type: The enum type from the parsed proto file.
201
+ avro_schema (list): The list to append the converted enum schema.
202
+ namespace (str): The namespace for the enum.
203
+ """
204
+ comment = self.clean_comment(
205
+ enum_type.comment.content if enum_type.comment and enum_type.comment.content else None)
206
+
207
+ # Create avro schema
208
+ avro_enum: AvroSchema = {
209
+ 'name': enum_type.name,
210
+ 'type': 'enum',
211
+ 'namespace': avro_namespace,
212
+ 'symbols': [],
213
+ 'ordinals': {}
214
+ }
215
+
216
+ if comment:
217
+ avro_enum['doc'] = comment
218
+ for value in enum_type.fields:
219
+ avro_enum['symbols'].append(value.name)
220
+ avro_enum['ordinals'][value.name] = int(value.number)
221
+ avro_schema.append(avro_enum)
222
+ self.generated_types[avro_enum['namespace']+'.'+avro_enum['name']] = "enum"
223
+ return avro_enum
224
+
225
+ def handle_message(self, proto_message_type: proto2parser.Message | proto3parser.Message, avro_schema: AvroSchema, avro_namespace: str)-> AvroSchema:
226
+ """
227
+ Convert protobuf messages to avro records.
228
+
229
+ Args:
230
+ m: The message type from the parsed proto file.
231
+ avro_schema (list): The list to append the converted message schema.
232
+ namespace (str): The namespace for the message.
233
+ """
234
+ dependencies = []
235
+
236
+ comment = self.clean_comment(proto_message_type.comment.content if proto_message_type.comment and proto_message_type.comment.content else None)
237
+ avro_record: AvroSchema = {
238
+ 'type': 'record',
239
+ 'name': proto_message_type.name,
240
+ 'namespace': avro_namespace,
241
+ 'fields': []
242
+ }
243
+ if comment:
244
+ avro_record['doc'] = comment
245
+ for proto_field in proto_message_type.fields:
246
+ avro_type = self.get_avro_type_for_field(proto_message_type, avro_namespace, avro_schema, dependencies, proto_field)
247
+ comment = self.clean_comment(proto_field.comment.content if proto_field.comment and proto_field.comment.content else None)
248
+
249
+ avro_field = {
250
+ 'name': proto_field.name,
251
+ 'type': avro_type,
252
+ }
253
+
254
+ if comment:
255
+ avro_field['doc'] = comment
256
+
257
+ avro_record['fields'].append(avro_field)
258
+
259
+ for proto_field in proto_message_type.oneofs:
260
+ avro_oneof: AvroSchema = {
261
+ 'name': proto_field.name,
262
+ 'type': []
263
+ }
264
+ comment = self.clean_comment(proto_field.comment.content if proto_field.comment and proto_field.comment.content else None)
265
+ if comment:
266
+ avro_oneof['doc'] = comment
267
+ for oneof_field in proto_field.fields:
268
+ avro_type = self.get_avro_type_for_field(proto_message_type, avro_namespace, avro_schema, dependencies, oneof_field)
269
+ comment = self.clean_comment(oneof_field.comment.content if oneof_field.comment and oneof_field.comment.content else None)
270
+ if comment:
271
+ oneof_field['doc'] = comment
272
+ avro_oneof['type'].append(avro_type)
273
+ avro_record['fields'].append(avro_oneof)
274
+
275
+ if dependencies:
276
+ avro_record['dependencies'] = dependencies
277
+ avro_schema.append(avro_record)
278
+ for _, nested_message in proto_message_type.messages.items():
279
+ nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
280
+ self.handle_message(nested_message, avro_schema, nested_namespace)
281
+ # Convert enum fields
282
+ for _, enum_type in proto_message_type.enums.items():
283
+ nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
284
+ self.handle_enum(enum_type, avro_schema, nested_namespace)
285
+ self.generated_types[avro_record['namespace']+'.'+avro_record['name']] = "record"
286
+ return avro_record
287
+
288
+ def get_avro_type_for_field(self, proto_message_type: proto2parser.Message | proto3parser.Message, avro_namespace: str, avro_schema: AvroSchema, dependencies: List[str], proto_field: proto2parser.Field | proto3parser.Field):
289
+ """
290
+ Get Avro type for a Protobuf field.
291
+
292
+ Args:
293
+ m: The message type from the parsed proto file.
294
+ namespace (str): The namespace for the message.
295
+ dependencies (list): The list to append the dependencies.
296
+ f: The field from the parsed proto file.
297
+
298
+ Returns:
299
+ str or dict: Corresponding Avro type.
300
+ """
301
+ avro_field_type: AvroSchema = None
302
+ proto_field_type = proto_field.val_type if proto_field.label == 'repeated' or proto_field.type == 'map' else proto_field.type
303
+ is_primitive, avro_field_type = self.proto_type_to_avro_primitive(proto_field_type)
304
+
305
+ if not is_primitive:
306
+ if proto_field.type in self.imported_types:
307
+ avro_field_type = self.imported_types[proto_field.type]
308
+ else:
309
+ avro_field_type = avro_namespace + '.' + avro_field_type
310
+ found_in_nested_definitions = False
311
+ for k, nested_proto_message_type in proto_message_type.messages.items():
312
+ nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
313
+ if nested_proto_message_type.name == proto_field_type:
314
+ avro_field_type = self.handle_message(nested_proto_message_type, avro_schema, nested_namespace)
315
+ del proto_message_type.messages[k]
316
+ if 'dependencies' in avro_field_type:
317
+ dependencies.extend(avro_field_type['dependencies'])
318
+ del avro_field_type['dependencies']
319
+ found_in_nested_definitions = True
320
+ break
321
+ if not found_in_nested_definitions:
322
+ for k, nested_proto_enum_type in proto_message_type.enums.items():
323
+ nested_namespace = avro_namespace + '.' + proto_message_type.name + '_types'
324
+ if nested_proto_enum_type.name == proto_field_type:
325
+ avro_field_type = self.handle_enum(nested_proto_enum_type, avro_schema, nested_namespace)
326
+ del proto_message_type.enums[k]
327
+ found_in_nested_definitions = True
328
+ break
329
+ if not found_in_nested_definitions:
330
+ dependency_avro_field_type = avro_field_type
331
+ while '.' in dependency_avro_field_type:
332
+ if dependency_avro_field_type in self.forward_references:
333
+ dependencies.append(dependency_avro_field_type)
334
+ break
335
+ n = dependency_avro_field_type.split('.')
336
+ dependency_avro_field_type = '.'.join(n[:-2]+[n[-1]])
337
+
338
+ if proto_field.label == 'optional':
339
+ avro_field_type = ["null", avro_field_type]
340
+ if proto_field.label == 'repeated':
341
+ avro_type: AvroSchema = {
342
+ "type": "array",
343
+ "items": avro_field_type
344
+ }
345
+ elif proto_field.type == 'map':
346
+ avro_type: AvroSchema = {
347
+ "type": "map",
348
+ "values": avro_field_type,
349
+ }
350
+ else:
351
+ avro_type = avro_field_type
352
+ return avro_type
353
+
354
+
355
+ def convert_proto_to_avro(proto_file_path: str, avro_schema_path: str, namespace: str = None, message_type: str = None, proto_root: str = None):
356
+ """
357
+ Convert Protobuf .proto file to Avro schema.
358
+
359
+ Args:
360
+ proto_file_path (str): Path to the Protobuf .proto file.
361
+ avro_schema_path (str): Path to save the Avro schema .avsc file.
362
+ namespace (str): Optional namespace for the Avro schema.
363
+ message_type (str): Optional specific message type to extract.
364
+ proto_root (str): Optional root directory for resolving proto imports.
365
+ When provided, imports are resolved relative to this directory.
366
+
367
+ Raises:
368
+ FileNotFoundError: If the proto file does not exist.
369
+ ValueError: If the file extensions are incorrect.
370
+ """
371
+ if not os.path.exists(proto_file_path):
372
+ raise FileNotFoundError(f'Proto file {proto_file_path} does not exist.')
373
+
374
+ converter = ProtoToAvroConverter(proto_root=proto_root)
375
+ if not namespace:
376
+ namespace = pascal(os.path.basename(proto_file_path).replace('.proto', ''))
377
+ avro_schema = converter.convert_proto_to_avro_schema(proto_file_path, namespace, message_type)
378
+
379
+ # Convert the Avro schema to JSON and write it to the file
380
+ with open(avro_schema_path, 'w', encoding='utf-8') as avro_file:
381
+ avro_file.write(json.dumps(avro_schema, indent=2))
382
+
@@ -0,0 +1,19 @@
1
+ [
2
+ {
3
+ "type": "record",
4
+ "name": "Any",
5
+ "namespace": "google.protobuf",
6
+ "fields": [
7
+ {
8
+ "name": "type_url",
9
+ "type": "string",
10
+ "doc": "A URL/resource name that uniquely identifies the type of the serialized protocol buffer message. This string must contain at least one \"/\" character. The last segment of the URL's path must represent the fully qualified name of the type (as in `path/google.protobuf.Duration`). The name should be in a canonical form (e.g., leading \".\" is not accepted). In practice, teams usually precompile into the binary all types that they expect it to use in the context of Any. However, for URLs which use the scheme `http`, `https`, or no scheme, one can optionally set up a type server that maps type URLs to message definitions as follows: * If no scheme is provided, `https` is assumed. * An HTTP GET on the URL must yield a [google.protobuf.Type][] value in binary format, or produce an error. * Applications are allowed to cache lookup results based on the URL, or have them precompiled into a binary to avoid any lookup. Therefore, binary compatibility needs to be preserved on changes to types. (Use versioned type names to manage breaking changes.) Note: this functionality is not currently available in the official protobuf release, and it is not used for type URLs beginning with type.googleapis.com. As of May 2023, there are no widely used type server implementations and no plans to implement one. Schemes other than `http`, `https` (or the empty scheme) might be used with implementation specific semantics."
11
+ },
12
+ {
13
+ "name": "value",
14
+ "type": "bytes"
15
+ }
16
+ ],
17
+ "doc": "if (any.is(Foo.class)) { foo = any.unpack(Foo.class); } or ... if (any.isSameTypeAs(Foo.getDefaultInstance())) { foo = any.unpack(Foo.getDefaultInstance()); } Example 3: Pack and unpack a message in Python. foo = Foo(...) any = Any() any.Pack(foo) ... if any.Is(Foo.DESCRIPTOR): any.Unpack(foo) ... Example 4: Pack and unpack a message in Go foo := &pb.Foo{...} any, err := anypb.New(foo) if err != nil { ... } ... foo := &pb.Foo{} if err := any.UnmarshalTo(foo); err != nil { ... } The pack methods provided by protobuf library will by default use 'type.googleapis.com/full.type.name' as the type URL and the unpack methods only use the fully qualified type name after the last '/' in the type URL, for example \"foo.bar.com/x/y.z\" will yield type name \"y.z\". JSON ==== The JSON representation of an `Any` value uses the regular representation of the deserialized, embedded message, with an additional field `@type` which contains the type URL. Example: package google.profile; message Person { string first_name = 1; string last_name = 2; } { \"@type\": \"type.googleapis.com/google.profile.Person\", \"firstName\": <string>, \"lastName\": <string> } If the embedded message type is well-known and has a custom JSON representation, that representation will be embedded adding a field `value` which holds the custom JSON in addition to the `@type` field. Example (for message [google.protobuf.Duration][]): { \"@type\": \"type.googleapis.com/google.protobuf.Duration\", \"value\": \"1.212s\" }"
18
+ }
19
+ ]
@@ -0,0 +1,106 @@
1
+ [
2
+ {
3
+ "type": "record",
4
+ "name": "Method",
5
+ "namespace": "google.protobuf",
6
+ "fields": [
7
+ {
8
+ "name": "name",
9
+ "type": "string",
10
+ "doc": "The simple name of this method."
11
+ },
12
+ {
13
+ "name": "request_type_url",
14
+ "type": "string",
15
+ "doc": "A URL of the input message type."
16
+ },
17
+ {
18
+ "name": "request_streaming",
19
+ "type": "boolean",
20
+ "doc": "If true, the request is streamed."
21
+ },
22
+ {
23
+ "name": "response_type_url",
24
+ "type": "string",
25
+ "doc": "The URL of the output message type."
26
+ },
27
+ {
28
+ "name": "response_streaming",
29
+ "type": "boolean",
30
+ "doc": "If true, the response is streamed."
31
+ },
32
+ {
33
+ "name": "options",
34
+ "type": { "type": "array", "items": "Option" },
35
+ "doc": "Any metadata attached to the method."
36
+ },
37
+ {
38
+ "name": "syntax",
39
+ "type": "Syntax",
40
+ "doc": "The source syntax of this method."
41
+ }
42
+ ],
43
+ "doc": "Method represents a method of an API interface."
44
+ },
45
+ {
46
+ "type": "record",
47
+ "name": "Mixin",
48
+ "namespace": "google.protobuf",
49
+ "fields": [
50
+ {
51
+ "name": "name",
52
+ "type": "string",
53
+ "doc": "The fully qualified name of the interface which is included."
54
+ },
55
+ {
56
+ "name": "root",
57
+ "type": "string",
58
+ "doc": "If non-empty specifies a path under which inherited HTTP paths are rooted."
59
+ }
60
+ ],
61
+ "doc": "Declares an API Interface to be included in this interface. The including interface must redeclare all the methods from the included interface, but documentation and options are inherited as follows: - If after comment and whitespace stripping, the documentation string of the redeclared method is empty, it will be inherited from the original method. - Each annotation belonging to the service config (http, visibility) which is not set in the redeclared method will be inherited. - If an http annotation is inherited, the path pattern will be modified as follows. Any version prefix will be replaced by the version of the including interface plus the [root][] path if specified. Example of a simple mixin: package google.acl.v1; service AccessControl { Get the underlying ACL object. rpc GetAcl(GetAclRequest) returns (Acl) { option (google.api.http).get = \"/v1/{resource=**}:getAcl\"; } } package google.storage.v2; service Storage { rpc GetAcl(GetAclRequest) returns (Acl); Get a data record. rpc GetData(GetDataRequest) returns (Data) { option (google.api.http).get = \"/v2/{resource=**}\"; } } Example of a mixin configuration: apis: - name: google.storage.v2.Storage mixins: - name: google.acl.v1.AccessControl The mixin construct implies that all methods in `AccessControl` are also declared with same name and request/response types in `Storage`. A documentation generator or annotation processor will see the effective `Storage.GetAcl` method after inherting documentation and annotations as follows: service Storage { Get the underlying ACL object. rpc GetAcl(GetAclRequest) returns (Acl) { option (google.api.http).get = \"/v2/{resource=**}:getAcl\"; } ... } Note how the version in the path pattern changed from `v1` to `v2`. If the `root` field in the mixin is specified, it should be a relative path under which inherited HTTP paths are placed. 
Example: apis: - name: google.storage.v2.Storage mixins: - name: google.acl.v1.AccessControl root: acls This implies the following inherited HTTP annotation: service Storage { Get the underlying ACL object. rpc GetAcl(GetAclRequest) returns (Acl) { option (google.api.http).get = \"/v2/acls/{resource=**}:getAcl\"; } ... }"
62
+ },
63
+ {
64
+ "type": "record",
65
+ "name": "Api",
66
+ "namespace": "google.protobuf",
67
+ "fields": [
68
+ {
69
+ "name": "name",
70
+ "type": "string",
71
+ "doc": "The fully qualified name of this interface, including package name followed by the interface's simple name."
72
+ },
73
+ {
74
+ "name": "methods",
75
+ "type": { "type": "array", "items": "Method" },
76
+ "doc": "The methods of this interface, in unspecified order."
77
+ },
78
+ {
79
+ "name": "options",
80
+ "type": { "type": "array", "items": "Option" },
81
+ "doc": "Any metadata attached to the interface."
82
+ },
83
+ {
84
+ "name": "version",
85
+ "type": "string",
86
+ "doc": "A version string for this interface. If specified, must have the form `major-version.minor-version`, as in `1.10`. If the minor version is omitted, it defaults to zero. If the entire version field is empty, the major version is derived from the package name, as outlined below. If the field is not empty, the version in the package name will be verified to be consistent with what is provided here. The versioning schema uses [semantic versioning](http:semver.org) where the major version number indicates a breaking change and the minor version an additive, non-breaking change. Both version numbers are signals to users what to expect from different versions, and should be carefully chosen based on the product plan. The major version is also reflected in the package name of the interface, which must end in `v<major-version>`, as in `google.feature.v1`. For major versions 0 and 1, the suffix can be omitted. Zero major versions must only be used for experimental, non-GA interfaces."
87
+ },
88
+ {
89
+ "name": "source_context",
90
+ "type": "SourceContext",
91
+ "doc": "Source context for the protocol buffer service represented by this message."
92
+ },
93
+ {
94
+ "name": "mixins",
95
+ "type": { "type": "array", "items": "Mixin" },
96
+ "doc": "Included interfaces. See [Mixin][]."
97
+ },
98
+ {
99
+ "name": "syntax",
100
+ "type": "Syntax",
101
+ "doc": "The source syntax of the service."
102
+ }
103
+ ],
104
+ "doc": "Api is a light-weight descriptor for an API Interface. Interfaces are also described as \"protocol buffer services\" in some contexts, such as by the \"service\" keyword in a .proto file, but they are different from API Services, which represent a concrete implementation of an interface as opposed to simply a description of methods and bindings. They are also sometimes simply referred to as \"APIs\" in other contexts, such as the name of this message itself. See https:cloud.google.com/apis/design/glossary for detailed terminology."
105
+ }
106
+ ]
@@ -0,0 +1,20 @@
1
+ [
2
+ {
3
+ "type": "record",
4
+ "name": "Duration",
5
+ "namespace": "google.protobuf",
6
+ "fields": [
7
+ {
8
+ "name": "seconds",
9
+ "type": "long",
10
+ "doc": "Signed seconds of the span of time. Must be from -315,576,000,000 to +315,576,000,000 inclusive. Note: these bounds are computed from: 60 sec/min * 60 min/hr * 24 hr/day * 365.25 days/year * 10000 years"
11
+ },
12
+ {
13
+ "name": "nanos",
14
+ "type": "int",
15
+ "doc": "of time. Durations less than one second are represented with a 0 `seconds` field and a positive or negative `nanos` field. For durations of one second or more, a non-zero value for the `nanos` field must be of the same sign as the `seconds` field. Must be from -999,999,999 to +999,999,999 inclusive."
16
+ }
17
+ ],
18
+ "doc": "end.seconds -= 1; end.nanos += 1000000000; } else if (end.nanos >= 1000000000) { end.seconds += 1; end.nanos -= 1000000000; } Example 3: Compute Duration from datetime.timedelta in Python. td = datetime.timedelta(days=3, minutes=10) duration = Duration() duration.FromTimedelta(td) # JSON Mapping In JSON format, the Duration type is encoded as a string rather than an object, where the string ends in the suffix \"s\" (indicating seconds) and is preceded by the number of seconds, with nanoseconds expressed as fractional seconds. For example, 3 seconds with 0 nanoseconds should be encoded in JSON format as \"3s\", while 3 seconds and 1 nanosecond should be expressed in JSON format as \"3.000000001s\", and 3 seconds and 1 microsecond should be expressed in JSON format as \"3.000001s\"."
19
+ }
20
+ ]
@@ -0,0 +1,18 @@
1
+ [
2
+ {
3
+ "type": "record",
4
+ "name": "FieldMask",
5
+ "namespace": "google.protobuf",
6
+ "fields": [
7
+ {
8
+ "name": "paths",
9
+ "type": {
10
+ "type": "array",
11
+ "items": "string"
12
+ },
13
+ "doc": "The set of field mask paths."
14
+ }
15
+ ],
16
+ "doc": "be appended to the existing repeated field in the target resource. Note that a repeated field is only allowed in the last position of a `paths` string. If a sub-message is specified in the last position of the field mask for an update operation, then new value will be merged into the existing sub-message in the target resource. For example, given the target message: f { b { d: 1 x: 2 } c: [1] } And an update message: f { b { d: 10 } c: [2] } then if the field mask is: paths: [\"f.b\", \"f.c\"] then the result will be: f { b { d: 10 x: 2 } c: [1, 2] } An implementation may provide options to override this default behavior for repeated and message fields. In order to reset a field's value to the default, the field must be in the mask and set to the default value in the provided resource. Hence, in order to reset all fields of a resource, provide a default instance of the resource and set all fields in the mask, or do not provide a mask as described below. If a field mask is not present on update, the operation applies to all fields (as if a field mask of all fields has been specified). Note that in the presence of schema evolution, this may mean that fields the client does not know and has therefore not filled into the request will be reset to their default. If this is unwanted behavior, a specific service may require a client to always specify a field mask, producing an error if not. As with get operations, the location of the resource which describes the updated values in the request message depends on the operation kind. In any case, the effect of the field mask is required to be honored by the API. ## Considerations for HTTP REST The HTTP kind of an update operation which uses a field mask must be set to PATCH instead of PUT in order to satisfy HTTP semantics (PUT must only be used for full updates). # JSON Encoding of Field Masks In JSON, a field mask is encoded as a single string where paths are separated by a comma. 
Fields name in each path are converted to/from lower-camel naming conventions. As an example, consider the following message declarations: message Profile { User user = 1; Photo photo = 2; } message User { string display_name = 1; string address = 2; } In proto a field mask for `Profile` may look as such: mask { paths: \"user.display_name\" paths: \"photo\" } In JSON, the same mask is represented as below: { mask: \"user.displayName,photo\" } # Field Masks and Oneof Fields Field masks treat fields in oneofs just as regular fields. Consider the following message: message SampleMessage { oneof test_oneof { string name = 4; SubMessage sub_message = 9; } } The field mask can be: mask { paths: \"name\" } Or: mask { paths: \"sub_message\" } Note that oneof type names (\"test_oneof\" in this case) cannot be used in paths. ## Field Mask Verification The implementation of any API method which has a FieldMask type field in the request should verify the included field paths, and return an `INVALID_ARGUMENT` error if any path is unmappable."
17
+ }
18
+ ]
@@ -0,0 +1,60 @@
1
+ [
2
+ {
3
+ "type": "record",
4
+ "name": "Struct",
5
+ "namespace": "google.protobuf",
6
+ "fields": [
7
+ {
8
+ "name": "fields",
9
+ "type": {
10
+ "type": "map",
11
+ "values": {
12
+ "type": "record",
13
+ "name": "Value",
14
+ "namespace": "google.protobuf",
15
+ "fields": [
16
+ {
17
+ "name": "kind",
18
+ "type": [
19
+ {
20
+ "name": "NullValue",
21
+ "type": "enum",
22
+ "namespace": "google.protobuf",
23
+ "symbols": [
24
+ "NULL_VALUE"
25
+ ],
26
+ "doc": "`NullValue` is a singleton enumeration to represent the null value for the `Value` type union. The JSON representation for `NullValue` is JSON `null`."
27
+ },
28
+ "double",
29
+ "string",
30
+ "boolean",
31
+ "Struct",
32
+ {
33
+ "type": "record",
34
+ "name": "ListValue",
35
+ "namespace": "google.protobuf",
36
+ "fields": [
37
+ {
38
+ "name": "values",
39
+ "type": {
40
+ "type": "array",
41
+ "items": "Value"
42
+ },
43
+ "doc": "Repeated field of dynamically typed values."
44
+ }
45
+ ],
46
+ "doc": "`ListValue` is a wrapper around a repeated field of values. The JSON representation for `ListValue` is JSON array."
47
+ }
48
+ ],
49
+ "doc": "The kind of value."
50
+ }
51
+ ],
52
+ "doc": "`Value` represents a dynamically typed value which can be either null, a number, a string, a boolean, a recursive struct value, or a list of values. A producer of value is expected to set one of these variants. Absence of any variant indicates an error. The JSON representation for `Value` is JSON value."
53
+ }
54
+ },
55
+ "doc": "Unordered map of dynamically typed values."
56
+ }
57
+ ],
58
+ "doc": "scripting languages like JS a struct is represented as an object. The details of that representation are described together with the proto support for the language. The JSON representation for `Struct` is JSON object."
59
+ }
60
+ ]
@@ -0,0 +1,20 @@
1
+ [
2
+ {
3
+ "type": "record",
4
+ "name": "Timestamp",
5
+ "namespace": "google.protobuf",
6
+ "fields": [
7
+ {
8
+ "name": "seconds",
9
+ "type": "long",
10
+ "doc": "Represents seconds of UTC time since Unix epoch 1970-01-01T00:00:00Z. Must be from 0001-01-01T00:00:00Z to 9999-12-31T23:59:59Z inclusive."
11
+ },
12
+ {
13
+ "name": "nanos",
14
+ "type": "int",
15
+ "doc": "second values with fractions must still have non-negative nanos values that count forward in time. Must be from 0 to 999,999,999 inclusive."
16
+ }
17
+ ],
18
+ "doc": "the Joda Time's [`ISODateTimeFormat.dateTime()`]( http://joda-time.sourceforge.net/apidocs/org/joda/time/format/ISODateTimeFormat.html#dateTime() ) to obtain a formatter capable of generating timestamps in this format."
19
+ }
20
+ ]