structurize 2.16.2__py3-none-any.whl → 2.16.5__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. avrotize/__init__.py +63 -63
  2. avrotize/__main__.py +5 -5
  3. avrotize/_version.py +34 -34
  4. avrotize/asn1toavro.py +160 -160
  5. avrotize/avrotize.py +152 -152
  6. avrotize/avrotocpp.py +483 -483
  7. avrotize/avrotocsharp.py +992 -992
  8. avrotize/avrotocsv.py +121 -121
  9. avrotize/avrotodatapackage.py +173 -173
  10. avrotize/avrotodb.py +1383 -1383
  11. avrotize/avrotogo.py +476 -476
  12. avrotize/avrotographql.py +197 -197
  13. avrotize/avrotoiceberg.py +210 -210
  14. avrotize/avrotojava.py +1023 -1023
  15. avrotize/avrotojs.py +250 -250
  16. avrotize/avrotojsons.py +481 -481
  17. avrotize/avrotojstruct.py +345 -345
  18. avrotize/avrotokusto.py +363 -363
  19. avrotize/avrotomd.py +137 -137
  20. avrotize/avrotools.py +168 -168
  21. avrotize/avrotoparquet.py +208 -208
  22. avrotize/avrotoproto.py +358 -358
  23. avrotize/avrotopython.py +622 -622
  24. avrotize/avrotorust.py +435 -435
  25. avrotize/avrotots.py +598 -598
  26. avrotize/avrotoxsd.py +344 -344
  27. avrotize/commands.json +2493 -2433
  28. avrotize/common.py +828 -828
  29. avrotize/constants.py +4 -4
  30. avrotize/csvtoavro.py +131 -131
  31. avrotize/datapackagetoavro.py +76 -76
  32. avrotize/dependency_resolver.py +348 -348
  33. avrotize/jsonstoavro.py +1698 -1698
  34. avrotize/jsonstostructure.py +2642 -2642
  35. avrotize/jstructtoavro.py +878 -878
  36. avrotize/kstructtoavro.py +93 -93
  37. avrotize/kustotoavro.py +455 -455
  38. avrotize/parquettoavro.py +157 -157
  39. avrotize/proto2parser.py +497 -497
  40. avrotize/proto3parser.py +402 -402
  41. avrotize/prototoavro.py +382 -382
  42. avrotize/structuretocsharp.py +2005 -2005
  43. avrotize/structuretojsons.py +498 -498
  44. avrotize/structuretopython.py +772 -772
  45. avrotize/structuretots.py +653 -0
  46. avrotize/xsdtoavro.py +413 -413
  47. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/METADATA +848 -805
  48. structurize-2.16.5.dist-info/RECORD +52 -0
  49. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/licenses/LICENSE +200 -200
  50. structurize-2.16.2.dist-info/RECORD +0 -51
  51. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/WHEEL +0 -0
  52. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/entry_points.txt +0 -0
  53. {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/top_level.txt +0 -0
avrotize/avrotoiceberg.py CHANGED
@@ -1,210 +1,210 @@
-"""Convert an Avro schema to an Iceberg schema."""
-
-import json
-import sys
-from typing import Dict, List
-import pyarrow as pa
-from pyiceberg.schema import Schema, NestedField
-from pyiceberg.types import (
-    BooleanType,
-    IntegerType,
-    LongType,
-    FloatType,
-    DoubleType,
-    StringType,
-    BinaryType,
-    DateType,
-    TimestampType,
-    DecimalType,
-    FixedType,
-    ListType,
-    MapType,
-    StructType
-)
-from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
-
-JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
-
-
-class AvroToIcebergConverter:
-    """Class to convert Avro schema to Iceberg schema."""
-
-    def __init__(self: 'AvroToIcebergConverter'):
-        self.named_type_cache: Dict[str, JsonNode] = {}
-        self.id_counter = 0
-
-    def get_id(self) -> int:
-        """Get a unique ID for a record type."""
-        self.id_counter += 1
-        return self.id_counter
-
-    def get_fullname(self, namespace: str, name: str) -> str:
-        """Get the full name of a record type."""
-        return f"{namespace}.{name}" if namespace else name
-
-    def convert_avro_to_iceberg(self, avro_schema_path: str, avro_record_type: str, output_path: str, emit_cloudevents_columns: bool=False):
-        """Convert an Avro schema to an Iceberg schema."""
-        schema_file = avro_schema_path
-        if not schema_file:
-            print("Please specify the avro schema file")
-            sys.exit(1)
-        with open(schema_file, "r", encoding="utf-8") as f:
-            schema_json = f.read()
-
-        # Parse the schema as a JSON object
-        schema = json.loads(schema_json)
-        self.cache_named_types(schema)
-
-        if isinstance(schema, list) and avro_record_type:
-            schema = next(
-                (x for x in schema if x["name"] == avro_record_type or x["namespace"] + "." + x["name"] == avro_record_type), None)
-            if schema is None:
-                print(
-                    f"No top-level record type {avro_record_type} found in the Avro schema")
-                sys.exit(1)
-        elif not isinstance(schema, dict):
-            print(
-                "Expected a single Avro schema as a JSON object, or a list of schema records")
-            sys.exit(1)
-
-        # Get the name and fields of the top-level record
-        table_name = schema["name"]
-        fields = schema["fields"]
-
-        # Create a list to store the iceberg schema
-        iceberg_fields: List[NestedField] = []
-
-        # Append the iceberg schema with the column names and types
-        for i, field in enumerate(fields):
-            column_name = field["name"]
-            column_type = self.convert_avro_type_to_iceberg_type(field["type"])
-            iceberg_fields.append(
-                NestedField(field_id=self.get_id(), name=column_name, type=column_type))
-
-        if emit_cloudevents_columns:
-            iceberg_fields.extend([
-                NestedField(field_id=self.get_id(),
-                            name="___type", type=StringType()),
-                NestedField(field_id=self.get_id(),
-                            name="___source", type=StringType()),
-                NestedField(field_id=self.get_id(),
-                            name="___id", type=StringType()),
-                NestedField(field_id=self.get_id(),
-                            name="___time", type=TimestampType()),
-                NestedField(field_id=self.get_id(),
-                            name="___subject", type=StringType())
-            ])
-
-        iceberg_schema = Schema(*iceberg_fields)
-        arrow_schema = schema_to_pyarrow(iceberg_schema)
-        print(f"Iceberg schema created: {arrow_schema}")
-
-        # Write to Iceberg table (for demonstration, using local file system)
-        file_io = PyArrowFileIO()
-        output_file = file_io.new_output("file://"+output_path)
-        with output_file.create(overwrite=True) as f:
-            pa.output_stream(f).write(arrow_schema.serialize().to_pybytes())
-
-    def convert_avro_type_to_iceberg_type(self, avro_type):
-        """Convert an Avro type to an Iceberg type."""
-        if isinstance(avro_type, list):
-            item_count = len(avro_type)
-            if item_count == 1:
-                return self.convert_avro_type_to_iceberg_type(avro_type[0])
-            elif item_count == 2:
-                first, second = avro_type[0], avro_type[1]
-                if first == "null":
-                    return self.convert_avro_type_to_iceberg_type(second)
-                elif second == "null":
-                    return self.convert_avro_type_to_iceberg_type(first)
-                else:
-                    return StructType(fields=[NestedField(field_id=self.get_id(), name=f'field_{i}', type=self.convert_avro_type_to_iceberg_type(t)) for i, t in enumerate(avro_type)])
-            elif item_count > 0:
-                return StructType(fields=[NestedField(field_id=self.get_id(), name=f'field_{i}', type=self.convert_avro_type_to_iceberg_type(t)) for i, t in enumerate(avro_type)])
-            else:
-                print(f"WARNING: Empty union type {avro_type}")
-                return StringType()
-        elif isinstance(avro_type, dict):
-            type_name = avro_type.get("type")
-            if type_name == "array":
-                return ListType(element_id=self.get_id(), element=self.convert_avro_type_to_iceberg_type(avro_type.get("items")))
-            elif type_name == "map":
-                return MapType(key_id=self.get_id(), key_type=StringType(), value_id=self.get_id(), value_type=self.convert_avro_type_to_iceberg_type(avro_type.get("values")))
-            elif type_name == "record":
-                fields = avro_type.get("fields")
-                return StructType(fields=[NestedField(field_id=self.get_id(), name=field["name"], type=self.convert_avro_type_to_iceberg_type(field["type"])) for i, field in enumerate(fields)])
-            if type_name == "enum":
-                return StringType()
-            elif type_name == "fixed":
-                return FixedType(avro_type.get("size"))
-            elif type_name == "string":
-                logical_type = avro_type.get("logicalType")
-                if logical_type == "uuid":
-                    return StringType()
-                return StringType()
-            elif type_name == "bytes":
-                logical_type = avro_type.get("logicalType")
-                if logical_type == "decimal":
-                    return DecimalType(38, 18)
-                return BinaryType()
-            elif type_name == "long":
-                logical_type = avro_type.get("logicalType")
-                if logical_type in ["timestamp-millis", "timestamp-micros"]:
-                    return TimestampType()
-                if logical_type in ["time-millis", "time-micros"]:
-                    return LongType()
-                return LongType()
-            elif type_name == "int":
-                logical_type = avro_type.get("logicalType")
-                if logical_type == "date":
-                    return DateType()
-                return IntegerType()
-            else:
-                return self.map_iceberg_scalar_type(type_name)
-        elif isinstance(avro_type, str):
-            if avro_type in self.named_type_cache:
-                return self.convert_avro_type_to_iceberg_type(self.named_type_cache[avro_type])
-            return self.map_iceberg_scalar_type(avro_type)
-
-        return StringType()
-
-    def cache_named_types(self, avro_type):
-        """Add an encountered type to the list of types."""
-        if isinstance(avro_type, list):
-            for item in avro_type:
-                self.cache_named_types(item)
-        if isinstance(avro_type, dict) and avro_type.get("name"):
-            self.named_type_cache[self.get_fullname(avro_type.get(
-                "namespace"), avro_type.get("name"))] = avro_type
-            if "fields" in avro_type:
-                for field in avro_type.get("fields"):
-                    if "type" in field:
-                        self.cache_named_types(field.get("type"))
-
-    def map_iceberg_scalar_type(self, type_name: str):
-        """Map an Avro scalar type to an Iceberg scalar type."""
-        if type_name == "null":
-            return StringType()
-        elif type_name == "int":
-            return IntegerType()
-        elif type_name == "long":
-            return LongType()
-        elif type_name == "float":
-            return FloatType()
-        elif type_name == "double":
-            return DoubleType()
-        elif type_name == "boolean":
-            return BooleanType()
-        elif type_name == "bytes":
-            return BinaryType()
-        elif type_name == "string":
-            return StringType()
-        else:
-            return StringType()
-
-
-def convert_avro_to_iceberg(avro_schema_path, avro_record_type, output_path, emit_cloudevents_columns=False):
-    """Convert an Avro schema to an Iceberg schema."""
-    converter = AvroToIcebergConverter()
-    converter.convert_avro_to_iceberg(
-        avro_schema_path, avro_record_type, output_path, emit_cloudevents_columns)
+"""Convert an Avro schema to an Iceberg schema."""
+
+import json
+import sys
+from typing import Dict, List
+import pyarrow as pa
+from pyiceberg.schema import Schema, NestedField
+from pyiceberg.types import (
+    BooleanType,
+    IntegerType,
+    LongType,
+    FloatType,
+    DoubleType,
+    StringType,
+    BinaryType,
+    DateType,
+    TimestampType,
+    DecimalType,
+    FixedType,
+    ListType,
+    MapType,
+    StructType
+)
+from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
+
+JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
+
+
+class AvroToIcebergConverter:
+    """Class to convert Avro schema to Iceberg schema."""
+
+    def __init__(self: 'AvroToIcebergConverter'):
+        self.named_type_cache: Dict[str, JsonNode] = {}
+        self.id_counter = 0
+
+    def get_id(self) -> int:
+        """Get a unique ID for a record type."""
+        self.id_counter += 1
+        return self.id_counter
+
+    def get_fullname(self, namespace: str, name: str) -> str:
+        """Get the full name of a record type."""
+        return f"{namespace}.{name}" if namespace else name
+
+    def convert_avro_to_iceberg(self, avro_schema_path: str, avro_record_type: str, output_path: str, emit_cloudevents_columns: bool=False):
+        """Convert an Avro schema to an Iceberg schema."""
+        schema_file = avro_schema_path
+        if not schema_file:
+            print("Please specify the avro schema file")
+            sys.exit(1)
+        with open(schema_file, "r", encoding="utf-8") as f:
+            schema_json = f.read()
+
+        # Parse the schema as a JSON object
+        schema = json.loads(schema_json)
+        self.cache_named_types(schema)
+
+        if isinstance(schema, list) and avro_record_type:
+            schema = next(
+                (x for x in schema if x["name"] == avro_record_type or x["namespace"] + "." + x["name"] == avro_record_type), None)
+            if schema is None:
+                print(
+                    f"No top-level record type {avro_record_type} found in the Avro schema")
+                sys.exit(1)
+        elif not isinstance(schema, dict):
+            print(
+                "Expected a single Avro schema as a JSON object, or a list of schema records")
+            sys.exit(1)
+
+        # Get the name and fields of the top-level record
+        table_name = schema["name"]
+        fields = schema["fields"]
+
+        # Create a list to store the iceberg schema
+        iceberg_fields: List[NestedField] = []
+
+        # Append the iceberg schema with the column names and types
+        for i, field in enumerate(fields):
+            column_name = field["name"]
+            column_type = self.convert_avro_type_to_iceberg_type(field["type"])
+            iceberg_fields.append(
+                NestedField(field_id=self.get_id(), name=column_name, type=column_type))
+
+        if emit_cloudevents_columns:
+            iceberg_fields.extend([
+                NestedField(field_id=self.get_id(),
+                            name="___type", type=StringType()),
+                NestedField(field_id=self.get_id(),
+                            name="___source", type=StringType()),
+                NestedField(field_id=self.get_id(),
+                            name="___id", type=StringType()),
+                NestedField(field_id=self.get_id(),
+                            name="___time", type=TimestampType()),
+                NestedField(field_id=self.get_id(),
+                            name="___subject", type=StringType())
+            ])
+
+        iceberg_schema = Schema(*iceberg_fields)
+        arrow_schema = schema_to_pyarrow(iceberg_schema)
+        print(f"Iceberg schema created: {arrow_schema}")
+
+        # Write to Iceberg table (for demonstration, using local file system)
+        file_io = PyArrowFileIO()
+        output_file = file_io.new_output("file://"+output_path)
+        with output_file.create(overwrite=True) as f:
+            pa.output_stream(f).write(arrow_schema.serialize().to_pybytes())
+
+    def convert_avro_type_to_iceberg_type(self, avro_type):
+        """Convert an Avro type to an Iceberg type."""
+        if isinstance(avro_type, list):
+            item_count = len(avro_type)
+            if item_count == 1:
+                return self.convert_avro_type_to_iceberg_type(avro_type[0])
+            elif item_count == 2:
+                first, second = avro_type[0], avro_type[1]
+                if first == "null":
+                    return self.convert_avro_type_to_iceberg_type(second)
+                elif second == "null":
+                    return self.convert_avro_type_to_iceberg_type(first)
+                else:
+                    return StructType(fields=[NestedField(field_id=self.get_id(), name=f'field_{i}', type=self.convert_avro_type_to_iceberg_type(t)) for i, t in enumerate(avro_type)])
+            elif item_count > 0:
+                return StructType(fields=[NestedField(field_id=self.get_id(), name=f'field_{i}', type=self.convert_avro_type_to_iceberg_type(t)) for i, t in enumerate(avro_type)])
+            else:
+                print(f"WARNING: Empty union type {avro_type}")
+                return StringType()
+        elif isinstance(avro_type, dict):
+            type_name = avro_type.get("type")
+            if type_name == "array":
+                return ListType(element_id=self.get_id(), element=self.convert_avro_type_to_iceberg_type(avro_type.get("items")))
+            elif type_name == "map":
+                return MapType(key_id=self.get_id(), key_type=StringType(), value_id=self.get_id(), value_type=self.convert_avro_type_to_iceberg_type(avro_type.get("values")))
+            elif type_name == "record":
+                fields = avro_type.get("fields")
+                return StructType(fields=[NestedField(field_id=self.get_id(), name=field["name"], type=self.convert_avro_type_to_iceberg_type(field["type"])) for i, field in enumerate(fields)])
+            if type_name == "enum":
+                return StringType()
+            elif type_name == "fixed":
+                return FixedType(avro_type.get("size"))
+            elif type_name == "string":
+                logical_type = avro_type.get("logicalType")
+                if logical_type == "uuid":
+                    return StringType()
+                return StringType()
+            elif type_name == "bytes":
+                logical_type = avro_type.get("logicalType")
+                if logical_type == "decimal":
+                    return DecimalType(38, 18)
+                return BinaryType()
+            elif type_name == "long":
+                logical_type = avro_type.get("logicalType")
+                if logical_type in ["timestamp-millis", "timestamp-micros"]:
+                    return TimestampType()
+                if logical_type in ["time-millis", "time-micros"]:
+                    return LongType()
+                return LongType()
+            elif type_name == "int":
+                logical_type = avro_type.get("logicalType")
+                if logical_type == "date":
+                    return DateType()
+                return IntegerType()
+            else:
+                return self.map_iceberg_scalar_type(type_name)
+        elif isinstance(avro_type, str):
+            if avro_type in self.named_type_cache:
+                return self.convert_avro_type_to_iceberg_type(self.named_type_cache[avro_type])
+            return self.map_iceberg_scalar_type(avro_type)
+
+        return StringType()
+
+    def cache_named_types(self, avro_type):
+        """Add an encountered type to the list of types."""
+        if isinstance(avro_type, list):
+            for item in avro_type:
+                self.cache_named_types(item)
+        if isinstance(avro_type, dict) and avro_type.get("name"):
+            self.named_type_cache[self.get_fullname(avro_type.get(
+                "namespace"), avro_type.get("name"))] = avro_type
+            if "fields" in avro_type:
+                for field in avro_type.get("fields"):
+                    if "type" in field:
+                        self.cache_named_types(field.get("type"))
+
+    def map_iceberg_scalar_type(self, type_name: str):
+        """Map an Avro scalar type to an Iceberg scalar type."""
+        if type_name == "null":
+            return StringType()
+        elif type_name == "int":
+            return IntegerType()
+        elif type_name == "long":
+            return LongType()
+        elif type_name == "float":
+            return FloatType()
+        elif type_name == "double":
+            return DoubleType()
+        elif type_name == "boolean":
+            return BooleanType()
+        elif type_name == "bytes":
+            return BinaryType()
+        elif type_name == "string":
+            return StringType()
+        else:
+            return StringType()
+
+
+def convert_avro_to_iceberg(avro_schema_path, avro_record_type, output_path, emit_cloudevents_columns=False):
+    """Convert an Avro schema to an Iceberg schema."""
+    converter = AvroToIcebergConverter()
+    converter.convert_avro_to_iceberg(
+        avro_schema_path, avro_record_type, output_path, emit_cloudevents_columns)
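
For readers who want to try the converter shown in the diff above, here is a minimal usage sketch. It assumes the wheel exposes the module as avrotize.avrotoiceberg (matching the file path in the listing) and that pyiceberg and pyarrow are installed; the Order record, its fields, and the temporary paths are made up for illustration and are not taken from the package.

# Minimal usage sketch (illustrative only): calls the module-level
# convert_avro_to_iceberg() entry point from the diff above.
import json
import tempfile
from pathlib import Path

from avrotize.avrotoiceberg import convert_avro_to_iceberg  # assumed import path

# A small, hypothetical Avro record that exercises the scalar, logical-type
# and nullable-union mappings in convert_avro_type_to_iceberg_type().
avro_schema = {
    "type": "record",
    "name": "Order",
    "namespace": "example.shop",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "quantity", "type": "int"},
        {"name": "price", "type": {"type": "bytes", "logicalType": "decimal",
                                   "precision": 38, "scale": 18}},
        {"name": "created", "type": {"type": "long",
                                     "logicalType": "timestamp-millis"}},
        {"name": "note", "type": ["null", "string"]},
    ],
}

with tempfile.TemporaryDirectory() as tmp:
    schema_path = Path(tmp) / "order.avsc"
    schema_path.write_text(json.dumps(avro_schema), encoding="utf-8")
    output_path = Path(tmp) / "order.iceberg.arrow"

    # Builds the Iceberg schema, prints its PyArrow form, and writes the
    # serialized PyArrow schema (not table data) to output_path.
    convert_avro_to_iceberg(str(schema_path), "example.shop.Order",
                            str(output_path), emit_cloudevents_columns=True)
    print(f"{output_path.stat().st_size} bytes written")

Note that the converter assigns field IDs from a simple per-instance counter (get_id) and, as its own comment says, writes the serialized PyArrow schema derived from the Iceberg schema for demonstration rather than creating an Iceberg table.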