structurize 2.16.2__py3-none-any.whl → 2.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -63
- avrotize/__main__.py +5 -5
- avrotize/_version.py +34 -34
- avrotize/asn1toavro.py +160 -160
- avrotize/avrotize.py +152 -152
- avrotize/avrotocpp.py +483 -483
- avrotize/avrotocsharp.py +992 -992
- avrotize/avrotocsv.py +121 -121
- avrotize/avrotodatapackage.py +173 -173
- avrotize/avrotodb.py +1383 -1383
- avrotize/avrotogo.py +476 -476
- avrotize/avrotographql.py +197 -197
- avrotize/avrotoiceberg.py +210 -210
- avrotize/avrotojava.py +1023 -1023
- avrotize/avrotojs.py +250 -250
- avrotize/avrotojsons.py +481 -481
- avrotize/avrotojstruct.py +345 -345
- avrotize/avrotokusto.py +363 -363
- avrotize/avrotomd.py +137 -137
- avrotize/avrotools.py +168 -168
- avrotize/avrotoparquet.py +208 -208
- avrotize/avrotoproto.py +358 -358
- avrotize/avrotopython.py +622 -622
- avrotize/avrotorust.py +435 -435
- avrotize/avrotots.py +598 -598
- avrotize/avrotoxsd.py +344 -344
- avrotize/commands.json +2493 -2433
- avrotize/common.py +828 -828
- avrotize/constants.py +4 -4
- avrotize/csvtoavro.py +131 -131
- avrotize/datapackagetoavro.py +76 -76
- avrotize/dependency_resolver.py +348 -348
- avrotize/jsonstoavro.py +1698 -1698
- avrotize/jsonstostructure.py +2642 -2642
- avrotize/jstructtoavro.py +878 -878
- avrotize/kstructtoavro.py +93 -93
- avrotize/kustotoavro.py +455 -455
- avrotize/parquettoavro.py +157 -157
- avrotize/proto2parser.py +497 -497
- avrotize/proto3parser.py +402 -402
- avrotize/prototoavro.py +382 -382
- avrotize/structuretocsharp.py +2005 -2005
- avrotize/structuretojsons.py +498 -498
- avrotize/structuretopython.py +772 -772
- avrotize/structuretots.py +653 -0
- avrotize/xsdtoavro.py +413 -413
- structurize-2.16.6.dist-info/METADATA +107 -0
- structurize-2.16.6.dist-info/RECORD +52 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/licenses/LICENSE +200 -200
- structurize-2.16.2.dist-info/METADATA +0 -805
- structurize-2.16.2.dist-info/RECORD +0 -51
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/WHEEL +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/top_level.txt +0 -0
avrotize/avrotoparquet.py
CHANGED
|
@@ -1,208 +1,208 @@
|
|
|
1
|
-
""" Convert an Avro schema to a Parquet schema. """
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import sys
|
|
5
|
-
from typing import Dict, List
|
|
6
|
-
import pyarrow as pa
|
|
7
|
-
import pyarrow.parquet as pq
|
|
8
|
-
|
|
9
|
-
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class AvroToParquetConverter:
|
|
13
|
-
""" Class to convert Avro schema to Parquet schema."""
|
|
14
|
-
|
|
15
|
-
def __init__(self: 'AvroToParquetConverter'):
|
|
16
|
-
self.named_type_cache: Dict[str, JsonNode] = {}
|
|
17
|
-
|
|
18
|
-
def get_fullname(self, namespace: str, name: str) -> str:
|
|
19
|
-
""" Get the full name of a record type."""
|
|
20
|
-
return f"{namespace}.{name}" if namespace else name
|
|
21
|
-
|
|
22
|
-
def convert_avro_to_parquet(self, avro_schema_path, avro_record_type, parquet_file_path, emit_cloudevents_columns=False):
|
|
23
|
-
""" Convert an Avro schema to a Parquet schema."""
|
|
24
|
-
schema_file = avro_schema_path
|
|
25
|
-
if not schema_file:
|
|
26
|
-
print("Please specify the avro schema file")
|
|
27
|
-
sys.exit(1)
|
|
28
|
-
with open(schema_file, "r", encoding="utf-8") as f:
|
|
29
|
-
schema_json = f.read()
|
|
30
|
-
|
|
31
|
-
# Parse the schema as a JSON object
|
|
32
|
-
schema = json.loads(schema_json)
|
|
33
|
-
self.cache_named_types(schema)
|
|
34
|
-
|
|
35
|
-
if isinstance(schema, list) and avro_record_type:
|
|
36
|
-
schema = next(
|
|
37
|
-
(x for x in schema if x["name"] == avro_record_type or x["namespace"]+"."+x["name"] == avro_record_type), None)
|
|
38
|
-
if schema is None:
|
|
39
|
-
print(
|
|
40
|
-
f"No top-level record type {avro_record_type} found in the Avro schema")
|
|
41
|
-
sys.exit(1)
|
|
42
|
-
elif not isinstance(schema, dict):
|
|
43
|
-
print(
|
|
44
|
-
"Expected a single Avro schema as a JSON object, or a list of schema records")
|
|
45
|
-
sys.exit(1)
|
|
46
|
-
|
|
47
|
-
# Get the name and fields of the top-level record
|
|
48
|
-
table_name = schema["name"]
|
|
49
|
-
fields = schema["fields"]
|
|
50
|
-
|
|
51
|
-
# Create a list to store the parquet schema
|
|
52
|
-
parquet_schema = []
|
|
53
|
-
|
|
54
|
-
# Append the parquet schema with the column names and types
|
|
55
|
-
for field in fields:
|
|
56
|
-
column_name = field["name"]
|
|
57
|
-
column_type = self.convert_avro_type_to_parquet_type(field["type"])
|
|
58
|
-
parquet_schema.append((column_name, column_type))
|
|
59
|
-
|
|
60
|
-
if emit_cloudevents_columns:
|
|
61
|
-
parquet_schema.extend([
|
|
62
|
-
("___type", pa.string()),
|
|
63
|
-
("___source", pa.string()),
|
|
64
|
-
("___id", pa.string()),
|
|
65
|
-
("___time", pa.timestamp('ns')),
|
|
66
|
-
("___subject", pa.string())
|
|
67
|
-
])
|
|
68
|
-
|
|
69
|
-
# Create an empty table with the schema
|
|
70
|
-
table = pa.Table.from_batches([], schema=pa.schema(parquet_schema))
|
|
71
|
-
pq.write_table(table, parquet_file_path)
|
|
72
|
-
|
|
73
|
-
def convert_avro_type_to_parquet_type(self, avro_type):
|
|
74
|
-
""" Convert an Avro type to a Parquet type."""
|
|
75
|
-
if isinstance(avro_type, list):
|
|
76
|
-
# If the type is an array, then it is a union type. Look whether it's a pair of a scalar type and null:
|
|
77
|
-
item_count = len(avro_type)
|
|
78
|
-
if item_count == 1:
|
|
79
|
-
return self.convert_avro_type_to_parquet_type(avro_type[0])
|
|
80
|
-
elif item_count == 2:
|
|
81
|
-
first = avro_type[0]
|
|
82
|
-
second = avro_type[1]
|
|
83
|
-
if isinstance(first, str) and first == "null":
|
|
84
|
-
return self.convert_avro_type_to_parquet_type(second)
|
|
85
|
-
elif isinstance(second, str) and second == "null":
|
|
86
|
-
return self.convert_avro_type_to_parquet_type(first)
|
|
87
|
-
else:
|
|
88
|
-
struct_fields = self.map_union_fields(avro_type)
|
|
89
|
-
return pa.struct(struct_fields)
|
|
90
|
-
elif item_count > 0:
|
|
91
|
-
struct_fields = self.map_union_fields(avro_type)
|
|
92
|
-
return pa.struct(struct_fields)
|
|
93
|
-
else:
|
|
94
|
-
print(f"WARNING: Empty union type {avro_type}")
|
|
95
|
-
return pa.string()
|
|
96
|
-
elif isinstance(avro_type, dict):
|
|
97
|
-
type_name = avro_type.get("type")
|
|
98
|
-
if type_name == "array":
|
|
99
|
-
return pa.list_(self.convert_avro_type_to_parquet_type(avro_type.get("items")))
|
|
100
|
-
elif type_name == "map":
|
|
101
|
-
return pa.map_(pa.string(), self.convert_avro_type_to_parquet_type(avro_type.get("values")))
|
|
102
|
-
elif type_name == "record":
|
|
103
|
-
fields = avro_type.get("fields")
|
|
104
|
-
if len(fields) == 0:
|
|
105
|
-
print(
|
|
106
|
-
f"WARNING: No fields in record type {avro_type.get('name')}")
|
|
107
|
-
return pa.string()
|
|
108
|
-
return pa.struct({field.get("name"): self.convert_avro_type_to_parquet_type(field.get("type")) for field in fields})
|
|
109
|
-
if type_name == "enum":
|
|
110
|
-
return pa.string()
|
|
111
|
-
elif type_name == "fixed":
|
|
112
|
-
return pa.string()
|
|
113
|
-
elif type_name == "string":
|
|
114
|
-
logical_type = avro_type.get("logicalType")
|
|
115
|
-
if logical_type == "uuid":
|
|
116
|
-
return pa.string()
|
|
117
|
-
return pa.string()
|
|
118
|
-
elif type_name == "bytes":
|
|
119
|
-
logical_type = avro_type.get("logicalType")
|
|
120
|
-
if logical_type == "decimal":
|
|
121
|
-
return pa.decimal128(38, 18)
|
|
122
|
-
return pa.binary()
|
|
123
|
-
elif type_name == "long":
|
|
124
|
-
logical_type = avro_type.get("logicalType")
|
|
125
|
-
if logical_type in ["timestamp-millis", "timestamp-micros"]:
|
|
126
|
-
return pa.timestamp('ns')
|
|
127
|
-
if logical_type in ["time-millis", "time-micros"]:
|
|
128
|
-
return pa.time64('ns')
|
|
129
|
-
return pa.int64()
|
|
130
|
-
elif type_name == "int":
|
|
131
|
-
logical_type = avro_type.get("logicalType")
|
|
132
|
-
if logical_type == "date":
|
|
133
|
-
return pa.date32()
|
|
134
|
-
return pa.int32()
|
|
135
|
-
else:
|
|
136
|
-
return self.map_scalar_type(type_name)
|
|
137
|
-
elif isinstance(avro_type, str):
|
|
138
|
-
if avro_type in self.named_type_cache:
|
|
139
|
-
return self.convert_avro_type_to_parquet_type(self.named_type_cache[avro_type])
|
|
140
|
-
return self.map_scalar_type(avro_type)
|
|
141
|
-
|
|
142
|
-
return pa.string()
|
|
143
|
-
|
|
144
|
-
def cache_named_types(self, avro_type):
|
|
145
|
-
""" Add an encountered type to the list of types."""
|
|
146
|
-
if isinstance(avro_type, list):
|
|
147
|
-
for item in avro_type:
|
|
148
|
-
self.cache_named_types(item)
|
|
149
|
-
if isinstance(avro_type, dict) and avro_type.get("name"):
|
|
150
|
-
self.named_type_cache[self.get_fullname(avro_type.get(
|
|
151
|
-
"namespace"), avro_type.get("name"))] = avro_type
|
|
152
|
-
if "fields" in avro_type:
|
|
153
|
-
for field in avro_type.get("fields"):
|
|
154
|
-
if "type" in field:
|
|
155
|
-
self.cache_named_types(field.get("type"))
|
|
156
|
-
|
|
157
|
-
def map_union_fields(self, avro_type):
|
|
158
|
-
""" Map the fields of a union type to Parquet fields."""
|
|
159
|
-
struct_fields = []
|
|
160
|
-
for i, avro_union_type in enumerate(avro_type):
|
|
161
|
-
field_type = self.convert_avro_type_to_parquet_type(
|
|
162
|
-
avro_union_type)
|
|
163
|
-
if isinstance(avro_union_type, str):
|
|
164
|
-
if "null" == avro_union_type:
|
|
165
|
-
continue
|
|
166
|
-
if avro_union_type in self.named_type_cache:
|
|
167
|
-
avro_union_type = self.named_type_cache[avro_union_type]
|
|
168
|
-
if isinstance(avro_union_type, str):
|
|
169
|
-
field_name = f'{avro_union_type}Value'
|
|
170
|
-
elif isinstance(avro_union_type, dict):
|
|
171
|
-
if "type" in avro_union_type and "array" == avro_union_type["type"]:
|
|
172
|
-
field_name = 'ArrayValue'
|
|
173
|
-
elif "type" in avro_union_type and "map" == avro_union_type["type"]:
|
|
174
|
-
field_name = 'MapValue'
|
|
175
|
-
elif "name" in avro_union_type:
|
|
176
|
-
field_name = f'{avro_union_type.get("name")}Value'
|
|
177
|
-
else:
|
|
178
|
-
field_name = f'_{i}'
|
|
179
|
-
struct_fields.append(pa.field(field_name, field_type))
|
|
180
|
-
return struct_fields
|
|
181
|
-
|
|
182
|
-
def map_scalar_type(self, type_name: str):
|
|
183
|
-
""" Map an Avro scalar type to a Parquet scalar type."""
|
|
184
|
-
if type_name == "null":
|
|
185
|
-
return pa.string()
|
|
186
|
-
elif type_name == "int":
|
|
187
|
-
return pa.int32()
|
|
188
|
-
elif type_name == "long":
|
|
189
|
-
return pa.int64()
|
|
190
|
-
elif type_name == "float":
|
|
191
|
-
return pa.float32()
|
|
192
|
-
elif type_name == "double":
|
|
193
|
-
return pa.float64()
|
|
194
|
-
elif type_name == "boolean":
|
|
195
|
-
return pa.bool_()
|
|
196
|
-
elif type_name == "bytes":
|
|
197
|
-
return pa.binary()
|
|
198
|
-
elif type_name == "string":
|
|
199
|
-
return pa.string()
|
|
200
|
-
else:
|
|
201
|
-
return pa.string()
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def convert_avro_to_parquet(avro_schema_path, avro_record_type, parquet_file_path, emit_cloudevents_columns=False):
|
|
205
|
-
""" Convert an Avro schema to a Parquet schema."""
|
|
206
|
-
converter = AvroToParquetConverter()
|
|
207
|
-
converter.convert_avro_to_parquet(
|
|
208
|
-
avro_schema_path, avro_record_type, parquet_file_path, emit_cloudevents_columns)
|
|
1
|
+
""" Convert an Avro schema to a Parquet schema. """
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Dict, List
|
|
6
|
+
import pyarrow as pa
|
|
7
|
+
import pyarrow.parquet as pq
|
|
8
|
+
|
|
9
|
+
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AvroToParquetConverter:
|
|
13
|
+
""" Class to convert Avro schema to Parquet schema."""
|
|
14
|
+
|
|
15
|
+
def __init__(self: 'AvroToParquetConverter'):
|
|
16
|
+
self.named_type_cache: Dict[str, JsonNode] = {}
|
|
17
|
+
|
|
18
|
+
def get_fullname(self, namespace: str, name: str) -> str:
|
|
19
|
+
""" Get the full name of a record type."""
|
|
20
|
+
return f"{namespace}.{name}" if namespace else name
|
|
21
|
+
|
|
22
|
+
def convert_avro_to_parquet(self, avro_schema_path, avro_record_type, parquet_file_path, emit_cloudevents_columns=False):
|
|
23
|
+
""" Convert an Avro schema to a Parquet schema."""
|
|
24
|
+
schema_file = avro_schema_path
|
|
25
|
+
if not schema_file:
|
|
26
|
+
print("Please specify the avro schema file")
|
|
27
|
+
sys.exit(1)
|
|
28
|
+
with open(schema_file, "r", encoding="utf-8") as f:
|
|
29
|
+
schema_json = f.read()
|
|
30
|
+
|
|
31
|
+
# Parse the schema as a JSON object
|
|
32
|
+
schema = json.loads(schema_json)
|
|
33
|
+
self.cache_named_types(schema)
|
|
34
|
+
|
|
35
|
+
if isinstance(schema, list) and avro_record_type:
|
|
36
|
+
schema = next(
|
|
37
|
+
(x for x in schema if x["name"] == avro_record_type or x["namespace"]+"."+x["name"] == avro_record_type), None)
|
|
38
|
+
if schema is None:
|
|
39
|
+
print(
|
|
40
|
+
f"No top-level record type {avro_record_type} found in the Avro schema")
|
|
41
|
+
sys.exit(1)
|
|
42
|
+
elif not isinstance(schema, dict):
|
|
43
|
+
print(
|
|
44
|
+
"Expected a single Avro schema as a JSON object, or a list of schema records")
|
|
45
|
+
sys.exit(1)
|
|
46
|
+
|
|
47
|
+
# Get the name and fields of the top-level record
|
|
48
|
+
table_name = schema["name"]
|
|
49
|
+
fields = schema["fields"]
|
|
50
|
+
|
|
51
|
+
# Create a list to store the parquet schema
|
|
52
|
+
parquet_schema = []
|
|
53
|
+
|
|
54
|
+
# Append the parquet schema with the column names and types
|
|
55
|
+
for field in fields:
|
|
56
|
+
column_name = field["name"]
|
|
57
|
+
column_type = self.convert_avro_type_to_parquet_type(field["type"])
|
|
58
|
+
parquet_schema.append((column_name, column_type))
|
|
59
|
+
|
|
60
|
+
if emit_cloudevents_columns:
|
|
61
|
+
parquet_schema.extend([
|
|
62
|
+
("___type", pa.string()),
|
|
63
|
+
("___source", pa.string()),
|
|
64
|
+
("___id", pa.string()),
|
|
65
|
+
("___time", pa.timestamp('ns')),
|
|
66
|
+
("___subject", pa.string())
|
|
67
|
+
])
|
|
68
|
+
|
|
69
|
+
# Create an empty table with the schema
|
|
70
|
+
table = pa.Table.from_batches([], schema=pa.schema(parquet_schema))
|
|
71
|
+
pq.write_table(table, parquet_file_path)
|
|
72
|
+
|
|
73
|
+
def convert_avro_type_to_parquet_type(self, avro_type):
|
|
74
|
+
""" Convert an Avro type to a Parquet type."""
|
|
75
|
+
if isinstance(avro_type, list):
|
|
76
|
+
# If the type is an array, then it is a union type. Look whether it's a pair of a scalar type and null:
|
|
77
|
+
item_count = len(avro_type)
|
|
78
|
+
if item_count == 1:
|
|
79
|
+
return self.convert_avro_type_to_parquet_type(avro_type[0])
|
|
80
|
+
elif item_count == 2:
|
|
81
|
+
first = avro_type[0]
|
|
82
|
+
second = avro_type[1]
|
|
83
|
+
if isinstance(first, str) and first == "null":
|
|
84
|
+
return self.convert_avro_type_to_parquet_type(second)
|
|
85
|
+
elif isinstance(second, str) and second == "null":
|
|
86
|
+
return self.convert_avro_type_to_parquet_type(first)
|
|
87
|
+
else:
|
|
88
|
+
struct_fields = self.map_union_fields(avro_type)
|
|
89
|
+
return pa.struct(struct_fields)
|
|
90
|
+
elif item_count > 0:
|
|
91
|
+
struct_fields = self.map_union_fields(avro_type)
|
|
92
|
+
return pa.struct(struct_fields)
|
|
93
|
+
else:
|
|
94
|
+
print(f"WARNING: Empty union type {avro_type}")
|
|
95
|
+
return pa.string()
|
|
96
|
+
elif isinstance(avro_type, dict):
|
|
97
|
+
type_name = avro_type.get("type")
|
|
98
|
+
if type_name == "array":
|
|
99
|
+
return pa.list_(self.convert_avro_type_to_parquet_type(avro_type.get("items")))
|
|
100
|
+
elif type_name == "map":
|
|
101
|
+
return pa.map_(pa.string(), self.convert_avro_type_to_parquet_type(avro_type.get("values")))
|
|
102
|
+
elif type_name == "record":
|
|
103
|
+
fields = avro_type.get("fields")
|
|
104
|
+
if len(fields) == 0:
|
|
105
|
+
print(
|
|
106
|
+
f"WARNING: No fields in record type {avro_type.get('name')}")
|
|
107
|
+
return pa.string()
|
|
108
|
+
return pa.struct({field.get("name"): self.convert_avro_type_to_parquet_type(field.get("type")) for field in fields})
|
|
109
|
+
if type_name == "enum":
|
|
110
|
+
return pa.string()
|
|
111
|
+
elif type_name == "fixed":
|
|
112
|
+
return pa.string()
|
|
113
|
+
elif type_name == "string":
|
|
114
|
+
logical_type = avro_type.get("logicalType")
|
|
115
|
+
if logical_type == "uuid":
|
|
116
|
+
return pa.string()
|
|
117
|
+
return pa.string()
|
|
118
|
+
elif type_name == "bytes":
|
|
119
|
+
logical_type = avro_type.get("logicalType")
|
|
120
|
+
if logical_type == "decimal":
|
|
121
|
+
return pa.decimal128(38, 18)
|
|
122
|
+
return pa.binary()
|
|
123
|
+
elif type_name == "long":
|
|
124
|
+
logical_type = avro_type.get("logicalType")
|
|
125
|
+
if logical_type in ["timestamp-millis", "timestamp-micros"]:
|
|
126
|
+
return pa.timestamp('ns')
|
|
127
|
+
if logical_type in ["time-millis", "time-micros"]:
|
|
128
|
+
return pa.time64('ns')
|
|
129
|
+
return pa.int64()
|
|
130
|
+
elif type_name == "int":
|
|
131
|
+
logical_type = avro_type.get("logicalType")
|
|
132
|
+
if logical_type == "date":
|
|
133
|
+
return pa.date32()
|
|
134
|
+
return pa.int32()
|
|
135
|
+
else:
|
|
136
|
+
return self.map_scalar_type(type_name)
|
|
137
|
+
elif isinstance(avro_type, str):
|
|
138
|
+
if avro_type in self.named_type_cache:
|
|
139
|
+
return self.convert_avro_type_to_parquet_type(self.named_type_cache[avro_type])
|
|
140
|
+
return self.map_scalar_type(avro_type)
|
|
141
|
+
|
|
142
|
+
return pa.string()
|
|
143
|
+
|
|
144
|
+
def cache_named_types(self, avro_type):
|
|
145
|
+
""" Add an encountered type to the list of types."""
|
|
146
|
+
if isinstance(avro_type, list):
|
|
147
|
+
for item in avro_type:
|
|
148
|
+
self.cache_named_types(item)
|
|
149
|
+
if isinstance(avro_type, dict) and avro_type.get("name"):
|
|
150
|
+
self.named_type_cache[self.get_fullname(avro_type.get(
|
|
151
|
+
"namespace"), avro_type.get("name"))] = avro_type
|
|
152
|
+
if "fields" in avro_type:
|
|
153
|
+
for field in avro_type.get("fields"):
|
|
154
|
+
if "type" in field:
|
|
155
|
+
self.cache_named_types(field.get("type"))
|
|
156
|
+
|
|
157
|
+
def map_union_fields(self, avro_type):
|
|
158
|
+
""" Map the fields of a union type to Parquet fields."""
|
|
159
|
+
struct_fields = []
|
|
160
|
+
for i, avro_union_type in enumerate(avro_type):
|
|
161
|
+
field_type = self.convert_avro_type_to_parquet_type(
|
|
162
|
+
avro_union_type)
|
|
163
|
+
if isinstance(avro_union_type, str):
|
|
164
|
+
if "null" == avro_union_type:
|
|
165
|
+
continue
|
|
166
|
+
if avro_union_type in self.named_type_cache:
|
|
167
|
+
avro_union_type = self.named_type_cache[avro_union_type]
|
|
168
|
+
if isinstance(avro_union_type, str):
|
|
169
|
+
field_name = f'{avro_union_type}Value'
|
|
170
|
+
elif isinstance(avro_union_type, dict):
|
|
171
|
+
if "type" in avro_union_type and "array" == avro_union_type["type"]:
|
|
172
|
+
field_name = 'ArrayValue'
|
|
173
|
+
elif "type" in avro_union_type and "map" == avro_union_type["type"]:
|
|
174
|
+
field_name = 'MapValue'
|
|
175
|
+
elif "name" in avro_union_type:
|
|
176
|
+
field_name = f'{avro_union_type.get("name")}Value'
|
|
177
|
+
else:
|
|
178
|
+
field_name = f'_{i}'
|
|
179
|
+
struct_fields.append(pa.field(field_name, field_type))
|
|
180
|
+
return struct_fields
|
|
181
|
+
|
|
182
|
+
def map_scalar_type(self, type_name: str):
|
|
183
|
+
""" Map an Avro scalar type to a Parquet scalar type."""
|
|
184
|
+
if type_name == "null":
|
|
185
|
+
return pa.string()
|
|
186
|
+
elif type_name == "int":
|
|
187
|
+
return pa.int32()
|
|
188
|
+
elif type_name == "long":
|
|
189
|
+
return pa.int64()
|
|
190
|
+
elif type_name == "float":
|
|
191
|
+
return pa.float32()
|
|
192
|
+
elif type_name == "double":
|
|
193
|
+
return pa.float64()
|
|
194
|
+
elif type_name == "boolean":
|
|
195
|
+
return pa.bool_()
|
|
196
|
+
elif type_name == "bytes":
|
|
197
|
+
return pa.binary()
|
|
198
|
+
elif type_name == "string":
|
|
199
|
+
return pa.string()
|
|
200
|
+
else:
|
|
201
|
+
return pa.string()
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def convert_avro_to_parquet(avro_schema_path, avro_record_type, parquet_file_path, emit_cloudevents_columns=False):
|
|
205
|
+
""" Convert an Avro schema to a Parquet schema."""
|
|
206
|
+
converter = AvroToParquetConverter()
|
|
207
|
+
converter.convert_avro_to_parquet(
|
|
208
|
+
avro_schema_path, avro_record_type, parquet_file_path, emit_cloudevents_columns)
|