structurize 2.16.2__py3-none-any.whl → 2.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -63
- avrotize/__main__.py +5 -5
- avrotize/_version.py +34 -34
- avrotize/asn1toavro.py +160 -160
- avrotize/avrotize.py +152 -152
- avrotize/avrotocpp.py +483 -483
- avrotize/avrotocsharp.py +992 -992
- avrotize/avrotocsv.py +121 -121
- avrotize/avrotodatapackage.py +173 -173
- avrotize/avrotodb.py +1383 -1383
- avrotize/avrotogo.py +476 -476
- avrotize/avrotographql.py +197 -197
- avrotize/avrotoiceberg.py +210 -210
- avrotize/avrotojava.py +1023 -1023
- avrotize/avrotojs.py +250 -250
- avrotize/avrotojsons.py +481 -481
- avrotize/avrotojstruct.py +345 -345
- avrotize/avrotokusto.py +363 -363
- avrotize/avrotomd.py +137 -137
- avrotize/avrotools.py +168 -168
- avrotize/avrotoparquet.py +208 -208
- avrotize/avrotoproto.py +358 -358
- avrotize/avrotopython.py +622 -622
- avrotize/avrotorust.py +435 -435
- avrotize/avrotots.py +598 -598
- avrotize/avrotoxsd.py +344 -344
- avrotize/commands.json +2493 -2433
- avrotize/common.py +828 -828
- avrotize/constants.py +4 -4
- avrotize/csvtoavro.py +131 -131
- avrotize/datapackagetoavro.py +76 -76
- avrotize/dependency_resolver.py +348 -348
- avrotize/jsonstoavro.py +1698 -1698
- avrotize/jsonstostructure.py +2642 -2642
- avrotize/jstructtoavro.py +878 -878
- avrotize/kstructtoavro.py +93 -93
- avrotize/kustotoavro.py +455 -455
- avrotize/parquettoavro.py +157 -157
- avrotize/proto2parser.py +497 -497
- avrotize/proto3parser.py +402 -402
- avrotize/prototoavro.py +382 -382
- avrotize/structuretocsharp.py +2005 -2005
- avrotize/structuretojsons.py +498 -498
- avrotize/structuretopython.py +772 -772
- avrotize/structuretots.py +653 -0
- avrotize/xsdtoavro.py +413 -413
- structurize-2.16.6.dist-info/METADATA +107 -0
- structurize-2.16.6.dist-info/RECORD +52 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/licenses/LICENSE +200 -200
- structurize-2.16.2.dist-info/METADATA +0 -805
- structurize-2.16.2.dist-info/RECORD +0 -51
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/WHEEL +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/top_level.txt +0 -0
avrotize/parquettoavro.py
CHANGED
|
@@ -1,157 +1,157 @@
|
|
|
1
|
-
# coding: utf-8
|
|
2
|
-
"""
|
|
3
|
-
Module to convert Parquet schema to Avro schema.
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import json
|
|
7
|
-
import os
|
|
8
|
-
import pyarrow as pa
|
|
9
|
-
import pyarrow.parquet as pq
|
|
10
|
-
|
|
11
|
-
from avrotize.common import avro_name
|
|
12
|
-
|
|
13
|
-
class ParquetToAvroConverter:
|
|
14
|
-
"""
|
|
15
|
-
Class to convert Parquet schema to Avro schema.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
def __init__(self, parquet_file_path, avro_schema_path, namespace=""):
|
|
19
|
-
"""
|
|
20
|
-
Initialize the converter with file paths and namespace.
|
|
21
|
-
|
|
22
|
-
:param parquet_file_path: Path to the Parquet file.
|
|
23
|
-
:param avro_schema_path: Path to save the Avro schema file.
|
|
24
|
-
:param namespace: Namespace for Avro records.
|
|
25
|
-
"""
|
|
26
|
-
self.parquet_file_path = parquet_file_path
|
|
27
|
-
self.avro_schema_path = avro_schema_path
|
|
28
|
-
self.namespace = namespace
|
|
29
|
-
|
|
30
|
-
def convert(self):
|
|
31
|
-
"""
|
|
32
|
-
Convert Parquet schema to Avro schema and save to file.
|
|
33
|
-
"""
|
|
34
|
-
parquet_table = pq.read_table(self.parquet_file_path)
|
|
35
|
-
schema = parquet_table.schema
|
|
36
|
-
|
|
37
|
-
# Infer the name of the schema from the parquet file name
|
|
38
|
-
schema_name = avro_name(os.path.basename(self.parquet_file_path).split(".")[0])
|
|
39
|
-
|
|
40
|
-
# Update the avro_schema dictionary
|
|
41
|
-
avro_schema = {
|
|
42
|
-
"type": "record",
|
|
43
|
-
"name": schema_name,
|
|
44
|
-
"namespace": self.namespace,
|
|
45
|
-
"fields": []
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
for field in schema:
|
|
49
|
-
avro_field = self.convert_parquet_field_to_avro_field(field)
|
|
50
|
-
avro_schema["fields"].append(avro_field)
|
|
51
|
-
|
|
52
|
-
with open(self.avro_schema_path, "w", encoding="utf-8") as file:
|
|
53
|
-
json.dump(avro_schema, file, indent=2)
|
|
54
|
-
|
|
55
|
-
def convert_parquet_field_to_avro_field(self, field):
|
|
56
|
-
"""
|
|
57
|
-
Convert a Parquet field to an Avro field.
|
|
58
|
-
|
|
59
|
-
:param field: Parquet field to convert.
|
|
60
|
-
:return: Avro field as a dictionary.
|
|
61
|
-
"""
|
|
62
|
-
avro_type = self.convert_parquet_type_to_avro_type(field.type, field.name)
|
|
63
|
-
avro_field = {
|
|
64
|
-
"name": field.name,
|
|
65
|
-
"type": avro_type
|
|
66
|
-
}
|
|
67
|
-
if field.metadata and b'description' in field.metadata:
|
|
68
|
-
avro_field["doc"] = field.metadata[b'description'].decode("utf-8")
|
|
69
|
-
return avro_field
|
|
70
|
-
|
|
71
|
-
def convert_parquet_type_to_avro_type(self, parquet_type, field_name):
|
|
72
|
-
"""
|
|
73
|
-
Convert a Parquet type to an Avro type.
|
|
74
|
-
|
|
75
|
-
:param parquet_type: Parquet type to convert.
|
|
76
|
-
:param field_name: Name of the field being converted.
|
|
77
|
-
:return: Avro type as a string or dictionary.
|
|
78
|
-
"""
|
|
79
|
-
if pa.types.is_int8(parquet_type):
|
|
80
|
-
return "int"
|
|
81
|
-
if pa.types.is_int16(parquet_type):
|
|
82
|
-
return "int"
|
|
83
|
-
if pa.types.is_int32(parquet_type):
|
|
84
|
-
return "int"
|
|
85
|
-
if pa.types.is_int64(parquet_type):
|
|
86
|
-
return "long"
|
|
87
|
-
if pa.types.is_uint8(parquet_type):
|
|
88
|
-
return "int"
|
|
89
|
-
if pa.types.is_uint16(parquet_type):
|
|
90
|
-
return "int"
|
|
91
|
-
if pa.types.is_uint32(parquet_type):
|
|
92
|
-
return "long"
|
|
93
|
-
if pa.types.is_uint64(parquet_type):
|
|
94
|
-
return "long"
|
|
95
|
-
if pa.types.is_float32(parquet_type):
|
|
96
|
-
return "float"
|
|
97
|
-
if pa.types.is_float64(parquet_type):
|
|
98
|
-
return "double"
|
|
99
|
-
if pa.types.is_boolean(parquet_type):
|
|
100
|
-
return "boolean"
|
|
101
|
-
if pa.types.is_binary(parquet_type):
|
|
102
|
-
return "bytes"
|
|
103
|
-
if pa.types.is_string(parquet_type):
|
|
104
|
-
return "string"
|
|
105
|
-
if pa.types.is_timestamp(parquet_type):
|
|
106
|
-
return {"type": "long", "logicalType": "timestamp-millis"}
|
|
107
|
-
if pa.types.is_date32(parquet_type):
|
|
108
|
-
return {"type": "int", "logicalType": "date"}
|
|
109
|
-
if pa.types.is_date64(parquet_type):
|
|
110
|
-
return {"type": "long", "logicalType": "timestamp-millis"}
|
|
111
|
-
if pa.types.is_list(parquet_type):
|
|
112
|
-
return {
|
|
113
|
-
"type": "array",
|
|
114
|
-
"items": self.convert_parquet_type_to_avro_type(parquet_type.value_type, field_name)
|
|
115
|
-
}
|
|
116
|
-
if pa.types.is_map(parquet_type):
|
|
117
|
-
return {
|
|
118
|
-
"type": "map",
|
|
119
|
-
"values": self.convert_parquet_type_to_avro_type(parquet_type.item_type, field_name)
|
|
120
|
-
}
|
|
121
|
-
if pa.types.is_struct(parquet_type):
|
|
122
|
-
fields = [
|
|
123
|
-
{
|
|
124
|
-
"name": nested_field.name,
|
|
125
|
-
"type": self.convert_parquet_type_to_avro_type(nested_field.type, nested_field.name)
|
|
126
|
-
} for nested_field in parquet_type
|
|
127
|
-
]
|
|
128
|
-
return {
|
|
129
|
-
"type": "record",
|
|
130
|
-
"name": f"{field_name}Type",
|
|
131
|
-
"namespace": self.namespace,
|
|
132
|
-
"fields": fields
|
|
133
|
-
}
|
|
134
|
-
if pa.types.is_decimal(parquet_type):
|
|
135
|
-
return {
|
|
136
|
-
"type": "bytes",
|
|
137
|
-
"logicalType": "decimal",
|
|
138
|
-
"precision": parquet_type.precision,
|
|
139
|
-
"scale": parquet_type.scale
|
|
140
|
-
}
|
|
141
|
-
return "string"
|
|
142
|
-
|
|
143
|
-
def convert_parquet_to_avro(parquet_file_path, avro_file_path, namespace=""):
|
|
144
|
-
"""
|
|
145
|
-
Convert a Parquet file to an Avro schema file.
|
|
146
|
-
|
|
147
|
-
:param parquet_file_path: Path to the Parquet file.
|
|
148
|
-
:param avro_file_path: Path to save the Avro schema file.
|
|
149
|
-
:param namespace: Namespace for Avro records.
|
|
150
|
-
"""
|
|
151
|
-
|
|
152
|
-
if not os.path.exists(parquet_file_path):
|
|
153
|
-
raise FileNotFoundError(f"Parquet file not found: {parquet_file_path}")
|
|
154
|
-
|
|
155
|
-
converter = ParquetToAvroConverter(parquet_file_path, avro_file_path, namespace)
|
|
156
|
-
converter.convert()
|
|
157
|
-
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
"""
|
|
3
|
+
Module to convert Parquet schema to Avro schema.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
import pyarrow.parquet as pq
|
|
10
|
+
|
|
11
|
+
from avrotize.common import avro_name
|
|
12
|
+
|
|
13
|
+
class ParquetToAvroConverter:
    """
    Convert the schema of a Parquet file into an Avro record schema.

    Reads the Parquet file's Arrow schema via ``pyarrow.parquet`` and writes
    a single top-level Avro ``record`` schema as indented JSON.
    """

    def __init__(self, parquet_file_path, avro_schema_path, namespace=""):
        """
        Initialize the converter with file paths and namespace.

        :param parquet_file_path: Path to the Parquet file.
        :param avro_schema_path: Path to save the Avro schema file.
        :param namespace: Namespace for Avro records.
        """
        self.parquet_file_path = parquet_file_path
        self.avro_schema_path = avro_schema_path
        self.namespace = namespace

    def convert(self):
        """
        Convert the Parquet schema to an Avro schema and save it to
        ``self.avro_schema_path`` as indented JSON.
        """
        parquet_table = pq.read_table(self.parquet_file_path)
        schema = parquet_table.schema

        # The record name is derived from the file name (first dot-separated
        # segment), sanitized into a valid Avro name by avro_name().
        schema_name = avro_name(os.path.basename(self.parquet_file_path).split(".")[0])

        avro_schema = {
            "type": "record",
            "name": schema_name,
            "namespace": self.namespace,
            "fields": [self.convert_parquet_field_to_avro_field(field)
                       for field in schema]
        }

        with open(self.avro_schema_path, "w", encoding="utf-8") as file:
            json.dump(avro_schema, file, indent=2)

    def convert_parquet_field_to_avro_field(self, field):
        """
        Convert a Parquet (Arrow) field to an Avro field declaration.

        A ``description`` entry in the field metadata becomes the Avro
        ``doc`` attribute.

        NOTE(review): ``field.nullable`` is not consulted, so nullable
        Parquet columns come out as required Avro fields rather than
        ``["null", type]`` unions. Changing this would alter every emitted
        schema — confirm downstream expectations first.

        :param field: Parquet field to convert.
        :return: Avro field as a dictionary.
        """
        avro_type = self.convert_parquet_type_to_avro_type(field.type, field.name)
        avro_field = {
            "name": field.name,
            "type": avro_type
        }
        if field.metadata and b'description' in field.metadata:
            avro_field["doc"] = field.metadata[b'description'].decode("utf-8")
        return avro_field

    def convert_parquet_type_to_avro_type(self, parquet_type, field_name):
        """
        Convert a Parquet (Arrow) type to an Avro type.

        Scalar types map to Avro primitives or logical types; lists, maps,
        and structs recurse into their element/value/child types. Any type
        not handled below falls back to ``"string"``.

        :param parquet_type: Parquet type to convert.
        :param field_name: Name of the field being converted; used to name
            nested record types.
        :return: Avro type as a string or dictionary.
        """
        # Signed integers up to 32 bits fit Avro "int"; int64 needs "long".
        if pa.types.is_int8(parquet_type) or pa.types.is_int16(parquet_type) \
                or pa.types.is_int32(parquet_type):
            return "int"
        if pa.types.is_int64(parquet_type):
            return "long"
        # Unsigned 8/16-bit values fit "int"; uint32/uint64 need "long"
        # (uint64 may still overflow a signed long — inherent Avro limit).
        if pa.types.is_uint8(parquet_type) or pa.types.is_uint16(parquet_type):
            return "int"
        if pa.types.is_uint32(parquet_type) or pa.types.is_uint64(parquet_type):
            return "long"
        if pa.types.is_float32(parquet_type):
            return "float"
        if pa.types.is_float64(parquet_type):
            return "double"
        if pa.types.is_boolean(parquet_type):
            return "boolean"
        # Large/fixed-size binary variants are byte payloads just like
        # plain binary; previously they fell through to "string".
        if pa.types.is_binary(parquet_type) or pa.types.is_large_binary(parquet_type) \
                or pa.types.is_fixed_size_binary(parquet_type):
            return "bytes"
        if pa.types.is_string(parquet_type) or pa.types.is_large_string(parquet_type):
            return "string"
        if pa.types.is_timestamp(parquet_type):
            # Preserve sub-millisecond precision where Avro allows it:
            # micro/nanosecond timestamps map to timestamp-micros
            # (nanoseconds are truncated — Avro has no nanosecond logical
            # type); second/millisecond timestamps map to timestamp-millis.
            if parquet_type.unit in ("us", "ns"):
                return {"type": "long", "logicalType": "timestamp-micros"}
            return {"type": "long", "logicalType": "timestamp-millis"}
        if pa.types.is_date32(parquet_type):
            return {"type": "int", "logicalType": "date"}
        if pa.types.is_date64(parquet_type):
            # date64 is milliseconds since the epoch, not a day count.
            return {"type": "long", "logicalType": "timestamp-millis"}
        # Time-of-day types: time32 carries second/millisecond resolution,
        # time64 micro/nanosecond resolution. Previously unhandled.
        if pa.types.is_time32(parquet_type):
            return {"type": "int", "logicalType": "time-millis"}
        if pa.types.is_time64(parquet_type):
            return {"type": "long", "logicalType": "time-micros"}
        if pa.types.is_list(parquet_type):
            return {
                "type": "array",
                "items": self.convert_parquet_type_to_avro_type(parquet_type.value_type, field_name)
            }
        if pa.types.is_map(parquet_type):
            # Avro map keys are always strings; the Parquet key type is
            # ignored here — assumes keys are (representable as) strings.
            return {
                "type": "map",
                "values": self.convert_parquet_type_to_avro_type(parquet_type.item_type, field_name)
            }
        if pa.types.is_struct(parquet_type):
            fields = [
                {
                    "name": nested_field.name,
                    "type": self.convert_parquet_type_to_avro_type(nested_field.type, nested_field.name)
                } for nested_field in parquet_type
            ]
            return {
                "type": "record",
                "name": f"{field_name}Type",
                "namespace": self.namespace,
                "fields": fields
            }
        if pa.types.is_decimal(parquet_type):
            return {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": parquet_type.precision,
                "scale": parquet_type.scale
            }
        # Fallback for anything not handled above.
        return "string"
|
|
142
|
+
|
|
143
|
+
def convert_parquet_to_avro(parquet_file_path, avro_file_path, namespace=""):
    """
    Derive an Avro schema file from a Parquet file.

    Thin module-level entry point around :class:`ParquetToAvroConverter`.

    :param parquet_file_path: Path to the Parquet file.
    :param avro_file_path: Path to save the Avro schema file.
    :param namespace: Namespace for Avro records.
    :raises FileNotFoundError: If ``parquet_file_path`` does not exist.
    """
    # Fail fast with a clear message before handing off to pyarrow.
    if not os.path.exists(parquet_file_path):
        raise FileNotFoundError(f"Parquet file not found: {parquet_file_path}")

    ParquetToAvroConverter(parquet_file_path, avro_file_path, namespace).convert()
|
|
157
|
+
|