structurize 2.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +64 -0
- avrotize/__main__.py +6 -0
- avrotize/_version.py +34 -0
- avrotize/asn1toavro.py +160 -0
- avrotize/avrotize.py +152 -0
- avrotize/avrotocpp.py +483 -0
- avrotize/avrotocsharp.py +1075 -0
- avrotize/avrotocsv.py +121 -0
- avrotize/avrotodatapackage.py +173 -0
- avrotize/avrotodb.py +1383 -0
- avrotize/avrotogo.py +476 -0
- avrotize/avrotographql.py +197 -0
- avrotize/avrotoiceberg.py +210 -0
- avrotize/avrotojava.py +2156 -0
- avrotize/avrotojs.py +250 -0
- avrotize/avrotojsons.py +481 -0
- avrotize/avrotojstruct.py +345 -0
- avrotize/avrotokusto.py +364 -0
- avrotize/avrotomd.py +137 -0
- avrotize/avrotools.py +168 -0
- avrotize/avrotoparquet.py +208 -0
- avrotize/avrotoproto.py +359 -0
- avrotize/avrotopython.py +624 -0
- avrotize/avrotorust.py +435 -0
- avrotize/avrotots.py +598 -0
- avrotize/avrotoxsd.py +344 -0
- avrotize/cddltostructure.py +1841 -0
- avrotize/commands.json +3337 -0
- avrotize/common.py +834 -0
- avrotize/constants.py +72 -0
- avrotize/csvtoavro.py +132 -0
- avrotize/datapackagetoavro.py +76 -0
- avrotize/dependencies/cpp/vcpkg/vcpkg.json +19 -0
- avrotize/dependencies/typescript/node22/package.json +16 -0
- avrotize/dependency_resolver.py +348 -0
- avrotize/dependency_version.py +432 -0
- avrotize/jsonstoavro.py +2167 -0
- avrotize/jsonstostructure.py +2642 -0
- avrotize/jstructtoavro.py +878 -0
- avrotize/kstructtoavro.py +93 -0
- avrotize/kustotoavro.py +455 -0
- avrotize/parquettoavro.py +157 -0
- avrotize/proto2parser.py +498 -0
- avrotize/proto3parser.py +403 -0
- avrotize/prototoavro.py +382 -0
- avrotize/structuretocddl.py +597 -0
- avrotize/structuretocpp.py +697 -0
- avrotize/structuretocsharp.py +2295 -0
- avrotize/structuretocsv.py +365 -0
- avrotize/structuretodatapackage.py +659 -0
- avrotize/structuretodb.py +1125 -0
- avrotize/structuretogo.py +720 -0
- avrotize/structuretographql.py +502 -0
- avrotize/structuretoiceberg.py +355 -0
- avrotize/structuretojava.py +853 -0
- avrotize/structuretojsons.py +498 -0
- avrotize/structuretokusto.py +639 -0
- avrotize/structuretomd.py +322 -0
- avrotize/structuretoproto.py +764 -0
- avrotize/structuretopython.py +772 -0
- avrotize/structuretorust.py +714 -0
- avrotize/structuretots.py +653 -0
- avrotize/structuretoxsd.py +679 -0
- avrotize/xsdtoavro.py +413 -0
- structurize-2.19.0.dist-info/METADATA +107 -0
- structurize-2.19.0.dist-info/RECORD +70 -0
- structurize-2.19.0.dist-info/WHEEL +5 -0
- structurize-2.19.0.dist-info/entry_points.txt +2 -0
- structurize-2.19.0.dist-info/licenses/LICENSE +201 -0
- structurize-2.19.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""Convert a JSON Structure schema to an Iceberg schema."""
|
|
2
|
+
|
|
3
|
+
import json
import os
import sys
from typing import Dict, List, Any, Optional

import pyarrow as pa
from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    BooleanType,
    IntegerType,
    LongType,
    FloatType,
    DoubleType,
    StringType,
    BinaryType,
    DateType,
    TimestampType,
    DecimalType,
    FixedType,
    ListType,
    MapType,
    StructType,
    TimeType
)
|
|
26
|
+
|
|
27
|
+
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class StructureToIcebergConverter:
    """Class to convert JSON Structure schema to Iceberg schema."""

    def __init__(self: 'StructureToIcebergConverter'):
        # Monotonic counter used to hand out unique Iceberg field IDs.
        self.id_counter = 0
        # Cache of named types resolved during conversion.
        self.named_type_cache: Dict[str, "JsonNode"] = {}
        # Contents of the schema's "definitions" section, if present.
        self.definitions: Dict[str, Any] = {}
        # Full parsed schema document; required to resolve local $refs.
        self.schema_doc: Optional[Dict[str, Any]] = None
|
|
38
|
+
|
|
39
|
+
def get_id(self) -> int:
|
|
40
|
+
"""Get a unique ID for a field."""
|
|
41
|
+
self.id_counter += 1
|
|
42
|
+
return self.id_counter
|
|
43
|
+
|
|
44
|
+
def get_fullname(self, namespace: str, name: str) -> str:
|
|
45
|
+
"""Get the full name of a record type."""
|
|
46
|
+
return f"{namespace}.{name}" if namespace else name
|
|
47
|
+
|
|
48
|
+
def convert_structure_to_iceberg(self, structure_schema_path: str, structure_record_type: Optional[str], output_path: str, emit_cloudevents_columns: bool=False):
|
|
49
|
+
"""Convert a JSON Structure schema to an Iceberg schema."""
|
|
50
|
+
schema_file = structure_schema_path
|
|
51
|
+
if not schema_file:
|
|
52
|
+
print("Please specify the JSON Structure schema file")
|
|
53
|
+
sys.exit(1)
|
|
54
|
+
with open(schema_file, "r", encoding="utf-8") as f:
|
|
55
|
+
schema_json = f.read()
|
|
56
|
+
|
|
57
|
+
# Parse the schema as a JSON object
|
|
58
|
+
schema = json.loads(schema_json)
|
|
59
|
+
self.schema_doc = schema
|
|
60
|
+
|
|
61
|
+
# Handle definitions if present
|
|
62
|
+
if "definitions" in schema:
|
|
63
|
+
self.definitions = schema["definitions"]
|
|
64
|
+
|
|
65
|
+
# For JSON Structure, we expect an object type at the top level
|
|
66
|
+
if schema.get("type") != "object":
|
|
67
|
+
# Check if we have a $ref at the top level
|
|
68
|
+
if "$ref" in schema:
|
|
69
|
+
ref = schema["$ref"]
|
|
70
|
+
schema = self.resolve_ref(ref)
|
|
71
|
+
elif structure_record_type and "definitions" in schema:
|
|
72
|
+
# Look for the type in definitions
|
|
73
|
+
if structure_record_type in schema["definitions"]:
|
|
74
|
+
schema = schema["definitions"][structure_record_type]
|
|
75
|
+
else:
|
|
76
|
+
print(f"No record type {structure_record_type} found in the JSON Structure schema definitions")
|
|
77
|
+
sys.exit(1)
|
|
78
|
+
else:
|
|
79
|
+
print("Expected a JSON Structure schema with type 'object' at the top level")
|
|
80
|
+
sys.exit(1)
|
|
81
|
+
|
|
82
|
+
# Get the name and properties of the top-level object
|
|
83
|
+
table_name = schema.get("name", "Table")
|
|
84
|
+
properties = schema.get("properties", {})
|
|
85
|
+
required = schema.get("required", [])
|
|
86
|
+
|
|
87
|
+
# Create a list to store the iceberg schema
|
|
88
|
+
iceberg_fields: List[NestedField] = []
|
|
89
|
+
|
|
90
|
+
# Append the iceberg schema with the column names and types
|
|
91
|
+
for prop_name, prop_schema in properties.items():
|
|
92
|
+
is_required = prop_name in required
|
|
93
|
+
column_type = self.convert_structure_type_to_iceberg_type(prop_schema)
|
|
94
|
+
iceberg_fields.append(
|
|
95
|
+
NestedField(
|
|
96
|
+
field_id=self.get_id(),
|
|
97
|
+
name=prop_name,
|
|
98
|
+
field_type=column_type,
|
|
99
|
+
required=is_required
|
|
100
|
+
))
|
|
101
|
+
|
|
102
|
+
if emit_cloudevents_columns:
|
|
103
|
+
iceberg_fields.extend([
|
|
104
|
+
NestedField(field_id=self.get_id(),
|
|
105
|
+
name="___type", field_type=StringType(), required=False),
|
|
106
|
+
NestedField(field_id=self.get_id(),
|
|
107
|
+
name="___source", field_type=StringType(), required=False),
|
|
108
|
+
NestedField(field_id=self.get_id(),
|
|
109
|
+
name="___id", field_type=StringType(), required=False),
|
|
110
|
+
NestedField(field_id=self.get_id(),
|
|
111
|
+
name="___time", field_type=TimestampType(), required=False),
|
|
112
|
+
NestedField(field_id=self.get_id(),
|
|
113
|
+
name="___subject", field_type=StringType(), required=False)
|
|
114
|
+
])
|
|
115
|
+
|
|
116
|
+
iceberg_schema = Schema(*iceberg_fields)
|
|
117
|
+
arrow_schema = schema_to_pyarrow(iceberg_schema)
|
|
118
|
+
print(f"Iceberg schema created: {arrow_schema}")
|
|
119
|
+
|
|
120
|
+
# Write to Iceberg table (for demonstration, using local file system)
|
|
121
|
+
file_io = PyArrowFileIO()
|
|
122
|
+
output_file = file_io.new_output("file://"+output_path)
|
|
123
|
+
with output_file.create(overwrite=True) as f:
|
|
124
|
+
pa.output_stream(f).write(arrow_schema.serialize().to_pybytes())
|
|
125
|
+
|
|
126
|
+
def resolve_ref(self, ref: str) -> Dict[str, Any]:
|
|
127
|
+
"""Resolve a $ref reference."""
|
|
128
|
+
if not ref.startswith("#/"):
|
|
129
|
+
raise ValueError(f"Only local references are supported, got: {ref}")
|
|
130
|
+
|
|
131
|
+
parts = ref[2:].split("/")
|
|
132
|
+
current = self.schema_doc
|
|
133
|
+
|
|
134
|
+
for part in parts:
|
|
135
|
+
if isinstance(current, dict) and part in current:
|
|
136
|
+
current = current[part]
|
|
137
|
+
else:
|
|
138
|
+
raise ValueError(f"Could not resolve reference: {ref}")
|
|
139
|
+
|
|
140
|
+
return current
|
|
141
|
+
|
|
142
|
+
def convert_structure_type_to_iceberg_type(self, structure_type):
|
|
143
|
+
"""Convert a JSON Structure type to an Iceberg type."""
|
|
144
|
+
# Handle $ref
|
|
145
|
+
if isinstance(structure_type, dict) and "$ref" in structure_type:
|
|
146
|
+
ref = structure_type["$ref"]
|
|
147
|
+
resolved = self.resolve_ref(ref)
|
|
148
|
+
return self.convert_structure_type_to_iceberg_type(resolved)
|
|
149
|
+
|
|
150
|
+
# Handle array of types (e.g., ["string", "null"] for nullable types)
|
|
151
|
+
if isinstance(structure_type, list):
|
|
152
|
+
# Filter out null from the list
|
|
153
|
+
non_null_types = [t for t in structure_type if t != "null"]
|
|
154
|
+
if len(non_null_types) == 1:
|
|
155
|
+
# Nullable type - just use the non-null type (Iceberg handles optionality with required flag)
|
|
156
|
+
return self.convert_structure_type_to_iceberg_type(non_null_types[0])
|
|
157
|
+
elif len(non_null_types) > 1:
|
|
158
|
+
# Union of multiple non-null types - create a struct with alternatives
|
|
159
|
+
fields = []
|
|
160
|
+
for i, choice in enumerate(non_null_types):
|
|
161
|
+
choice_type = self.convert_structure_type_to_iceberg_type(choice)
|
|
162
|
+
fields.append(NestedField(
|
|
163
|
+
field_id=self.get_id(),
|
|
164
|
+
name=f"option_{i}",
|
|
165
|
+
field_type=choice_type,
|
|
166
|
+
required=False
|
|
167
|
+
))
|
|
168
|
+
return StructType(*fields)
|
|
169
|
+
else:
|
|
170
|
+
# Only null - return string as fallback
|
|
171
|
+
return StringType()
|
|
172
|
+
|
|
173
|
+
# Handle dictionary with type field
|
|
174
|
+
if isinstance(structure_type, dict):
|
|
175
|
+
type_name = structure_type.get("type")
|
|
176
|
+
|
|
177
|
+
# Handle type being an array
|
|
178
|
+
if isinstance(type_name, list):
|
|
179
|
+
# This is like {"type": ["string", "null"]}
|
|
180
|
+
return self.convert_structure_type_to_iceberg_type(type_name)
|
|
181
|
+
|
|
182
|
+
# Handle array type
|
|
183
|
+
if type_name == "array":
|
|
184
|
+
items = structure_type.get("items", {"type": "string"})
|
|
185
|
+
return ListType(
|
|
186
|
+
element_id=self.get_id(),
|
|
187
|
+
element_type=self.convert_structure_type_to_iceberg_type(items),
|
|
188
|
+
element_required=True
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# Handle set type (treated as array in Iceberg)
|
|
192
|
+
elif type_name == "set":
|
|
193
|
+
items = structure_type.get("items", {"type": "string"})
|
|
194
|
+
return ListType(
|
|
195
|
+
element_id=self.get_id(),
|
|
196
|
+
element_type=self.convert_structure_type_to_iceberg_type(items),
|
|
197
|
+
element_required=True
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
# Handle map type
|
|
201
|
+
elif type_name == "map":
|
|
202
|
+
values = structure_type.get("values", {"type": "string"})
|
|
203
|
+
return MapType(
|
|
204
|
+
key_id=self.get_id(),
|
|
205
|
+
key_type=StringType(),
|
|
206
|
+
value_id=self.get_id(),
|
|
207
|
+
value_type=self.convert_structure_type_to_iceberg_type(values),
|
|
208
|
+
value_required=True
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
# Handle tuple type (treated as struct with indexed fields)
|
|
212
|
+
elif type_name == "tuple":
|
|
213
|
+
items = structure_type.get("items", [])
|
|
214
|
+
fields = []
|
|
215
|
+
for i, item in enumerate(items):
|
|
216
|
+
fields.append(NestedField(
|
|
217
|
+
field_id=self.get_id(),
|
|
218
|
+
name=f"field_{i}",
|
|
219
|
+
field_type=self.convert_structure_type_to_iceberg_type(item),
|
|
220
|
+
required=True
|
|
221
|
+
))
|
|
222
|
+
return StructType(*fields)
|
|
223
|
+
|
|
224
|
+
# Handle object type
|
|
225
|
+
elif type_name == "object":
|
|
226
|
+
properties = structure_type.get("properties", {})
|
|
227
|
+
required = structure_type.get("required", [])
|
|
228
|
+
fields = []
|
|
229
|
+
|
|
230
|
+
# Handle $extends if present
|
|
231
|
+
if "$extends" in structure_type:
|
|
232
|
+
extends_ref = structure_type["$extends"]
|
|
233
|
+
base_schema = self.resolve_ref(extends_ref)
|
|
234
|
+
base_properties = base_schema.get("properties", {})
|
|
235
|
+
base_required = base_schema.get("required", [])
|
|
236
|
+
|
|
237
|
+
# Add base properties first
|
|
238
|
+
for prop_name, prop_schema in base_properties.items():
|
|
239
|
+
is_required = prop_name in base_required
|
|
240
|
+
fields.append(NestedField(
|
|
241
|
+
field_id=self.get_id(),
|
|
242
|
+
name=prop_name,
|
|
243
|
+
field_type=self.convert_structure_type_to_iceberg_type(prop_schema),
|
|
244
|
+
required=is_required
|
|
245
|
+
))
|
|
246
|
+
|
|
247
|
+
# Add own properties
|
|
248
|
+
for prop_name, prop_schema in properties.items():
|
|
249
|
+
is_required = prop_name in required
|
|
250
|
+
fields.append(NestedField(
|
|
251
|
+
field_id=self.get_id(),
|
|
252
|
+
name=prop_name,
|
|
253
|
+
field_type=self.convert_structure_type_to_iceberg_type(prop_schema),
|
|
254
|
+
required=is_required
|
|
255
|
+
))
|
|
256
|
+
|
|
257
|
+
return StructType(*fields)
|
|
258
|
+
|
|
259
|
+
# Handle choice type (union)
|
|
260
|
+
elif type_name == "choice":
|
|
261
|
+
choices = structure_type.get("choices", [])
|
|
262
|
+
if isinstance(choices, list):
|
|
263
|
+
# For inline choices, create a struct with alternatives
|
|
264
|
+
fields = []
|
|
265
|
+
for i, choice in enumerate(choices):
|
|
266
|
+
choice_type = self.convert_structure_type_to_iceberg_type(choice)
|
|
267
|
+
fields.append(NestedField(
|
|
268
|
+
field_id=self.get_id(),
|
|
269
|
+
name=f"option_{i}",
|
|
270
|
+
field_type=choice_type,
|
|
271
|
+
required=False
|
|
272
|
+
))
|
|
273
|
+
return StructType(*fields)
|
|
274
|
+
elif isinstance(choices, dict):
|
|
275
|
+
# For tagged choices, create a struct with named alternatives
|
|
276
|
+
fields = []
|
|
277
|
+
for choice_name, choice_schema in choices.items():
|
|
278
|
+
choice_type = self.convert_structure_type_to_iceberg_type(choice_schema)
|
|
279
|
+
fields.append(NestedField(
|
|
280
|
+
field_id=self.get_id(),
|
|
281
|
+
name=choice_name,
|
|
282
|
+
field_type=choice_type,
|
|
283
|
+
required=False
|
|
284
|
+
))
|
|
285
|
+
return StructType(*fields)
|
|
286
|
+
else:
|
|
287
|
+
return StringType()
|
|
288
|
+
|
|
289
|
+
# Handle any type
|
|
290
|
+
elif type_name == "any":
|
|
291
|
+
return StringType()
|
|
292
|
+
|
|
293
|
+
# Handle primitive types with annotations
|
|
294
|
+
elif type_name:
|
|
295
|
+
return self.map_iceberg_scalar_type(type_name, structure_type)
|
|
296
|
+
|
|
297
|
+
# Handle string type name directly
|
|
298
|
+
elif isinstance(structure_type, str):
|
|
299
|
+
return self.map_iceberg_scalar_type(structure_type, {})
|
|
300
|
+
|
|
301
|
+
return StringType()
|
|
302
|
+
|
|
303
|
+
def map_iceberg_scalar_type(self, type_name: str, type_schema: Dict[str, Any]):
|
|
304
|
+
"""Map a JSON Structure scalar type to an Iceberg scalar type."""
|
|
305
|
+
# Check for decimal with precision and scale
|
|
306
|
+
if type_name == "decimal":
|
|
307
|
+
precision = type_schema.get("precision", 38)
|
|
308
|
+
scale = type_schema.get("scale", 18)
|
|
309
|
+
return DecimalType(precision, scale)
|
|
310
|
+
|
|
311
|
+
# Map other primitive types
|
|
312
|
+
type_mapping = {
|
|
313
|
+
'null': StringType(), # Iceberg doesn't have a null type
|
|
314
|
+
'boolean': BooleanType(),
|
|
315
|
+
'string': StringType(),
|
|
316
|
+
'int8': IntegerType(), # Iceberg doesn't have byte type
|
|
317
|
+
'uint8': IntegerType(),
|
|
318
|
+
'int16': IntegerType(), # Iceberg doesn't have short type
|
|
319
|
+
'uint16': IntegerType(),
|
|
320
|
+
'int32': IntegerType(),
|
|
321
|
+
'uint32': LongType(), # Use long for unsigned int32
|
|
322
|
+
'int64': LongType(),
|
|
323
|
+
'uint64': LongType(), # Iceberg doesn't distinguish signed/unsigned
|
|
324
|
+
'int128': StringType(), # No native 128-bit support
|
|
325
|
+
'uint128': StringType(),
|
|
326
|
+
'integer': IntegerType(), # Generic integer
|
|
327
|
+
'number': DoubleType(), # Generic number
|
|
328
|
+
'float8': FloatType(),
|
|
329
|
+
'float': FloatType(),
|
|
330
|
+
'float32': FloatType(),
|
|
331
|
+
'binary32': FloatType(),
|
|
332
|
+
'double': DoubleType(),
|
|
333
|
+
'float64': DoubleType(),
|
|
334
|
+
'binary64': DoubleType(),
|
|
335
|
+
'decimal': DecimalType(38, 18),
|
|
336
|
+
'binary': BinaryType(),
|
|
337
|
+
'bytes': BinaryType(), # Binary data
|
|
338
|
+
'date': DateType(),
|
|
339
|
+
'time': TimeType(),
|
|
340
|
+
'datetime': TimestampType(),
|
|
341
|
+
'timestamp': TimestampType(),
|
|
342
|
+
'duration': LongType(), # Store as microseconds
|
|
343
|
+
'uuid': StringType(), # Store UUID as string
|
|
344
|
+
'uri': StringType(),
|
|
345
|
+
'jsonpointer': StringType(),
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
return type_mapping.get(type_name, StringType())
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def convert_structure_to_iceberg(structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns=False):
    """Convert a JSON Structure schema to an Iceberg schema.

    Thin module-level wrapper around StructureToIcebergConverter.
    """
    StructureToIcebergConverter().convert_structure_to_iceberg(
        structure_schema_path,
        structure_record_type,
        output_path,
        emit_cloudevents_columns,
    )
|