dlt_iceberg-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dlt-iceberg might be problematic. Click here for more details.
- dlt_iceberg/__init__.py +28 -0
- dlt_iceberg/destination.py +400 -0
- dlt_iceberg/destination_client.py +606 -0
- dlt_iceberg/error_handling.py +224 -0
- dlt_iceberg/partition_builder.py +308 -0
- dlt_iceberg/schema_casting.py +381 -0
- dlt_iceberg/schema_converter.py +207 -0
- dlt_iceberg/schema_evolution.py +261 -0
- dlt_iceberg-0.1.1.dist-info/METADATA +15 -0
- dlt_iceberg-0.1.1.dist-info/RECORD +12 -0
- dlt_iceberg-0.1.1.dist-info/WHEEL +4 -0
- dlt_iceberg-0.1.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schema evolution support for Apache Iceberg tables.
|
|
3
|
+
|
|
4
|
+
This module handles comparing schemas and applying safe schema changes:
|
|
5
|
+
- Adding new columns
|
|
6
|
+
- Promoting types (int→long, float→double)
|
|
7
|
+
- Detecting unsafe changes (dropping columns, narrowing types)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Set, List, Dict, Tuple, Optional
|
|
12
|
+
from pyiceberg.schema import Schema
|
|
13
|
+
from pyiceberg.types import (
|
|
14
|
+
IcebergType,
|
|
15
|
+
BooleanType,
|
|
16
|
+
IntegerType,
|
|
17
|
+
LongType,
|
|
18
|
+
FloatType,
|
|
19
|
+
DoubleType,
|
|
20
|
+
DecimalType,
|
|
21
|
+
StringType,
|
|
22
|
+
BinaryType,
|
|
23
|
+
TimestampType,
|
|
24
|
+
DateType,
|
|
25
|
+
TimeType,
|
|
26
|
+
ListType,
|
|
27
|
+
MapType,
|
|
28
|
+
StructType,
|
|
29
|
+
NestedField,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SchemaEvolutionError(Exception):
    """Raised when an unsafe schema change is detected."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def can_promote_type(from_type: IcebergType, to_type: IcebergType) -> bool:
    """
    Check if one Iceberg type can be safely promoted to another.

    Safe promotions (following Iceberg spec):
    - IntegerType → LongType
    - FloatType → DoubleType
    - DecimalType(P, S) → DecimalType(P', S) where P' >= P (scale unchanged)

    Two values of the same type class are otherwise treated as compatible;
    parameters of nested types (list/map/struct members) are not compared here.

    Args:
        from_type: Current type in existing schema
        to_type: New type in incoming schema

    Returns:
        True if promotion is safe, False otherwise
    """
    if type(from_type) is type(to_type):
        if isinstance(from_type, DecimalType):
            # Only the precision may grow. A different scale would change how
            # the already-written unscaled values are interpreted, so it is
            # never a safe in-place promotion.
            return (
                to_type.scale == from_type.scale
                and to_type.precision >= from_type.precision
            )
        # Same type class is considered safe
        return True

    # Integer to Long
    if isinstance(from_type, IntegerType) and isinstance(to_type, LongType):
        return True

    # Float to Double
    if isinstance(from_type, FloatType) and isinstance(to_type, DoubleType):
        return True

    # No other promotions are safe
    return False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def compare_schemas(
    existing_schema: Schema,
    new_schema: Schema
) -> Tuple[List[NestedField], List[Tuple[str, IcebergType, IcebergType]], List[str]]:
    """
    Compare two Iceberg schemas and identify differences.

    Only top-level fields are compared, and they are matched by name.
    Results are returned in schema field order so that repeated runs over the
    same inputs produce the same output (and new columns are added in the
    order they appear in the incoming schema).

    Args:
        existing_schema: Current schema of the table
        new_schema: New schema from incoming data

    Returns:
        Tuple of:
        - List of new fields to add (in new_schema order)
        - List of (field_name, old_type, new_type) for type changes
          (in existing_schema order)
        - List of field names that were dropped (present in existing but not
          in new, in existing_schema order)
    """
    # Build name->field mappings; dicts keep the schemas' field order
    existing_fields = {field.name: field for field in existing_schema.fields}
    new_fields = {field.name: field for field in new_schema.fields}

    # Find added columns
    added_fields = [
        field for name, field in new_fields.items()
        if name not in existing_fields
    ]

    # Find dropped columns
    dropped_names = [name for name in existing_fields if name not in new_fields]

    # Find type changes in common columns
    type_changes = []
    for name, existing_field in existing_fields.items():
        new_field = new_fields.get(name)
        if new_field is None:
            continue

        old_type = existing_field.field_type
        new_type = new_field.field_type

        if type(old_type) is not type(new_type):
            # Types differ - could be promotion or unsafe change
            type_changes.append((name, old_type, new_type))
        elif isinstance(old_type, DecimalType):
            # Same class, so both are decimals: compare precision/scale
            if (old_type.precision != new_type.precision or
                    old_type.scale != new_type.scale):
                type_changes.append((name, old_type, new_type))

    return added_fields, type_changes, dropped_names
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def validate_schema_changes(
    added_fields: List[NestedField],
    type_changes: List[Tuple[str, IcebergType, IcebergType]],
    dropped_fields: List[str],
    allow_column_drops: bool = False
) -> None:
    """
    Validate that schema changes are safe to apply.

    All problems are collected and reported together in a single error.

    Args:
        added_fields: Fields being added
        type_changes: Type changes being made
        dropped_fields: Fields being dropped
        allow_column_drops: If False, raises error on dropped columns

    Raises:
        SchemaEvolutionError: If any unsafe changes are detected
    """
    errors = []

    # Check added columns: a required column has no value for rows that are
    # already in the table, so it cannot be added safely.
    required_added = [field.name for field in added_fields if field.required]
    if required_added:
        errors.append(
            f"Required columns added (not safe): {', '.join(required_added)}. "
            "New columns must be optional because existing rows have no value for them."
        )

    # Check dropped columns
    if dropped_fields and not allow_column_drops:
        errors.append(
            f"Columns dropped (not safe): {', '.join(dropped_fields)}. "
            "Dropping columns is not supported by default to prevent data loss."
        )

    # Check type changes
    for field_name, old_type, new_type in type_changes:
        if not can_promote_type(old_type, new_type):
            errors.append(
                f"Unsafe type change for column '{field_name}': "
                f"{old_type} → {new_type}. "
                "Only safe promotions are allowed (int→long, float→double, decimal widening)."
            )

    if errors:
        raise SchemaEvolutionError(
            "Schema evolution validation failed:\n" + "\n".join(f"  - {e}" for e in errors)
        )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def apply_schema_evolution(
    table,
    added_fields: List[NestedField],
    type_changes: List[Tuple[str, IcebergType, IcebergType]]
) -> None:
    """
    Apply schema evolution changes to an Iceberg table.

    No validation is performed here; callers are expected to have checked the
    changes (see validate_schema_changes) before applying them.

    Args:
        table: PyIceberg table instance
        added_fields: New fields to add
        type_changes: Type promotions to apply, as
            (field_name, old_type, new_type) tuples
    """
    if not added_fields and not type_changes:
        logger.info("No schema changes to apply")
        return

    logger.info(
        f"Applying schema evolution: "
        f"{len(added_fields)} new columns, {len(type_changes)} type promotions"
    )

    # Stage every change on a single update_schema context so that the
    # additions and promotions are submitted together.
    with table.update_schema() as update:
        # Add new columns
        for field in added_fields:
            logger.info(f"  Adding column: {field.name} ({field.field_type})")
            update.add_column(
                path=field.name,
                field_type=field.field_type,
                required=field.required,
                doc=field.doc
            )

        # Apply type promotions
        for field_name, old_type, new_type in type_changes:
            logger.info(f"  Promoting column '{field_name}': {old_type} → {new_type}")
            update.update_column(
                path=field_name,
                field_type=new_type
            )

    logger.info("Schema evolution applied successfully")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def evolve_schema_if_needed(
    table,
    new_schema: Schema,
    allow_column_drops: bool = False
) -> bool:
    """
    Compare table schema with new schema and apply evolution if needed.

    This is the main entry point for schema evolution logic.

    Dropped columns are only detected and validated; they are never removed
    from the table. When allowed drops are the only difference, the table
    schema is left untouched and False is returned.

    Args:
        table: PyIceberg table instance
        new_schema: New schema from incoming data
        allow_column_drops: Whether to allow dropping columns

    Returns:
        True if schema was evolved, False if no changes needed

    Raises:
        SchemaEvolutionError: If unsafe schema changes are detected
    """
    existing_schema = table.schema()

    # Compare schemas
    added_fields, type_changes, dropped_fields = compare_schemas(
        existing_schema, new_schema
    )

    # Log what we found
    if added_fields:
        logger.info(
            f"Detected {len(added_fields)} new columns: {[f.name for f in added_fields]}"
        )
    if type_changes:
        described_changes = [
            (name, str(old), str(new)) for name, old, new in type_changes
        ]
        logger.info(f"Detected {len(type_changes)} type changes: {described_changes}")
    if dropped_fields:
        logger.warning(f"Detected {len(dropped_fields)} dropped columns: {dropped_fields}")

    # If no changes, nothing to do
    if not added_fields and not type_changes and not dropped_fields:
        logger.debug("No schema changes detected")
        return False

    # Validate changes are safe
    validate_schema_changes(added_fields, type_changes, dropped_fields, allow_column_drops)

    # Allowed drops alone do not modify the table, so do not report an
    # evolution that never happened.
    if not added_fields and not type_changes:
        logger.info("Only dropped columns detected; table schema left unchanged")
        return False

    # Apply evolution
    apply_schema_evolution(table, added_fields, type_changes)

    return True
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dlt-iceberg
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: dlt custom destination for Apache Iceberg with REST catalog support
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: boto3>=1.40.50
|
|
8
|
+
Requires-Dist: dlt>=1.17.1
|
|
9
|
+
Requires-Dist: pandas>=2.3.3
|
|
10
|
+
Requires-Dist: pyarrow>=21.0.0
|
|
11
|
+
Requires-Dist: pydantic<2.11
|
|
12
|
+
Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
|
|
13
|
+
Requires-Dist: requests>=2.32.5
|
|
14
|
+
Requires-Dist: s3fs>=0.4.2
|
|
15
|
+
Requires-Dist: sqlalchemy>=2.0.44
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
dlt_iceberg/__init__.py,sha256=ONy6E-sGcCvvqia8_fGaYp8da4n4wdjox9W42tmQPK0,780
|
|
2
|
+
dlt_iceberg/destination.py,sha256=F8QJXsQeosOA32Xm1140DL485WQmxbuhiA2QZ6zpVSU,15737
|
|
3
|
+
dlt_iceberg/destination_client.py,sha256=dyJtHHy2Ow0GIFVj17LePC76rKw6MiVJnrS-y28OctQ,22341
|
|
4
|
+
dlt_iceberg/error_handling.py,sha256=k6Kkldi9BDRsXQ63VEBMMSw1xx2-b1BMjsgRFKI2iB0,7852
|
|
5
|
+
dlt_iceberg/partition_builder.py,sha256=l9YNAh2t6gk2xqsPSOs8ymTDLk9BOEZWVOtVni7ONNU,10081
|
|
6
|
+
dlt_iceberg/schema_casting.py,sha256=Qn4sarRnyJM04lKvKonEjvlvVdizUOGI65J_AmzbEAs,12997
|
|
7
|
+
dlt_iceberg/schema_converter.py,sha256=e_eqXQz2cpABOGEAxVwcGbiOdVmv9kaZanRnU83lzXk,5619
|
|
8
|
+
dlt_iceberg/schema_evolution.py,sha256=ieOkCA9ngQdJ5lbZLYQ09deTLZEW8whxDn2arpoH-aM,8326
|
|
9
|
+
dlt_iceberg-0.1.1.dist-info/METADATA,sha256=hhtEkMwpG_rQBUULTeyoMsSevGGEIhCqOjTJJgCw8qY,466
|
|
10
|
+
dlt_iceberg-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
11
|
+
dlt_iceberg-0.1.1.dist-info/licenses/LICENSE,sha256=0amGlcH0msYju3WUhlsuUxO4aj3ZODkkIZ0MKOq9fQ4,1066
|
|
12
|
+
dlt_iceberg-0.1.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sidequery
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|