dlt-iceberg 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dlt-iceberg might be problematic.

@@ -0,0 +1,261 @@
+ """
+ Schema evolution support for Apache Iceberg tables.
+
+ This module handles comparing schemas and applying safe schema changes:
+ - Adding new columns
+ - Promoting types (int→long, float→double)
+ - Detecting unsafe changes (dropping columns, narrowing types)
+ """
+
+ import logging
+ from typing import Set, List, Dict, Tuple, Optional
+ from pyiceberg.schema import Schema
+ from pyiceberg.types import (
+     IcebergType,
+     BooleanType,
+     IntegerType,
+     LongType,
+     FloatType,
+     DoubleType,
+     DecimalType,
+     StringType,
+     BinaryType,
+     TimestampType,
+     DateType,
+     TimeType,
+     ListType,
+     MapType,
+     StructType,
+     NestedField,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class SchemaEvolutionError(Exception):
+     """Raised when an unsafe schema change is detected."""
+     pass
+
+
+ def can_promote_type(from_type: IcebergType, to_type: IcebergType) -> bool:
+     """
+     Check if one Iceberg type can be safely promoted to another.
+
+     Safe promotions (following Iceberg spec):
+     - IntegerType → LongType
+     - FloatType → DoubleType
+     - DecimalType → DecimalType (with wider precision/scale)
+
+     Args:
+         from_type: Current type in existing schema
+         to_type: New type in incoming schema
+
+     Returns:
+         True if promotion is safe, False otherwise
+     """
+     # Same type is always safe
+     if type(from_type) == type(to_type):
+         # For decimals, check precision/scale
+         if isinstance(from_type, DecimalType) and isinstance(to_type, DecimalType):
+             # Can widen precision and scale
+             return (to_type.precision >= from_type.precision and
+                     to_type.scale >= from_type.scale)
+         return True
+
+     # Integer to Long
+     if isinstance(from_type, IntegerType) and isinstance(to_type, LongType):
+         return True
+
+     # Float to Double
+     if isinstance(from_type, FloatType) and isinstance(to_type, DoubleType):
+         return True
+
+     # No other promotions are safe
+     return False
+
+
+ def compare_schemas(
+     existing_schema: Schema,
+     new_schema: Schema
+ ) -> Tuple[List[NestedField], List[Tuple[str, IcebergType, IcebergType]], List[str]]:
+     """
+     Compare two Iceberg schemas and identify differences.
+
+     Args:
+         existing_schema: Current schema of the table
+         new_schema: New schema from incoming data
+
+     Returns:
+         Tuple of:
+         - List of new fields to add
+         - List of (field_name, old_type, new_type) for type changes
+         - List of field names that were dropped (present in existing but not in new)
+     """
+     # Build name->field mappings
+     existing_fields = {field.name: field for field in existing_schema.fields}
+     new_fields = {field.name: field for field in new_schema.fields}
+
+     existing_names = set(existing_fields.keys())
+     new_names = set(new_fields.keys())
+
+     # Find added columns
+     added_names = new_names - existing_names
+     added_fields = [new_fields[name] for name in added_names]
+
+     # Find dropped columns
+     dropped_names = existing_names - new_names
+
+     # Find type changes in common columns
+     common_names = existing_names & new_names
+     type_changes = []
+
+     for name in common_names:
+         existing_field = existing_fields[name]
+         new_field = new_fields[name]
+
+         if type(existing_field.field_type) != type(new_field.field_type):
+             # Types differ - could be promotion or unsafe change
+             type_changes.append((name, existing_field.field_type, new_field.field_type))
+         elif isinstance(existing_field.field_type, DecimalType) and isinstance(new_field.field_type, DecimalType):
+             # Check decimal precision/scale changes
+             if (existing_field.field_type.precision != new_field.field_type.precision or
+                     existing_field.field_type.scale != new_field.field_type.scale):
+                 type_changes.append((name, existing_field.field_type, new_field.field_type))
+
+     return added_fields, type_changes, list(dropped_names)
+
+
+ def validate_schema_changes(
+     added_fields: List[NestedField],
+     type_changes: List[Tuple[str, IcebergType, IcebergType]],
+     dropped_fields: List[str],
+     allow_column_drops: bool = False
+ ) -> None:
+     """
+     Validate that schema changes are safe to apply.
+
+     Raises SchemaEvolutionError if any unsafe changes are detected.
+
+     Args:
+         added_fields: Fields being added
+         type_changes: Type changes being made
+         dropped_fields: Fields being dropped
+         allow_column_drops: If False, raises error on dropped columns
+     """
+     errors = []
+
+     # Check dropped columns
+     if dropped_fields and not allow_column_drops:
+         errors.append(
+             f"Columns dropped (not safe): {', '.join(dropped_fields)}. "
+             f"Dropping columns is not supported by default to prevent data loss."
+         )
+
+     # Check type changes
+     for field_name, old_type, new_type in type_changes:
+         if not can_promote_type(old_type, new_type):
+             errors.append(
+                 f"Unsafe type change for column '{field_name}': "
+                 f"{old_type} → {new_type}. "
+                 f"Only safe promotions are allowed (int→long, float→double, decimal widening)."
+             )
+
+     if errors:
+         raise SchemaEvolutionError(
+             "Schema evolution validation failed:\n" + "\n".join(f" - {e}" for e in errors)
+         )
+
+
+ def apply_schema_evolution(
+     table,
+     added_fields: List[NestedField],
+     type_changes: List[Tuple[str, IcebergType, IcebergType]]
+ ) -> None:
+     """
+     Apply schema evolution changes to an Iceberg table.
+
+     Args:
+         table: PyIceberg table instance
+         added_fields: New fields to add
+         type_changes: Type promotions to apply
+     """
+     if not added_fields and not type_changes:
+         logger.info("No schema changes to apply")
+         return
+
+     logger.info(
+         f"Applying schema evolution: "
+         f"{len(added_fields)} new columns, {len(type_changes)} type promotions"
+     )
+
+     # Apply changes using update_schema transaction
+     with table.update_schema() as update:
+         # Add new columns
+         for field in added_fields:
+             logger.info(f" Adding column: {field.name} ({field.field_type})")
+             update.add_column(
+                 path=field.name,
+                 field_type=field.field_type,
+                 required=field.required,
+                 doc=field.doc
+             )
+
+         # Apply type promotions
+         for field_name, old_type, new_type in type_changes:
+             logger.info(f" Promoting column '{field_name}': {old_type} → {new_type}")
+             update.update_column(
+                 path=field_name,
+                 field_type=new_type
+             )
+
+     logger.info("Schema evolution applied successfully")
+
+
+ def evolve_schema_if_needed(
+     table,
+     new_schema: Schema,
+     allow_column_drops: bool = False
+ ) -> bool:
+     """
+     Compare table schema with new schema and apply evolution if needed.
+
+     This is the main entry point for schema evolution logic.
+
+     Args:
+         table: PyIceberg table instance
+         new_schema: New schema from incoming data
+         allow_column_drops: Whether to allow dropping columns
+
+     Returns:
+         True if schema was evolved, False if no changes needed
+
+     Raises:
+         SchemaEvolutionError: If unsafe schema changes are detected
+     """
+     existing_schema = table.schema()
+
+     # Compare schemas
+     added_fields, type_changes, dropped_fields = compare_schemas(
+         existing_schema, new_schema
+     )
+
+     # Log what we found
+     if added_fields:
+         logger.info(f"Detected {len(added_fields)} new columns: {[f.name for f in added_fields]}")
+     if type_changes:
+         logger.info(f"Detected {len(type_changes)} type changes: {[(name, str(old), str(new)) for name, old, new in type_changes]}")
+     if dropped_fields:
+         logger.warning(f"Detected {len(dropped_fields)} dropped columns: {dropped_fields}")
+
+     # If no changes, nothing to do
+     if not added_fields and not type_changes and not dropped_fields:
+         logger.debug("No schema changes detected")
+         return False
+
+     # Validate changes are safe
+     validate_schema_changes(added_fields, type_changes, dropped_fields, allow_column_drops)
+
+     # Apply evolution
+     apply_schema_evolution(table, added_fields, type_changes)
+
+     return True
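
For orientation, the sketch below shows one way the module's entry point, evolve_schema_if_needed, might be called before appending data to a table. It is illustrative only and not part of the wheel: the catalog name, table identifier, and the incoming schema are assumptions made for the example.

# Illustrative usage sketch; the catalog name, table identifier, and the
# incoming schema below are assumptions, not part of dlt-iceberg itself.
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

from dlt_iceberg.schema_evolution import SchemaEvolutionError, evolve_schema_if_needed

# Hypothetical incoming schema: "id" was promoted int -> long and "country" is new.
new_schema = Schema(
    NestedField(1, "id", LongType(), required=False),
    NestedField(2, "name", StringType(), required=False),
    NestedField(3, "country", StringType(), required=False),
)

catalog = load_catalog("rest_catalog")           # assumes a configured REST catalog
table = catalog.load_table("analytics.events")   # assumes an existing table

try:
    if evolve_schema_if_needed(table, new_schema, allow_column_drops=False):
        print("Table schema evolved before load")
except SchemaEvolutionError as err:
    # Dropped columns and narrowing type changes are rejected rather than applied.
    print(f"Unsafe schema change detected: {err}")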
@@ -0,0 +1,15 @@
+ Metadata-Version: 2.4
+ Name: dlt-iceberg
+ Version: 0.1.1
+ Summary: dlt custom destination for Apache Iceberg with REST catalog support
+ License-File: LICENSE
+ Requires-Python: >=3.11
+ Requires-Dist: boto3>=1.40.50
+ Requires-Dist: dlt>=1.17.1
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: pyarrow>=21.0.0
+ Requires-Dist: pydantic<2.11
+ Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
+ Requires-Dist: requests>=2.32.5
+ Requires-Dist: s3fs>=0.4.2
+ Requires-Dist: sqlalchemy>=2.0.44
@@ -0,0 +1,12 @@
+ dlt_iceberg/__init__.py,sha256=ONy6E-sGcCvvqia8_fGaYp8da4n4wdjox9W42tmQPK0,780
+ dlt_iceberg/destination.py,sha256=F8QJXsQeosOA32Xm1140DL485WQmxbuhiA2QZ6zpVSU,15737
+ dlt_iceberg/destination_client.py,sha256=dyJtHHy2Ow0GIFVj17LePC76rKw6MiVJnrS-y28OctQ,22341
+ dlt_iceberg/error_handling.py,sha256=k6Kkldi9BDRsXQ63VEBMMSw1xx2-b1BMjsgRFKI2iB0,7852
+ dlt_iceberg/partition_builder.py,sha256=l9YNAh2t6gk2xqsPSOs8ymTDLk9BOEZWVOtVni7ONNU,10081
+ dlt_iceberg/schema_casting.py,sha256=Qn4sarRnyJM04lKvKonEjvlvVdizUOGI65J_AmzbEAs,12997
+ dlt_iceberg/schema_converter.py,sha256=e_eqXQz2cpABOGEAxVwcGbiOdVmv9kaZanRnU83lzXk,5619
+ dlt_iceberg/schema_evolution.py,sha256=ieOkCA9ngQdJ5lbZLYQ09deTLZEW8whxDn2arpoH-aM,8326
+ dlt_iceberg-0.1.1.dist-info/METADATA,sha256=hhtEkMwpG_rQBUULTeyoMsSevGGEIhCqOjTJJgCw8qY,466
+ dlt_iceberg-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dlt_iceberg-0.1.1.dist-info/licenses/LICENSE,sha256=0amGlcH0msYju3WUhlsuUxO4aj3ZODkkIZ0MKOq9fQ4,1066
+ dlt_iceberg-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Sidequery
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.