dlt_iceberg-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,381 @@
+ """
+ Defensive schema casting for PyArrow tables.
+
+ This module provides safe casting operations that detect potential data loss
+ and allow users to control casting behavior.
+ """
+
+ import logging
+ from typing import List, Optional, Tuple
+ import pyarrow as pa
+
+ logger = logging.getLogger(__name__)
+
+
+ class CastingError(Exception):
+     """Raised when a cast would result in data loss in strict mode."""
+
+
+ class CastValidationResult:
+     """Result of validating a cast operation."""
+
+     def __init__(self):
+         self.safe = True
+         self.warnings: List[str] = []
+         self.errors: List[str] = []
+
+     def add_warning(self, message: str):
+         """Add a warning about the cast."""
+         self.warnings.append(message)
+         logger.warning(f"Cast warning: {message}")
+
+     def add_error(self, message: str):
+         """Add an error about the cast."""
+         self.errors.append(message)
+         self.safe = False
+         logger.error(f"Cast error: {message}")
+
+     def is_safe(self) -> bool:
+         """Check if the cast is safe."""
+         return self.safe and len(self.errors) == 0
+
+
+ def _check_timestamp_cast(
+     source_type: pa.DataType,
+     target_type: pa.DataType,
+     field_name: str,
+     result: CastValidationResult
+ ) -> None:
+     """
+     Check if a timestamp cast is safe.
+
+     Validates:
+     - Timezone conversions (naive → aware, aware → naive, aware → different aware)
+     - Unit conversions (ns → us → ms → s)
+     """
+     if not pa.types.is_timestamp(source_type) or not pa.types.is_timestamp(target_type):
+         return
+
+     source_tz = source_type.tz
+     target_tz = target_type.tz
+
+     # Check timezone conversions
+     if source_tz is None and target_tz is not None:
+         # Naive to aware - this requires assuming UTC, which may not be correct
+         result.add_warning(
+             f"Field '{field_name}': Converting timezone-naive timestamp to "
+             f"timezone-aware ({target_tz}). Values will be interpreted as UTC."
+         )
+     elif source_tz is not None and target_tz is None:
+         # Aware to naive - loses timezone information
+         result.add_error(
+             f"Field '{field_name}': Converting timezone-aware ({source_tz}) timestamp to "
+             f"timezone-naive loses timezone information"
+         )
+     elif source_tz is not None and target_tz is not None and source_tz != target_tz:
+         # Aware to a different timezone - this is safe (conversion happens)
+         result.add_warning(
+             f"Field '{field_name}': Converting timestamp from {source_tz} to {target_tz}"
+         )
+
+     # Check unit conversions (precision loss)
+     source_unit = source_type.unit
+     target_unit = target_type.unit
+
+     # Units in order of precision: ns > us > ms > s
+     unit_precision = {'ns': 4, 'us': 3, 'ms': 2, 's': 1}
+
+     source_prec = unit_precision.get(source_unit, 0)
+     target_prec = unit_precision.get(target_unit, 0)
+
+     if source_prec > target_prec:
+         result.add_error(
+             f"Field '{field_name}': Converting timestamp from {source_unit} to {target_unit} "
+             f"loses precision"
+         )
+
+
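To make the rules above concrete, a minimal sketch of how this helper classifies two timestamp casts (the field name "created_at" is illustrative; only pyarrow and the definitions above are assumed):

import pyarrow as pa

# Dropping the timezone and reducing ns -> us precision: both are flagged as errors.
lossy = CastValidationResult()
_check_timestamp_cast(
    pa.timestamp("ns", tz="UTC"),  # source: timezone-aware nanoseconds
    pa.timestamp("us"),            # target: naive microseconds
    "created_at",
    lossy,
)
assert not lossy.is_safe()

# Widening ms -> us while keeping the same timezone: no warnings or errors.
ok = CastValidationResult()
_check_timestamp_cast(
    pa.timestamp("ms", tz="UTC"),
    pa.timestamp("us", tz="UTC"),
    "created_at",
    ok,
)
assert ok.is_safe() and not ok.warnings
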
+ def _check_numeric_cast(
+     source_type: pa.DataType,
+     target_type: pa.DataType,
+     field_name: str,
+     result: CastValidationResult
+ ) -> None:
+     """
+     Check if numeric cast is safe.
+
+     Validates:
+     - Integer downcasting (int64 → int32, int32 → int16, etc.)
+     - Float to integer conversion (always unsafe due to truncation)
+     - Decimal precision/scale changes
+     - Unsigned to signed conversions
+     """
+     # Float to integer
+     if pa.types.is_floating(source_type) and pa.types.is_integer(target_type):
+         result.add_error(
+             f"Field '{field_name}': Converting float to integer truncates decimal values"
+         )
+         return
+
+     # Integer bit size reductions
+     source_bits = _get_integer_bits(source_type)
+     target_bits = _get_integer_bits(target_type)
+
+     if source_bits and target_bits and source_bits > target_bits:
+         result.add_error(
+             f"Field '{field_name}': Converting {source_bits}-bit integer to {target_bits}-bit "
+             f"may overflow"
+         )
+
+     # Unsigned to signed conversions (can overflow)
+     if _is_unsigned_int(source_type) and _is_signed_int(target_type):
+         result.add_warning(
+             f"Field '{field_name}': Converting unsigned to signed integer may overflow "
+             f"for large values"
+         )
+
+     # Decimal precision/scale changes
+     if pa.types.is_decimal(source_type) and pa.types.is_decimal(target_type):
+         if source_type.precision > target_type.precision:
+             result.add_error(
+                 f"Field '{field_name}': Converting decimal({source_type.precision}, "
+                 f"{source_type.scale}) to decimal({target_type.precision}, "
+                 f"{target_type.scale}) may lose precision"
+             )
+         if source_type.scale > target_type.scale:
+             result.add_error(
+                 f"Field '{field_name}': Converting decimal scale from {source_type.scale} "
+                 f"to {target_type.scale} truncates decimal places"
+             )
+
+
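A similar sketch for the numeric rules (the field name "amount" is illustrative):

import pyarrow as pa

res = CastValidationResult()

# float64 -> int64 and int64 -> int32 are both reported as errors ...
_check_numeric_cast(pa.float64(), pa.int64(), "amount", res)
_check_numeric_cast(pa.int64(), pa.int32(), "amount", res)
assert len(res.errors) == 2

# ... while uint32 -> int32 (same bit width, unsigned to signed) only warns.
res2 = CastValidationResult()
_check_numeric_cast(pa.uint32(), pa.int32(), "amount", res2)
assert res2.is_safe() and len(res2.warnings) == 1
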
+ def _get_integer_bits(dtype: pa.DataType) -> Optional[int]:
+     """Get bit width of integer type."""
+     if pa.types.is_int8(dtype) or pa.types.is_uint8(dtype):
+         return 8
+     elif pa.types.is_int16(dtype) or pa.types.is_uint16(dtype):
+         return 16
+     elif pa.types.is_int32(dtype) or pa.types.is_uint32(dtype):
+         return 32
+     elif pa.types.is_int64(dtype) or pa.types.is_uint64(dtype):
+         return 64
+     return None
+
+
+ def _is_unsigned_int(dtype: pa.DataType) -> bool:
+     """Check if type is unsigned integer."""
+     return (pa.types.is_uint8(dtype) or pa.types.is_uint16(dtype) or
+             pa.types.is_uint32(dtype) or pa.types.is_uint64(dtype))
+
+
+ def _is_signed_int(dtype: pa.DataType) -> bool:
+     """Check if type is signed integer."""
+     return (pa.types.is_int8(dtype) or pa.types.is_int16(dtype) or
+             pa.types.is_int32(dtype) or pa.types.is_int64(dtype))
+
+
+ def _check_string_cast(
+     source_type: pa.DataType,
+     target_type: pa.DataType,
+     field_name: str,
+     result: CastValidationResult
+ ) -> None:
+     """
+     Check if string/binary cast is safe.
+
+     Validates:
+     - Binary to string conversion (may not be valid UTF-8)
+     - String to non-string conversion (except binary/large_string/large_binary)
+     """
+     if pa.types.is_binary(source_type) and pa.types.is_string(target_type):
+         result.add_warning(
+             f"Field '{field_name}': Converting binary to string assumes valid UTF-8 encoding"
+         )
+
+     # Check if converting string to something else
+     if pa.types.is_string(source_type) or pa.types.is_large_string(source_type):
+         # These conversions are safe
+         if (pa.types.is_string(target_type) or
+                 pa.types.is_large_string(target_type) or
+                 pa.types.is_binary(target_type) or
+                 pa.types.is_large_binary(target_type)):
+             return  # Safe conversion
+
+         # All other conversions are unsafe
+         result.add_error(
+             f"Field '{field_name}': Converting string to {target_type} may lose data"
+         )
+
+
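And one for the string/binary rules (the field name "payload" is illustrative):

import pyarrow as pa

res = CastValidationResult()
_check_string_cast(pa.binary(), pa.string(), "payload", res)  # warning: assumes valid UTF-8
_check_string_cast(pa.string(), pa.int64(), "payload", res)   # error: string -> int64 may lose data
assert len(res.warnings) == 1 and len(res.errors) == 1
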
+ def validate_cast(
+     source_schema: pa.Schema,
+     target_schema: pa.Schema
+ ) -> CastValidationResult:
+     """
+     Validate that casting from source to target schema is safe.
+
+     Checks each field for potential data loss, including:
+     - Timestamp timezone and precision changes
+     - Numeric downcasting and conversions
+     - String/binary conversions
+     - Type changes that may lose information
+
+     Args:
+         source_schema: Source PyArrow schema
+         target_schema: Target PyArrow schema
+
+     Returns:
+         CastValidationResult with safety status and warnings/errors
+     """
+     result = CastValidationResult()
+
+     # Build field lookup for target schema
+     target_fields = {field.name: field for field in target_schema}
+
+     for source_field in source_schema:
+         field_name = source_field.name
+
+         # Check if field exists in target
+         if field_name not in target_fields:
+             result.add_error(f"Field '{field_name}' exists in source but not in target schema")
+             continue
+
+         target_field = target_fields[field_name]
+         source_type = source_field.type
+         target_type = target_field.type
+
+         # If types are identical, no cast needed
+         if source_type == target_type:
+             continue
+
+         # Check timestamp casts
+         _check_timestamp_cast(source_type, target_type, field_name, result)
+
+         # Check numeric casts
+         _check_numeric_cast(source_type, target_type, field_name, result)
+
+         # Check string/binary casts
+         _check_string_cast(source_type, target_type, field_name, result)
+
+         # Generic type compatibility check
+         if not _types_compatible(source_type, target_type):
+             result.add_error(
+                 f"Field '{field_name}': Type {source_type} is not compatible with {target_type}"
+             )
+
+     # Check for fields in target that are missing from source
+     source_field_names = {field.name for field in source_schema}
+     for target_field in target_schema:
+         if target_field.name not in source_field_names:
+             # Missing fields will be filled with nulls - this is usually OK
+             result.add_warning(
+                 f"Field '{target_field.name}' exists in target but not in source "
+                 f"(will be null)"
+             )
+
+     return result
+
+
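A small end-to-end sketch of validate_cast, with made-up column names and assuming only pyarrow plus the functions above:

import pyarrow as pa

source = pa.schema([
    ("id", pa.int64()),
    ("created_at", pa.timestamp("us", tz="UTC")),
])
target = pa.schema([
    ("id", pa.int32()),                  # downcast: reported as an error
    ("created_at", pa.timestamp("us", tz="UTC")),
    ("loaded_at", pa.timestamp("us")),   # only in target: reported as a warning
])

result = validate_cast(source, target)
assert not result.is_safe()
assert any("32-bit" in e for e in result.errors)
assert any("loaded_at" in w for w in result.warnings)
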
+ def _types_compatible(source_type: pa.DataType, target_type: pa.DataType) -> bool:
+     """
+     Check if two types are compatible for casting.
+
+     This is a broad check - more specific checks are done in the individual
+     validation functions.
+     """
+     # Same type is always compatible
+     if source_type == target_type:
+         return True
+
+     # Numeric types are generally compatible
+     if pa.types.is_integer(source_type) and pa.types.is_integer(target_type):
+         return True
+     if pa.types.is_floating(source_type) and pa.types.is_floating(target_type):
+         return True
+     if pa.types.is_integer(source_type) and pa.types.is_floating(target_type):
+         return True  # Int to float is safe
+
+     # Temporal types with same base type
+     if pa.types.is_timestamp(source_type) and pa.types.is_timestamp(target_type):
+         return True
+     if pa.types.is_date(source_type) and pa.types.is_date(target_type):
+         return True
+     if pa.types.is_time(source_type) and pa.types.is_time(target_type):
+         return True
+
+     # String and binary are interchangeable with caveats
+     if pa.types.is_string(source_type) and pa.types.is_binary(target_type):
+         return True
+     if pa.types.is_binary(source_type) and pa.types.is_string(target_type):
+         return True
+
+     # Large variants
+     if pa.types.is_string(source_type) and pa.types.is_large_string(target_type):
+         return True
+     if pa.types.is_large_string(source_type) and pa.types.is_string(target_type):
+         return True
+     if pa.types.is_binary(source_type) and pa.types.is_large_binary(target_type):
+         return True
+     if pa.types.is_large_binary(source_type) and pa.types.is_binary(target_type):
+         return True
+
+     # Decimal types
+     if pa.types.is_decimal(source_type) and pa.types.is_decimal(target_type):
+         return True
+
+     # Otherwise incompatible
+     return False
+
+
+ def cast_table_safe(
+     table: pa.Table,
+     target_schema: pa.Schema,
+     strict: bool = True
+ ) -> pa.Table:
+     """
+     Safely cast a PyArrow table to a target schema with validation.
+
+     Args:
+         table: Source PyArrow table
+         target_schema: Target schema to cast to
+         strict: If True, fail on any potential data loss. If False, only warn.
+
+     Returns:
+         Cast PyArrow table
+
+     Raises:
+         CastingError: If strict=True and cast would result in data loss
+     """
+     # Validate the cast
+     validation = validate_cast(table.schema, target_schema)
+
+     # Log warnings
+     for warning in validation.warnings:
+         logger.warning(f"Cast warning: {warning}")
+
+     # In strict mode, fail if there are errors
+     if strict and not validation.is_safe():
+         error_msg = "Cannot cast table safely. Errors:\n" + "\n".join(validation.errors)
+         if validation.warnings:
+             error_msg += "\nWarnings:\n" + "\n".join(validation.warnings)
+         raise CastingError(error_msg)
+
+     # Log errors as warnings in non-strict mode
+     if not strict:
+         for error in validation.errors:
+             logger.warning(f"Cast error (proceeding anyway due to strict=False): {error}")
+
+     # Perform the cast
+     logger.info(
+         f"Casting table with {len(table)} rows from schema with {len(table.schema)} fields "
+         f"to schema with {len(target_schema)} fields"
+     )
+
+     try:
+         casted_table = table.cast(target_schema)
+         logger.info("Cast completed successfully")
+         return casted_table
+     except pa.ArrowInvalid as e:
+         raise CastingError(f"Cast failed during execution: {e}") from e
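Putting it together, a usage sketch of cast_table_safe in strict and non-strict mode (the table contents are made up):

import pyarrow as pa

table = pa.table({
    "id": pa.array([1, 2, 3], type=pa.int64()),
    "price": pa.array([1.0, 2.0, 3.0], type=pa.float64()),
})

# float -> integer is flagged as potential data loss, so strict mode raises ...
lossy_target = pa.schema([("id", pa.int64()), ("price", pa.int64())])
try:
    cast_table_safe(table, lossy_target, strict=True)
except CastingError as exc:
    print(f"refused: {exc}")

# ... while strict=False logs the issue and lets PyArrow attempt the cast.
casted = cast_table_safe(table, lossy_target, strict=False)
assert casted.schema.field("price").type == pa.int64()
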
@@ -0,0 +1,207 @@
+ """
+ Schema conversion from dlt to Apache Iceberg.
+ """
+
+ import logging
+ from typing import Dict
+ import pyarrow as pa
+ from dlt.common.schema import TTableSchema
+ from pyiceberg.schema import Schema
+ from pyiceberg.types import (
+     NestedField,
+     BooleanType,
+     IntegerType,
+     LongType,
+     FloatType,
+     DoubleType,
+     DecimalType,
+     StringType,
+     BinaryType,
+     TimestampType,
+     DateType,
+     TimeType,
+     ListType,
+     MapType,
+     StructType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def convert_dlt_to_iceberg_schema(
+     dlt_table: TTableSchema, arrow_table: pa.Table
+ ) -> Schema:
+     """
+     Convert a dlt table schema to an Iceberg schema.
+
+     We use the PyArrow table to infer actual types since dlt may not have
+     complete type information.
+
+     Args:
+         dlt_table: dlt table schema
+         arrow_table: PyArrow table with actual data
+
+     Returns:
+         Iceberg Schema object
+     """
+     fields = []
+     arrow_schema = arrow_table.schema
+
+     # Build field ID counter
+     field_id = 1
+
+     # Get column information from dlt
+     dlt_columns = dlt_table.get("columns", {})
+
+     for arrow_field in arrow_schema:
+         col_name = arrow_field.name
+         arrow_type = arrow_field.type
+
+         # Get dlt column metadata if available
+         dlt_col = dlt_columns.get(col_name, {})
+         nullable = dlt_col.get("nullable", True)
+         required = not nullable
+
+         # Convert PyArrow type to Iceberg type
+         iceberg_type = convert_arrow_to_iceberg_type(arrow_type)
+
+         # Create Iceberg field
+         field = NestedField(
+             field_id=field_id,
+             name=col_name,
+             field_type=iceberg_type,
+             required=required,
+         )
+         fields.append(field)
+         field_id += 1
+
+     schema = Schema(*fields)
+     logger.info(f"Converted schema with {len(fields)} fields")
+     return schema
+
+
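A sketch of how this converter might be called; the table and column names are made up, and pyiceberg and dlt are assumed to be installed (both are imported by this module):

import pyarrow as pa

arrow_table = pa.table({
    "id": pa.array([1, 2], type=pa.int64()),
    "name": pa.array(["a", "b"], type=pa.string()),
})

# Minimal dlt table schema fragment: only "columns"/"nullable" is read here.
dlt_table = {
    "name": "customers",
    "columns": {
        "id": {"name": "id", "data_type": "bigint", "nullable": False},
        "name": {"name": "name", "data_type": "text", "nullable": True},
    },
}

schema = convert_dlt_to_iceberg_schema(dlt_table, arrow_table)
# Field IDs are assigned sequentially; "id" becomes a required long, "name" an optional string.
print(schema)
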
+ def convert_arrow_to_iceberg_type(arrow_type: pa.DataType):
+     """
+     Convert PyArrow data type to Iceberg type.
+
+     Args:
+         arrow_type: PyArrow data type
+
+     Returns:
+         Iceberg type
+     """
+     # Boolean
+     if pa.types.is_boolean(arrow_type):
+         return BooleanType()
+
+     # Integers
+     elif pa.types.is_int8(arrow_type) or pa.types.is_int16(arrow_type):
+         return IntegerType()
+     elif pa.types.is_int32(arrow_type) or pa.types.is_uint8(arrow_type) or pa.types.is_uint16(arrow_type):
+         return IntegerType()
+     elif pa.types.is_int64(arrow_type) or pa.types.is_uint32(arrow_type) or pa.types.is_uint64(arrow_type):
+         return LongType()
+
+     # Floats
+     elif pa.types.is_float32(arrow_type):
+         return FloatType()
+     elif pa.types.is_float64(arrow_type):
+         return DoubleType()
+
+     # Decimal
+     elif pa.types.is_decimal(arrow_type):
+         return DecimalType(
+             precision=arrow_type.precision,
+             scale=arrow_type.scale
+         )
+
+     # String
+     elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
+         return StringType()
+
+     # Binary
+     elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type):
+         return BinaryType()
+
+     # Temporal types
+     elif pa.types.is_timestamp(arrow_type):
+         return TimestampType()
+     elif pa.types.is_date(arrow_type):
+         return DateType()
+     elif pa.types.is_time(arrow_type):
+         return TimeType()
+
+     # Complex types
+     elif pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type):
+         element_type = convert_arrow_to_iceberg_type(arrow_type.value_type)
+         return ListType(
+             element_id=1,
+             element_type=element_type,
+             element_required=False,
+         )
+
+     elif pa.types.is_map(arrow_type):
+         key_type = convert_arrow_to_iceberg_type(arrow_type.key_type)
+         value_type = convert_arrow_to_iceberg_type(arrow_type.item_type)
+         return MapType(
+             key_id=1,
+             key_type=key_type,
+             value_id=2,
+             value_type=value_type,
+             value_required=False,
+         )
+
+     elif pa.types.is_struct(arrow_type):
+         struct_fields = []
+         for i, field in enumerate(arrow_type):
+             field_type = convert_arrow_to_iceberg_type(field.type)
+             struct_fields.append(
+                 NestedField(
+                     field_id=i + 1,
+                     name=field.name,
+                     field_type=field_type,
+                     required=not field.nullable,
+                 )
+             )
+         return StructType(*struct_fields)
+
+     # Fallback to string for unknown types
+     else:
+         logger.warning(
+             f"Unknown PyArrow type {arrow_type}, using StringType as fallback"
+         )
+         return StringType()
+
+
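For reference, a few sample mappings this function produces (same assumptions as above; the comments describe the return values per the branches in the code):

import pyarrow as pa

convert_arrow_to_iceberg_type(pa.int16())                   # IntegerType
convert_arrow_to_iceberg_type(pa.uint64())                  # LongType
convert_arrow_to_iceberg_type(pa.decimal128(18, 4))         # DecimalType(precision=18, scale=4)
convert_arrow_to_iceberg_type(pa.list_(pa.string()))        # ListType with a string element
convert_arrow_to_iceberg_type(pa.timestamp("us", tz="UTC")) # TimestampType (the timezone is not carried over)
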
+ def convert_dlt_type_to_iceberg_type(dlt_type: str):
+     """
+     Convert dlt data type string to Iceberg type (fallback method).
+
+     Args:
+         dlt_type: dlt data type string
+
+     Returns:
+         Iceberg type
+     """
+     type_mapping = {
+         "text": StringType(),
+         "varchar": StringType(),
+         "string": StringType(),
+         "bigint": LongType(),
+         "integer": IntegerType(),
+         "int": IntegerType(),
+         "smallint": IntegerType(),
+         "double": DoubleType(),
+         "float": FloatType(),
+         "decimal": DecimalType(38, 9),
+         "bool": BooleanType(),
+         "boolean": BooleanType(),
+         "timestamp": TimestampType(),
+         "timestamptz": TimestampType(),
+         "date": DateType(),
+         "time": TimeType(),
+         "binary": BinaryType(),
+         "json": StringType(),  # Store JSON as string
+     }
+
+     return type_mapping.get(dlt_type.lower(), StringType())
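And a couple of illustrative lookups against the fallback mapping ("wei" stands in for any dlt type not listed above):

convert_dlt_type_to_iceberg_type("bigint")       # LongType
convert_dlt_type_to_iceberg_type("timestamptz")  # TimestampType (no separate tz-aware type here)
convert_dlt_type_to_iceberg_type("wei")          # not in the mapping, falls back to StringType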