dlt-iceberg 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dlt-iceberg might be problematic.

@@ -0,0 +1,224 @@
+ """
+ Error handling and classification for PyIceberg operations.
+
+ Classifies exceptions into retryable and non-retryable categories
+ and provides utilities for structured error logging.
+ """
+
+ import logging
+ import traceback
+ from typing import Optional
+ from pyiceberg.exceptions import (
+     CommitFailedException,
+     CommitStateUnknownException,
+     WaitingForLockException,
+     ServiceUnavailableError,
+     ServerError,
+     AuthorizationExpiredError,
+     UnauthorizedError,
+     OAuthError,
+     NoSuchTableError,
+     NoSuchNamespaceError,
+     NoSuchIdentifierError,
+     TableAlreadyExistsError,
+     ValidationError,
+     BadRequestError,
+     ForbiddenError,
+     NotInstalledError,
+     RESTError,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class ErrorCategory:
+     """Categories for error classification."""
+     RETRYABLE_TRANSIENT = "retryable_transient"
+     RETRYABLE_AUTH = "retryable_auth"
+     NON_RETRYABLE_CLIENT = "non_retryable_client"
+     NON_RETRYABLE_SERVER = "non_retryable_server"
+     NON_RETRYABLE_CONFIG = "non_retryable_config"
+
+
+ # Mapping of exception types to error categories
+ ERROR_CLASSIFICATION = {
+     # Retryable transient errors - these indicate temporary issues
+     CommitFailedException: ErrorCategory.RETRYABLE_TRANSIENT,
+     WaitingForLockException: ErrorCategory.RETRYABLE_TRANSIENT,
+     CommitStateUnknownException: ErrorCategory.RETRYABLE_TRANSIENT,
+     ServiceUnavailableError: ErrorCategory.RETRYABLE_TRANSIENT,
+     ServerError: ErrorCategory.RETRYABLE_TRANSIENT,
+
+     # Retryable auth errors - credentials may have refreshed
+     AuthorizationExpiredError: ErrorCategory.RETRYABLE_AUTH,
+
+     # Non-retryable client errors - these indicate bugs or bad requests
+     NoSuchTableError: ErrorCategory.NON_RETRYABLE_CLIENT,
+     NoSuchNamespaceError: ErrorCategory.NON_RETRYABLE_CLIENT,
+     NoSuchIdentifierError: ErrorCategory.NON_RETRYABLE_CLIENT,
+     TableAlreadyExistsError: ErrorCategory.NON_RETRYABLE_CLIENT,
+     ValidationError: ErrorCategory.NON_RETRYABLE_CLIENT,
+     BadRequestError: ErrorCategory.NON_RETRYABLE_CLIENT,
+
+     # Non-retryable server errors - these indicate configuration issues
+     ForbiddenError: ErrorCategory.NON_RETRYABLE_SERVER,
+     UnauthorizedError: ErrorCategory.NON_RETRYABLE_SERVER,
+     OAuthError: ErrorCategory.NON_RETRYABLE_SERVER,
+
+     # Non-retryable config errors - missing dependencies
+     NotInstalledError: ErrorCategory.NON_RETRYABLE_CONFIG,
+ }
+
+
+ def is_retryable_error(exception: Exception) -> bool:
+     """
+     Determine if an exception should be retried.
+
+     Args:
+         exception: The exception to classify
+
+     Returns:
+         True if the error is retryable, False otherwise
+     """
+     exception_type = type(exception)
+     category = ERROR_CLASSIFICATION.get(exception_type)
+
+     if category is None:
+         # Unknown error - check if it's a RESTError or generic exception
+         if isinstance(exception, RESTError):
+             # RESTError is a catch-all, so we conservatively retry
+             logger.warning(f"Unknown RESTError encountered: {exception}")
+             return True
+         # For truly unknown errors, don't retry (fail fast)
+         return False
+
+     return category in {
+         ErrorCategory.RETRYABLE_TRANSIENT,
+         ErrorCategory.RETRYABLE_AUTH,
+     }
+
+
+ def get_error_category(exception: Exception) -> str:
+     """
+     Get the category of an exception.
+
+     Args:
+         exception: The exception to classify
+
+     Returns:
+         Error category string
+     """
+     exception_type = type(exception)
+     category = ERROR_CLASSIFICATION.get(exception_type)
+
+     if category is None:
+         if isinstance(exception, RESTError):
+             return "unknown_rest_error"
+         return "unknown_error"
+
+     return category
+
+
+ def log_error_with_context(
+     exception: Exception,
+     operation: str,
+     table_name: Optional[str] = None,
+     attempt: Optional[int] = None,
+     max_attempts: Optional[int] = None,
+     include_traceback: bool = False,
+ ) -> None:
+     """
+     Log an error with structured context.
+
+     Args:
+         exception: The exception that occurred
+         operation: Description of the operation being performed
+         table_name: Name of the table being operated on
+         attempt: Current attempt number (if retrying)
+         max_attempts: Maximum number of attempts
+         include_traceback: Whether to include full stack trace
+     """
+     error_type = type(exception).__name__
+     category = get_error_category(exception)
+     retryable = is_retryable_error(exception)
+
+     context = {
+         "error_type": error_type,
+         "error_category": category,
+         "retryable": retryable,
+         "operation": operation,
+     }
+
+     if table_name:
+         context["table_name"] = table_name
+
+     if attempt is not None and max_attempts is not None:
+         context["attempt"] = f"{attempt}/{max_attempts}"
+
+     # Format context for logging
+     context_str = ", ".join(f"{k}={v}" for k, v in context.items())
+
+     if retryable:
+         if attempt is not None and max_attempts is not None and attempt >= max_attempts:
+             logger.error(
+                 f"Operation failed after max retries: {exception} ({context_str})"
+             )
+             if include_traceback:
+                 logger.error(f"Stack trace:\n{traceback.format_exc()}")
+         else:
+             logger.warning(
+                 f"Retryable error encountered: {exception} ({context_str})"
+             )
+     else:
+         logger.error(
+             f"Non-retryable error encountered: {exception} ({context_str})"
+         )
+         if include_traceback:
+             logger.error(f"Stack trace:\n{traceback.format_exc()}")
+
+
+ def get_user_friendly_error_message(exception: Exception, operation: str) -> str:
+     """
+     Generate a user-friendly error message.
+
+     Args:
+         exception: The exception that occurred
+         operation: Description of the operation being performed
+
+     Returns:
+         User-friendly error message
+     """
+     error_type = type(exception).__name__
+     category = get_error_category(exception)
+
+     # Base message
+     message = f"Failed to {operation}: {exception}"
+
+     # Add helpful hints based on category
+     if category == ErrorCategory.NON_RETRYABLE_CLIENT:
+         if isinstance(exception, NoSuchTableError):
+             message += "\nHint: The table does not exist. Create it first or use write_disposition='append' with auto-create."
+         elif isinstance(exception, NoSuchNamespaceError):
+             message += "\nHint: The namespace does not exist. It will be created automatically on first write."
+         elif isinstance(exception, ValidationError):
+             message += "\nHint: There is a schema validation error. Check that your data types match the table schema."
+         elif isinstance(exception, TableAlreadyExistsError):
+             message += "\nHint: The table already exists. Use write_disposition='append' or 'merge' instead of 'replace'."
+
+     elif category == ErrorCategory.NON_RETRYABLE_SERVER:
+         if isinstance(exception, (UnauthorizedError, ForbiddenError)):
+             message += "\nHint: Check your credentials and permissions."
+         elif isinstance(exception, OAuthError):
+             message += "\nHint: OAuth authentication failed. Verify oauth2_server_uri and credentials."
+
+     elif category == ErrorCategory.NON_RETRYABLE_CONFIG:
+         if isinstance(exception, NotInstalledError):
+             message += "\nHint: A required optional dependency is missing. Install it with: uv add <package>"
+
+     elif category == ErrorCategory.RETRYABLE_TRANSIENT:
+         if isinstance(exception, CommitFailedException):
+             message += "\nNote: This is typically caused by concurrent writes. The operation will be retried."
+         elif isinstance(exception, ServiceUnavailableError):
+             message += "\nNote: The service is temporarily unavailable. The operation will be retried."
+
+     return message
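
Taken together, these helpers are intended to wrap catalog and commit calls. Below is a minimal usage sketch, not part of the package, showing how they could drive a bounded retry loop; the append_with_retries function, the table.append(...) call shape, the example table name, and the backoff constants are illustrative assumptions, and the helpers above are assumed to be in scope.

import time

def append_with_retries(table, arrow_table, max_attempts: int = 3, base_delay: float = 1.0):
    """Hypothetical caller that retries an Iceberg append using the helpers above."""
    for attempt in range(1, max_attempts + 1):
        try:
            table.append(arrow_table)  # assumed pyiceberg table handle
            return
        except Exception as exc:
            log_error_with_context(
                exc,
                operation="append data files",
                table_name="analytics.events",  # illustrative identifier
                attempt=attempt,
                max_attempts=max_attempts,
            )
            if not is_retryable_error(exc) or attempt == max_attempts:
                # Surface a readable message instead of a bare traceback.
                raise RuntimeError(
                    get_user_friendly_error_message(exc, "append data files")
                ) from exc
            time.sleep(base_delay * (2 ** (attempt - 1)))  # simple exponential backoff
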
@@ -0,0 +1,308 @@
+ """
+ Partition spec builder for Iceberg tables.
+
+ Supports advanced Iceberg partitioning transforms:
+ - Temporal: year, month, day, hour
+ - Identity: no transformation
+ - Bucket: hash-based partitioning with bucket[N] syntax
+ - Truncate: truncate to width with truncate[N] syntax
+
+ Examples:
+     # Bucket partitioning on user_id into 10 buckets
+     partition_transform="bucket[10]"
+
+     # Truncate string to 4 characters
+     partition_transform="truncate[4]"
+
+     # Temporal partitioning
+     partition_transform="month"
+ """
+
+ import logging
+ import re
+ from typing import Optional, Tuple
+ from dlt.common.schema import TTableSchema
+ from pyiceberg.schema import Schema
+ from pyiceberg.partitioning import PartitionSpec, PartitionField
+ from pyiceberg.transforms import (
+     IdentityTransform,
+     YearTransform,
+     MonthTransform,
+     DayTransform,
+     HourTransform,
+     BucketTransform,
+     TruncateTransform,
+ )
+ from pyiceberg.types import (
+     TimestampType,
+     DateType,
+     StringType,
+     IntegerType,
+     LongType,
+     BinaryType,
+     DecimalType,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_transform_hint(hint: str) -> Tuple[str, Optional[int]]:
+     """
+     Parse partition transform hint into transform type and parameter.
+
+     Supports:
+     - Simple transforms: "year", "month", "day", "hour", "identity"
+     - Parameterized transforms: "bucket[10]", "truncate[5]"
+
+     Args:
+         hint: Transform hint string
+
+     Returns:
+         Tuple of (transform_type, parameter)
+         Example: ("bucket", 10) or ("month", None)
+
+     Note:
+         Hints without a bracketed parameter are returned as-is; unknown
+         transform names are rejected later by choose_partition_transform().
+     """
+     # Check for parameterized transform: bucket[N] or truncate[N]
+     match = re.match(r"^(\w+)\[(\d+)\]$", hint)
+     if match:
+         transform_type = match.group(1)
+         param = int(match.group(2))
+         return (transform_type, param)
+
+     # Simple transform
+     return (hint, None)
+
+
+ def validate_transform_for_type(
+     transform_type: str, param: Optional[int], field_type, col_name: str
+ ) -> None:
+     """
+     Validate that a transform is appropriate for the field type.
+
+     Args:
+         transform_type: Transform type (e.g., "bucket", "truncate", "month")
+         param: Transform parameter (e.g., bucket count, truncate width)
+         field_type: Iceberg field type
+         col_name: Column name for error messages
+
+     Raises:
+         ValueError: If transform is invalid for the field type
+     """
+     # Temporal transforms only for timestamp/date
+     temporal_transforms = {"year", "month", "day", "hour"}
+     if transform_type in temporal_transforms:
+         if not isinstance(field_type, (TimestampType, DateType)):
+             raise ValueError(
+                 f"Temporal transform '{transform_type}' cannot be applied to "
+                 f"column '{col_name}' with type {field_type}. "
+                 f"Use timestamp or date types for temporal transforms."
+             )
+
+     # Bucket transform validation
+     if transform_type == "bucket":
+         if param is None or param <= 0:
+             raise ValueError(
+                 f"Bucket transform requires a positive integer parameter, "
+                 f"got: {param} for column '{col_name}'"
+             )
+         # Bucket can be applied to most types except binary
+         if isinstance(field_type, BinaryType):
+             raise ValueError(
+                 f"Bucket transform cannot be applied to binary column '{col_name}'"
+             )
+
+     # Truncate transform validation
+     if transform_type == "truncate":
+         if param is None or param <= 0:
+             raise ValueError(
+                 f"Truncate transform requires a positive integer parameter, "
+                 f"got: {param} for column '{col_name}'"
+             )
+         # Truncate works on strings, integers, longs, decimals, binary
+         valid_types = (StringType, IntegerType, LongType, DecimalType, BinaryType)
+         if not isinstance(field_type, valid_types):
+             raise ValueError(
+                 f"Truncate transform cannot be applied to column '{col_name}' "
+                 f"with type {field_type}. "
+                 f"Use string, integer, long, decimal, or binary types."
+             )
+
+
+ def build_partition_spec(
+     dlt_table: TTableSchema, iceberg_schema: Schema
+ ) -> Optional[PartitionSpec]:
+     """
+     Build Iceberg partition spec from dlt table hints.
+
+     Looks for columns marked with partition=True in dlt table schema
+     and creates appropriate partition transforms based on data types.
+
+     Args:
+         dlt_table: dlt table schema with column hints
+         iceberg_schema: Iceberg schema with field information
+
+     Returns:
+         PartitionSpec or None if no partitioning specified
+     """
+     partition_columns = []
+
+     # Extract partition columns from dlt hints
+     # Support both 'partition' and 'x-partition' (custom hint)
+     dlt_columns = dlt_table.get("columns", {})
+     for col_name, col_info in dlt_columns.items():
+         if col_info.get("partition") or col_info.get("x-partition"):
+             partition_columns.append(col_name)
+
+     if not partition_columns:
+         logger.info("No partition columns specified")
+         return None
+
+     logger.info(f"Building partition spec for columns: {partition_columns}")
+
+     # Build partition fields
+     partition_fields = []
+
+     for col_name in partition_columns:
+         # Find field in Iceberg schema
+         iceberg_field = None
+         for field in iceberg_schema.fields:
+             if field.name == col_name:
+                 iceberg_field = field
+                 break
+
+         if not iceberg_field:
+             logger.warning(
+                 f"Partition column {col_name} not found in schema, skipping"
+             )
+             continue
+
+         # Choose transform based on data type
+         transform = choose_partition_transform(
+             iceberg_field.field_type, col_name, dlt_columns.get(col_name, {})
+         )
+
+         # Create partition field
+         partition_field = PartitionField(
+             source_id=iceberg_field.field_id,
+             field_id=1000 + len(partition_fields),  # Start partition IDs at 1000
+             transform=transform,
+             name=f"{col_name}_{get_transform_name(transform)}",
+         )
+         partition_fields.append(partition_field)
+
+     if not partition_fields:
+         logger.warning("No valid partition fields created")
+         return None
+
+     spec = PartitionSpec(*partition_fields)
+     logger.info(f"Created partition spec with {len(partition_fields)} fields")
+     return spec
+
+
+ def choose_partition_transform(field_type, col_name: str, col_hints: dict):
+     """
+     Choose appropriate Iceberg transform based on field type and hints.
+
+     Supports:
+     - Temporal transforms: year, month, day, hour (for timestamps/dates)
+     - Bucket transforms: bucket[N] (hash-based partitioning)
+     - Truncate transforms: truncate[N] (truncate to width)
+     - Identity transform: no transformation
+
+     Args:
+         field_type: Iceberg field type
+         col_name: Column name
+         col_hints: dlt column hints (may contain partition_transform)
+
+     Returns:
+         Iceberg transform
+
+     Raises:
+         ValueError: If transform is invalid for the field type
+
+     Examples:
+         # Bucket partitioning
+         col_hints = {"partition_transform": "bucket[10]"}
+
+         # Truncate partitioning
+         col_hints = {"partition_transform": "truncate[4]"}
+
+         # Temporal partitioning
+         col_hints = {"partition_transform": "month"}
+     """
+     # Check if user specified a transform in hints
+     # Support both 'partition_transform' and 'x-partition-transform' (custom hint)
+     partition_transform = col_hints.get("partition_transform") or col_hints.get("x-partition-transform")
+
+     if partition_transform:
+         # Parse the transform hint
+         transform_type, param = parse_transform_hint(partition_transform)
+
+         # Validate transform is appropriate for field type
+         validate_transform_for_type(transform_type, param, field_type, col_name)
+
+         # Create the appropriate transform
+         if transform_type == "bucket":
+             return BucketTransform(num_buckets=param)
+         elif transform_type == "truncate":
+             return TruncateTransform(width=param)
+         elif transform_type == "year":
+             return YearTransform()
+         elif transform_type == "month":
+             return MonthTransform()
+         elif transform_type == "day":
+             return DayTransform()
+         elif transform_type == "hour":
+             return HourTransform()
+         elif transform_type == "identity":
+             return IdentityTransform()
+         else:
+             raise ValueError(
+                 f"Unknown transform type '{transform_type}' for column '{col_name}'"
+             )
+
+     # No hint specified - use defaults based on type
+     if isinstance(field_type, (TimestampType, DateType)):
+         # Default to month for timestamps/dates
+         return MonthTransform()
+     elif isinstance(field_type, (StringType, IntegerType, LongType)):
+         # Default to identity for discrete types
+         return IdentityTransform()
+     else:
+         # Fallback to identity
+         logger.warning(
+             f"Using identity transform for {col_name} with type {field_type}"
+         )
+         return IdentityTransform()
+
+
+ def get_transform_name(transform) -> str:
+     """
+     Get human-readable name for transform.
+
+     Args:
+         transform: Iceberg transform instance
+
+     Returns:
+         String representation of the transform
+     """
+     if isinstance(transform, IdentityTransform):
+         return "identity"
+     elif isinstance(transform, YearTransform):
+         return "year"
+     elif isinstance(transform, MonthTransform):
+         return "month"
+     elif isinstance(transform, DayTransform):
+         return "day"
+     elif isinstance(transform, HourTransform):
+         return "hour"
+     elif isinstance(transform, BucketTransform):
+         return f"bucket_{transform.num_buckets}"
+     elif isinstance(transform, TruncateTransform):
+         return f"truncate_{transform.width}"
+     else:
+         return "transform"
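
As a rough usage sketch, again not from the package, the snippet below feeds a hand-written dlt-style table schema and a matching pyiceberg Schema through build_partition_spec; the column names, field IDs, and hint values are made-up examples, and build_partition_spec and its helpers from the file above are assumed to be in scope.

from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, LongType, StringType, TimestampType

# Example Iceberg schema with explicit field IDs (values are illustrative).
iceberg_schema = Schema(
    NestedField(field_id=1, name="user_id", field_type=LongType(), required=False),
    NestedField(field_id=2, name="created_at", field_type=TimestampType(), required=False),
    NestedField(field_id=3, name="country", field_type=StringType(), required=False),
)

# dlt-style table schema: partition columns are flagged via column hints.
dlt_table = {
    "name": "events",
    "columns": {
        "user_id": {"partition": True, "partition_transform": "bucket[8]"},
        "created_at": {"partition": True, "partition_transform": "day"},
        "country": {"x-partition": True},  # no transform hint -> identity default
    },
}

spec = build_partition_spec(dlt_table, iceberg_schema)
# Expected partition field names under the naming scheme above:
#   user_id_bucket_8, created_at_day, country_identity
print(spec)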