dlt-iceberg 0.1.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the differences between package versions as they appear in their public registries.

This version of dlt-iceberg has been flagged as potentially problematic.

@@ -0,0 +1,28 @@
+ """Sidequery dlt Iceberg REST Catalog Destination"""
+
+ # Class-based destination with atomic multi-file commits (RECOMMENDED)
+ from .destination_client import (
+     iceberg_rest_class_based,
+     IcebergRestClient,
+     IcebergRestConfiguration,
+ )
+
+ # Function-based destination (legacy, per-file commits)
+ from .destination import iceberg_rest as iceberg_rest_function_based
+
+ # Export the class-based version as the primary destination
+ iceberg_rest = iceberg_rest_class_based
+
+ # Errors
+ from .schema_casting import CastingError
+ from .schema_evolution import SchemaEvolutionError
+
+ __all__ = [
+     "iceberg_rest",
+     "iceberg_rest_class_based",
+     "iceberg_rest_function_based",
+     "IcebergRestClient",
+     "IcebergRestConfiguration",
+     "CastingError",
+     "SchemaEvolutionError",
+ ]
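
For orientation, here is a minimal usage sketch that is not part of the package diff. It assumes the wheel installs an importable dlt_iceberg module and that the class-based iceberg_rest factory accepts the same catalog settings as the function-based handler shown in the next file; in a real setup those values would normally be resolved from dlt secrets rather than passed inline.

import dlt
from dlt_iceberg import iceberg_rest  # module name assumed from the wheel name

@dlt.resource(write_disposition="append")
def events():
    # Tiny inline sample so the sketch runs end to end
    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]

# catalog_uri and namespace are illustrative placeholders
pipeline = dlt.pipeline(
    "events_to_iceberg",
    destination=iceberg_rest(
        catalog_uri="https://polaris.example.com/api/catalog",
        namespace="analytics",
    ),
)
print(pipeline.run(events()))
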
@@ -0,0 +1,400 @@
+ """
+ Iceberg REST Catalog destination for dlt.
+
+ This custom destination writes data to Apache Iceberg tables using a REST catalog
+ (Polaris, Unity Catalog, AWS Glue, Nessie, etc.).
+ """
+
+ import time
+ import logging
+ from typing import Optional, Dict, Any
+ from pathlib import Path
+
+ import dlt
+ import pyarrow.parquet as pq
+ from dlt.common.schema import TTableSchema
+ from pyiceberg.catalog import load_catalog
+ from pyiceberg.catalog.rest import RestCatalog
+ from pyiceberg.catalog.sql import SqlCatalog
+ from pyiceberg.exceptions import (
+     NoSuchTableError,
+     CommitFailedException,
+     NoSuchNamespaceError,
+     ValidationError,
+ )
+
+ from .schema_converter import convert_dlt_to_iceberg_schema
+ from .partition_builder import build_partition_spec
+ from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
+ from .schema_casting import cast_table_safe, CastingError
+ from .error_handling import (
+     is_retryable_error,
+     log_error_with_context,
+     get_user_friendly_error_message,
+ )
+ from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+ logger = logging.getLogger(__name__)
+
+
+ def _iceberg_rest_handler(
+     items: str,  # File path when batch_size=0
+     table: TTableSchema,
+     # Catalog configuration
+     catalog_uri: str = dlt.secrets.value,
+     warehouse: Optional[str] = dlt.secrets.value,
+     namespace: str = "default",
+     # Authentication (OAuth2)
+     credential: Optional[str] = dlt.secrets.value,
+     oauth2_server_uri: Optional[str] = dlt.secrets.value,
+     scope: Optional[str] = "PRINCIPAL_ROLE:ALL",
+     # Or Bearer token
+     token: Optional[str] = dlt.secrets.value,
+     # AWS SigV4 (for Glue)
+     sigv4_enabled: bool = False,
+     signing_region: Optional[str] = None,
+     signing_name: str = "execute-api",
+     # S3 configuration (for MinIO or other S3-compatible storage)
+     s3_endpoint: Optional[str] = None,
+     s3_access_key_id: Optional[str] = None,
+     s3_secret_access_key: Optional[str] = None,
+     s3_region: Optional[str] = None,
+     # Retry configuration
+     max_retries: int = 5,
+     retry_backoff_base: float = 2.0,
+     # Schema casting configuration
+     strict_casting: bool = False,
+ ) -> None:
+     """
+     Custom dlt destination for Iceberg tables with REST catalog.
+
+     Args:
+         items: Path to parquet file generated by dlt
+         table: dlt table schema with column information and hints
+         catalog_uri: REST catalog endpoint URL
+         warehouse: Warehouse location (S3/GCS/Azure path)
+         namespace: Iceberg namespace (database)
+         credential: OAuth2 credentials in format "client_id:client_secret"
+         oauth2_server_uri: OAuth2 token endpoint
+         token: Bearer token (alternative to OAuth2)
+         sigv4_enabled: Enable AWS SigV4 signing
+         signing_region: AWS region for SigV4
+         signing_name: AWS service name for SigV4
+         max_retries: Maximum retry attempts for commit failures
+         retry_backoff_base: Base for exponential backoff
+         strict_casting: If True, fail when schema cast would lose data (timezone info,
+             precision, etc.). If False, proceed with aggressive casting and
+             log warnings. Default is False for backward compatibility.
+     """
+
+     # Build catalog configuration
+     # Auto-detect catalog type from URI
+     if catalog_uri.startswith("sqlite://") or catalog_uri.startswith("postgresql://"):
+         catalog_type = "sql"
+     else:
+         catalog_type = "rest"
+
+     catalog_config = {
+         "type": catalog_type,
+         "uri": catalog_uri,
+     }
+
+     # Add warehouse if provided (some catalogs configure it globally)
+     if warehouse:
+         catalog_config["warehouse"] = warehouse
+
+     # Add authentication
+     if credential and oauth2_server_uri:
+         catalog_config["credential"] = credential
+         catalog_config["oauth2-server-uri"] = oauth2_server_uri
+         if scope:
+             catalog_config["scope"] = scope
+     elif token:
+         catalog_config["token"] = token
+
+     # AWS SigV4
+     if sigv4_enabled:
+         catalog_config["rest.sigv4-enabled"] = "true"
+         if signing_region:
+             catalog_config["rest.signing-region"] = signing_region
+         catalog_config["rest.signing-name"] = signing_name
+
+     # S3 configuration
+     if s3_endpoint:
+         catalog_config["s3.endpoint"] = s3_endpoint
+     if s3_access_key_id:
+         catalog_config["s3.access-key-id"] = s3_access_key_id
+     if s3_secret_access_key:
+         catalog_config["s3.secret-access-key"] = s3_secret_access_key
+     if s3_region:
+         catalog_config["s3.region"] = s3_region
+
+     # Create fresh catalog connection for each file
+     catalog_type = catalog_config.get("type", "rest")
+     catalog_uri = catalog_config.get("uri", "unknown")
+     logger.info(
+         f"Creating catalog connection (type={catalog_type}, uri={catalog_uri})"
+     )
+     catalog = load_catalog("dlt_catalog", **catalog_config)
+
+     # Get table information
+     table_name = table["name"]
+     identifier = f"{namespace}.{table_name}"
+     write_disposition = table.get("write_disposition", "append")
+
+     logger.info(f"Processing table {identifier} with disposition {write_disposition}")
+
+     # Ensure namespace exists
+     try:
+         namespaces = catalog.list_namespaces()
+         if (namespace,) not in namespaces:
+             logger.info(f"Creating namespace {namespace}")
+             catalog.create_namespace(namespace)
+         else:
+             logger.info(f"Namespace {namespace} exists")
+     except Exception as e:
+         # Non-retryable namespace errors should fail fast
+         if not is_retryable_error(e):
+             log_error_with_context(
+                 e,
+                 operation=f"ensure namespace {namespace} exists",
+                 table_name=table_name,
+                 include_traceback=True,
+             )
+             raise RuntimeError(get_user_friendly_error_message(e, "ensure namespace exists")) from e
+         # Retryable errors are re-raised to the caller (namespace setup runs before the retry loop)
+         raise
+
+     # Read parquet file generated by dlt
+     file_path = Path(items)
+     if not file_path.exists():
+         raise FileNotFoundError(f"Parquet file not found: {file_path}")
+
+     arrow_table = pq.read_table(str(file_path))
+     logger.info(f"Read {len(arrow_table)} rows from {file_path}")
+
+     # Retry loop for transient failures
+     for attempt in range(max_retries):
+         try:
+             # Check if table exists
+             table_exists = False
+             try:
+                 iceberg_table = catalog.load_table(identifier)
+                 table_exists = True
+                 logger.info(f"Loaded existing table {identifier}")
+             except NoSuchTableError:
+                 logger.info(f"Table {identifier} does not exist, will create")
+             except Exception as e:
+                 # Non-retryable errors during table load should fail fast
+                 if not is_retryable_error(e):
+                     log_error_with_context(
+                         e,
+                         operation="load table",
+                         table_name=identifier,
+                         attempt=attempt + 1,
+                         max_attempts=max_retries,
+                         include_traceback=True,
+                     )
+                     raise RuntimeError(get_user_friendly_error_message(e, "load table")) from e
+                 # Retryable error - propagate to retry loop
+                 raise
+
+             # Create table if it doesn't exist
+             if not table_exists:
+                 # Convert dlt schema to Iceberg schema
+                 iceberg_schema = convert_dlt_to_iceberg_schema(table, arrow_table)
+
+                 # Build partition spec from table hints
+                 partition_spec = build_partition_spec(table, iceberg_schema)
+
+                 # If no partitioning, use empty spec (PyIceberg doesn't handle None well)
+                 if partition_spec is None:
+                     from pyiceberg.partitioning import PartitionSpec
+                     partition_spec = PartitionSpec()
+
+                 # Create table
+                 logger.info(f"Creating table {identifier}")
+                 iceberg_table = catalog.create_table(
+                     identifier=identifier,
+                     schema=iceberg_schema,
+                     partition_spec=partition_spec,
+                 )
+                 logger.info(f"Created table {identifier} at {iceberg_table.location()}")
+             else:
+                 # Table exists - check if schema evolution is needed
+                 incoming_schema = convert_dlt_to_iceberg_schema(table, arrow_table)
+
+                 try:
+                     schema_evolved = evolve_schema_if_needed(
+                         iceberg_table,
+                         incoming_schema,
+                         allow_column_drops=False
+                     )
+                     if schema_evolved:
+                         logger.info(f"Schema evolved for table {identifier}")
+                         # Refresh table to get updated schema
+                         iceberg_table = catalog.load_table(identifier)
+                 except SchemaEvolutionError as e:
+                     # Schema evolution errors are non-retryable (data model issues)
+                     log_error_with_context(
+                         e,
+                         operation="evolve schema",
+                         table_name=identifier,
+                         include_traceback=True,
+                     )
+                     raise
+
+             # Cast Arrow table to match Iceberg schema with defensive validation
+             # This handles timezone differences and other schema mismatches
+             # In strict mode, casting fails if data would be lost
+             # In non-strict mode (the default), casting proceeds with warnings
+             expected_schema = schema_to_pyarrow(iceberg_table.schema())
+             try:
+                 arrow_table = cast_table_safe(
+                     arrow_table,
+                     expected_schema,
+                     strict=strict_casting
+                 )
+             except CastingError as e:
+                 # Casting errors are non-retryable (schema mismatch)
+                 log_error_with_context(
+                     e,
+                     operation="cast schema",
+                     table_name=identifier,
+                     include_traceback=True,
+                 )
+                 logger.error(
+                     f"Schema casting failed for {identifier}. "
+                     f"Set strict_casting=False to allow aggressive casting."
+                 )
+                 raise
+
+             # Write data based on disposition
+             if write_disposition == "replace":
+                 logger.info(f"Overwriting table {identifier}")
+                 iceberg_table.overwrite(arrow_table)
+             elif write_disposition == "append":
+                 logger.info(f"Appending to table {identifier}")
+                 iceberg_table.append(arrow_table)
+             elif write_disposition == "merge":
+                 # For merge, we need primary keys
+                 # Try multiple ways to get primary keys from dlt table schema
+                 primary_keys = table.get("primary_key") or table.get("x-merge-keys")
+
+                 # If not found, check columns for primary_key hints
+                 if not primary_keys:
+                     columns = table.get("columns", {})
+                     primary_keys = [
+                         col_name
+                         for col_name, col_def in columns.items()
+                         if col_def.get("primary_key") or col_def.get("x-primary-key")
+                     ]
+
+                 if not primary_keys:
+                     logger.warning(
+                         f"Merge disposition requires primary_key, falling back to append"
+                     )
+                     iceberg_table.append(arrow_table)
+                 else:
+                     logger.info(f"Merging into table {identifier} on keys {primary_keys}")
+                     # Use PyIceberg's upsert API to update existing rows and insert new ones
+                     # PyIceberg will automatically match rows based on join_cols (primary keys)
+                     upsert_result = iceberg_table.upsert(
+                         df=arrow_table,
+                         join_cols=primary_keys,
+                         when_matched_update_all=True,
+                         when_not_matched_insert_all=True,
+                     )
+                     logger.info(
+                         f"Upsert completed: {upsert_result.rows_updated} updated, "
+                         f"{upsert_result.rows_inserted} inserted"
+                     )
+             else:
+                 raise ValueError(f"Unknown write disposition: {write_disposition}")
+
+             logger.info(f"Successfully wrote {len(arrow_table)} rows to {identifier}")
+             return  # Success
+
+         except Exception as e:
+             # Classify the error
+             retryable = is_retryable_error(e)
+
+             # Log error with context
+             log_error_with_context(
+                 e,
+                 operation=f"write data (disposition={write_disposition})",
+                 table_name=identifier,
+                 attempt=attempt + 1,
+                 max_attempts=max_retries,
+                 include_traceback=not retryable,  # Full trace for non-retryable errors
+             )
+
+             # Non-retryable errors should fail immediately
+             if not retryable:
+                 error_msg = get_user_friendly_error_message(
+                     e, f"write data to table {identifier}"
+                 )
+                 raise RuntimeError(error_msg) from e
+
+             # Retryable error - check if we should retry
+             if attempt >= max_retries - 1:
+                 # Max retries exhausted
+                 error_msg = get_user_friendly_error_message(
+                     e, f"write data to table {identifier} after {max_retries} attempts"
+                 )
+                 raise RuntimeError(error_msg) from e
+
+             # Retry with exponential backoff
+             sleep_time = retry_backoff_base ** attempt
+             logger.info(
+                 f"Retrying after {sleep_time}s (attempt {attempt + 2}/{max_retries})"
+             )
+             time.sleep(sleep_time)
+
+             # Refresh table state for next attempt
+             if table_exists:
+                 try:
+                     iceberg_table.refresh()
+                     logger.debug(f"Refreshed table state for {identifier}")
+                 except Exception as refresh_error:
+                     logger.warning(
+                         f"Failed to refresh table (will retry anyway): {refresh_error}"
+                     )
+
+
+ # Create the base destination factory
+ _iceberg_rest_base = dlt.destination(
+     _iceberg_rest_handler,
+     batch_size=0,
+     loader_file_format="parquet",
+     name="iceberg_rest",
+     naming_convention="snake_case",
+     skip_dlt_columns_and_tables=True,
+     max_parallel_load_jobs=5,
+     loader_parallelism_strategy="table-sequential",
+ )
+
+
+ # Wrap the factory to add merge support to capabilities
+ def iceberg_rest(**kwargs):
+     """
+     Iceberg REST destination factory with merge support.
+
+     Returns a destination instance configured for Iceberg with merge capabilities.
+     """
+     # Get the destination instance
+     dest = _iceberg_rest_base(**kwargs)
+
+     # Override the _raw_capabilities method to include merge support
+     original_raw_capabilities = dest._raw_capabilities
+
+     def _raw_capabilities_with_merge():
+         """Add merge support to the destination capabilities."""
+         caps = original_raw_capabilities()
+         caps.supported_merge_strategies = ["upsert"]
+         return caps
+
+     # Bind the new method to the instance
+     dest._raw_capabilities = _raw_capabilities_with_merge
+
+     return dest
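
To make the merge path and retry behavior concrete, here is a hedged sketch (not part of the package) of wiring the function-based factory above into a pipeline. The keyword arguments mirror the handler parameters; whether they can be passed to the factory directly, rather than resolved from dlt secrets/config for the iceberg_rest destination, depends on dlt's custom-destination configuration rules, so treat the explicit values as illustrative. The resource-level primary_key is what ends up in the column hints the handler reads to build join_cols for PyIceberg's upsert. With the defaults (max_retries=5, retry_backoff_base=2.0), retryable failures are retried after waits of 1, 2, 4, and 8 seconds before the final attempt.

import dlt
from dlt_iceberg import iceberg_rest_function_based  # the per-file factory defined above

@dlt.resource(primary_key="order_id", write_disposition="merge")
def orders():
    # primary_key becomes a column hint; the handler uses it as join_cols for upsert()
    yield [{"order_id": 1, "status": "open"}, {"order_id": 2, "status": "paid"}]

# Illustrative values only; a real setup would keep catalog_uri, credential, and
# oauth2_server_uri in secrets.toml or environment variables.
destination = iceberg_rest_function_based(
    catalog_uri="https://polaris.example.com/api/catalog",
    namespace="sales",
    credential="client_id:client_secret",
    oauth2_server_uri="https://auth.example.com/oauth/token",
)

pipeline = dlt.pipeline("orders_to_iceberg", destination=destination)
print(pipeline.run(orders()))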