dlt-iceberg 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,606 @@
+ """
+ Class-based Iceberg REST destination with atomic multi-file commits.
+
+ This implementation uses dlt's JobClientBase interface to provide full lifecycle
+ hooks, enabling atomic commits of multiple files per table.
+ """
+
+ import logging
+ import time
+ import threading
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Dict, List, Optional, Iterable, Tuple, Type
+ from types import TracebackType
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from dlt.common.configuration import configspec
+ from dlt.common.destination import DestinationCapabilitiesContext, Destination
+ from dlt.common.destination.client import (
+     JobClientBase,
+     LoadJob,
+     RunnableLoadJob,
+     DestinationClientConfiguration,
+ )
+ from dlt.common.schema import Schema, TTableSchema
+ from dlt.common.schema.typing import TTableSchema as PreparedTableSchema
+ from pyiceberg.catalog import load_catalog
+ from pyiceberg.exceptions import (
+     NoSuchTableError,
+     NoSuchNamespaceError,
+ )
+
+ from .schema_converter import convert_dlt_to_iceberg_schema
+ from .partition_builder import build_partition_spec
+ from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
+ from .schema_casting import cast_table_safe, CastingError
+ from .error_handling import (
+     is_retryable_error,
+     log_error_with_context,
+     get_user_friendly_error_message,
+ )
+ from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+ logger = logging.getLogger(__name__)
+
+
+ # MODULE-LEVEL STATE: Accumulate files across client instances
+ # Key: load_id, Value: {table_name: [(table_schema, file_path, arrow_table), ...]}
+ # This works because dlt creates multiple client instances but they're all in the same process
+ _PENDING_FILES: Dict[str, Dict[str, List[Tuple[TTableSchema, str, pa.Table]]]] = {}
+ _PENDING_FILES_LOCK = threading.Lock()
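+ # Illustrative shape (hypothetical load_id and file path, for orientation only):
+ #   _PENDING_FILES["1700000000.123456"]["users"] == [(<dlt table schema>, "/load/users.0.parquet", <pa.Table>)]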
+
+
+ @configspec
+ class IcebergRestConfiguration(DestinationClientConfiguration):
+     """Configuration for Iceberg REST catalog destination."""
+
+     destination_type: str = "iceberg_rest"
+
+     # Catalog configuration
+     catalog_uri: Optional[str] = None
+     warehouse: Optional[str] = None
+     namespace: str = "default"
+
+     # Authentication (OAuth2)
+     credential: Optional[str] = None
+     oauth2_server_uri: Optional[str] = None
+     scope: Optional[str] = "PRINCIPAL_ROLE:ALL"
+
+     # Or Bearer token
+     token: Optional[str] = None
+
+     # AWS SigV4 (for Glue)
+     sigv4_enabled: bool = False
+     signing_region: Optional[str] = None
+     signing_name: str = "execute-api"
+
+     # S3 configuration
+     s3_endpoint: Optional[str] = None
+     s3_access_key_id: Optional[str] = None
+     s3_secret_access_key: Optional[str] = None
+     s3_region: Optional[str] = None
+
+     # Retry configuration
+     max_retries: int = 5
+     retry_backoff_base: float = 2.0
+
+     # Schema casting configuration
+     strict_casting: bool = False
+
+
+ class IcebergRestLoadJob(RunnableLoadJob):
+     """
+     Load job that processes a single parquet file.
+
+     This job validates the file and accumulates it for atomic commit in complete_load().
+     """
+
+     def __init__(self, file_path: str) -> None:
+         super().__init__(file_path)
+         self._client: Optional["IcebergRestClient"] = None
+
+     def run(self) -> None:
+         """
+         Process the file: read, validate, and register for batch commit.
+
+         This does NOT commit to Iceberg - that happens in complete_load().
+         """
+         try:
+             # Read parquet file
+             file_path = Path(self._file_path)
+             if not file_path.exists():
+                 raise FileNotFoundError(f"Parquet file not found: {file_path}")
+
+             arrow_table = pq.read_table(str(file_path))
+             logger.info(f"Read {len(arrow_table)} rows from {file_path.name}")
+
+             # Get table info from load context
+             table_name = self._load_table["name"]
+
+             # Register file for batch commit (using global state)
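+             # Note: the Arrow table stays in memory (module-level state) until complete_load() commits it.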
+             self._client.register_pending_file(
+                 load_id=self._load_id,
+                 table_schema=self._load_table,
+                 table_name=table_name,
+                 file_path=str(file_path),
+                 arrow_table=arrow_table,
+             )
+
+             logger.info(
+                 f"Registered file for batch commit: {table_name}/{file_path.name}"
+             )
+
+         except Exception as e:
+             logger.error(f"Failed to process file {self._file_path}: {e}")
+             raise
+
+
+ class IcebergRestClient(JobClientBase):
+     """
+     Class-based Iceberg REST destination with atomic multi-file commits.
+
+     Accumulates files during load and commits them atomically in complete_load().
+     """
+
+     def __init__(
+         self,
+         schema: Schema,
+         config: IcebergRestConfiguration,
+         capabilities: DestinationCapabilitiesContext,
+     ) -> None:
+         super().__init__(schema, config, capabilities)
+         self.config: IcebergRestConfiguration = config
+
+         # Catalog instance (created lazily)
+         self._catalog = None
+
+     def _get_catalog(self):
+         """Get or create catalog connection."""
+         if self._catalog is not None:
+             return self._catalog
+
+         if not self.config.catalog_uri:
+             raise ValueError("catalog_uri must be configured for the Iceberg REST destination")
+
+         # Build catalog configuration
+         if self.config.catalog_uri.startswith(("sqlite://", "postgresql://")):
+             catalog_type = "sql"
+         else:
+             catalog_type = "rest"
+
+         catalog_config = {
+             "type": catalog_type,
+             "uri": self.config.catalog_uri,
+         }
+
+         if self.config.warehouse:
+             catalog_config["warehouse"] = self.config.warehouse
+
+         # Add authentication
+         if self.config.credential and self.config.oauth2_server_uri:
+             catalog_config["credential"] = self.config.credential
+             catalog_config["oauth2-server-uri"] = self.config.oauth2_server_uri
+             if self.config.scope:
+                 catalog_config["scope"] = self.config.scope
+         elif self.config.token:
+             catalog_config["token"] = self.config.token
+
+         # AWS SigV4
+         if self.config.sigv4_enabled:
+             catalog_config["rest.sigv4-enabled"] = "true"
+             if self.config.signing_region:
+                 catalog_config["rest.signing-region"] = self.config.signing_region
+             catalog_config["rest.signing-name"] = self.config.signing_name
+
+         # S3 configuration
+         if self.config.s3_endpoint:
+             catalog_config["s3.endpoint"] = self.config.s3_endpoint
+         if self.config.s3_access_key_id:
+             catalog_config["s3.access-key-id"] = self.config.s3_access_key_id
+         if self.config.s3_secret_access_key:
+             catalog_config["s3.secret-access-key"] = self.config.s3_secret_access_key
+         if self.config.s3_region:
+             catalog_config["s3.region"] = self.config.s3_region
+
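+         # The keys above follow pyiceberg's load_catalog() property names (e.g. "uri", "token", "s3.endpoint").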
+         logger.info(
+             f"Creating catalog connection (type={catalog_type}, uri={self.config.catalog_uri})"
+         )
+         self._catalog = load_catalog("dlt_catalog", **catalog_config)
+         return self._catalog
+
+     def initialize_storage(self, truncate_tables: Optional[Iterable[str]] = None) -> None:
+         """Create Iceberg namespace if it doesn't exist."""
+         catalog = self._get_catalog()
+         namespace = self.config.namespace
+
+         try:
+             namespaces = catalog.list_namespaces()
+             if (namespace,) not in namespaces:
+                 logger.info(f"Creating namespace {namespace}")
+                 catalog.create_namespace(namespace)
+             else:
+                 logger.info(f"Namespace {namespace} already exists")
+         except Exception as e:
+             logger.error(f"Failed to initialize storage: {e}")
+             raise
+
+         # Handle truncation if requested
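+         # Note: "truncation" drops the Iceberg table entirely; it is recreated from the incoming schema on the next commit.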
+         if truncate_tables:
+             for table_name in truncate_tables:
+                 identifier = f"{namespace}.{table_name}"
+                 try:
+                     catalog.drop_table(identifier)
+                     logger.info(f"Truncated table {identifier}")
+                 except NoSuchTableError:
+                     pass  # Table doesn't exist, nothing to truncate
+
+     def is_storage_initialized(self) -> bool:
+         """Check if namespace exists."""
+         try:
+             catalog = self._get_catalog()
+             namespace = self.config.namespace
+             namespaces = catalog.list_namespaces()
+             return (namespace,) in namespaces
+         except Exception:
+             return False
+
+     def drop_storage(self) -> None:
+         """Drop all tables in the namespace."""
+         catalog = self._get_catalog()
+         namespace = self.config.namespace
+
+         try:
+             for table_identifier in catalog.list_tables(namespace):
+                 catalog.drop_table(table_identifier)
+                 logger.info(f"Dropped table {table_identifier}")
+         except NoSuchNamespaceError:
+             pass  # Namespace doesn't exist
+
+     def create_load_job(
+         self,
+         table: PreparedTableSchema,
+         file_path: str,
+         load_id: str,
+         restore: bool = False,
+     ) -> LoadJob:
+         """
+         Create a load job for a single file.
+
+         The job will register the file for batch commit but not commit yet.
+         """
+         job = IcebergRestLoadJob(file_path)
+         # Set reference to client so job can register files
+         job._client = self
+         return job
+
+     def register_pending_file(
+         self,
+         load_id: str,
+         table_schema: TTableSchema,
+         table_name: str,
+         file_path: str,
+         arrow_table: pa.Table,
+     ) -> None:
+         """
+         Register a file for batch commit in complete_load().
+
+         Uses module-level global state since dlt creates multiple client instances.
+         """
+         with _PENDING_FILES_LOCK:
+             if load_id not in _PENDING_FILES:
+                 _PENDING_FILES[load_id] = defaultdict(list)
+
+             _PENDING_FILES[load_id][table_name].append(
+                 (table_schema, file_path, arrow_table)
+             )
+
+             file_count = len(_PENDING_FILES[load_id][table_name])
+             logger.info(
+                 f"Registered file for {table_name} in load {load_id}: "
+                 f"{file_count} files pending"
+             )
+
+     def complete_load(self, load_id: str) -> None:
+         """
+         ATOMIC COMMIT: Process all accumulated files in a single transaction per table.
+
+         Called once by dlt after all individual file jobs complete successfully.
+         Reads from module-level global state since dlt creates multiple client instances.
+         """
+         with _PENDING_FILES_LOCK:
+             if load_id not in _PENDING_FILES or not _PENDING_FILES[load_id]:
+                 logger.info(f"No files to commit for load {load_id}")
+                 return
+
+             # Copy data and clear immediately (under lock)
+             pending_files = dict(_PENDING_FILES[load_id])
+             del _PENDING_FILES[load_id]
+
+         catalog = self._get_catalog()
+         namespace = self.config.namespace
+
+         total_files = sum(len(files) for files in pending_files.values())
+         logger.info(
+             f"Committing {total_files} files across "
+             f"{len(pending_files)} tables for load {load_id}"
+         )
+
+         # Process each table
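+         # Atomicity is per table: each table's files land in a single snapshot; tables are committed one after another.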
+         for table_name, file_data in pending_files.items():
+             identifier = f"{namespace}.{table_name}"
+
+             try:
+                 self._commit_table_files(
+                     catalog=catalog,
+                     identifier=identifier,
+                     table_name=table_name,
+                     file_data=file_data,
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Failed to commit files for table {identifier}: {e}",
+                     exc_info=True,
+                 )
+                 raise
+
+         logger.info(f"Load {load_id} completed successfully")
+
+     def _commit_table_files(
+         self,
+         catalog,
+         identifier: str,
+         table_name: str,
+         file_data: List[Tuple[TTableSchema, str, pa.Table]],
+     ) -> None:
+         """
+         Commit all files for a single table atomically.
+
+         This is where the atomic magic happens - all files go into one Iceberg snapshot.
+         """
+         # Get table schema and write disposition from first file
+         table_schema, _, _ = file_data[0]
+         write_disposition = table_schema.get("write_disposition", "append")
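+         # dlt write dispositions handled below: "append" (default), "replace", and "merge".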
+
+         logger.info(
+             f"Processing {len(file_data)} files for {identifier} "
+             f"with disposition {write_disposition}"
+         )
+
+         # Retry loop for transient failures
+         for attempt in range(self.config.max_retries):
+             try:
+                 # Check if table exists
+                 table_exists = False
+                 try:
+                     iceberg_table = catalog.load_table(identifier)
+                     table_exists = True
+                     logger.info(f"Loaded existing table {identifier}")
+                 except NoSuchTableError:
+                     logger.info(f"Table {identifier} does not exist, will create")
+
+                 # Create table if needed
+                 if not table_exists:
+                     # Use first file's Arrow table to generate schema
+                     first_arrow_table = file_data[0][2]
+                     iceberg_schema = convert_dlt_to_iceberg_schema(
+                         table_schema, first_arrow_table
+                     )
+
+                     # Build partition spec
+                     partition_spec = build_partition_spec(table_schema, iceberg_schema)
+                     if partition_spec is None:
+                         from pyiceberg.partitioning import PartitionSpec
+                         partition_spec = PartitionSpec()
+
+                     # Create table
+                     logger.info(f"Creating table {identifier}")
+                     iceberg_table = catalog.create_table(
+                         identifier=identifier,
+                         schema=iceberg_schema,
+                         partition_spec=partition_spec,
+                     )
+                     logger.info(f"Created table {identifier} at {iceberg_table.location()}")
+                 else:
+                     # Table exists - check if schema evolution is needed
+                     first_arrow_table = file_data[0][2]
+                     incoming_schema = convert_dlt_to_iceberg_schema(
+                         table_schema, first_arrow_table
+                     )
+
+                     schema_evolved = evolve_schema_if_needed(
+                         iceberg_table,
+                         incoming_schema,
+                         allow_column_drops=False,
+                     )
+                     if schema_evolved:
+                         logger.info(f"Schema evolved for table {identifier}")
+                         iceberg_table = catalog.load_table(identifier)
+
+                 # Combine all Arrow tables and cast to match Iceberg schema
+                 expected_schema = schema_to_pyarrow(iceberg_table.schema())
+                 combined_tables = []
+
+                 for _, file_path, arrow_table in file_data:
+                     # Cast each table to match Iceberg schema
+                     casted_table = cast_table_safe(
+                         arrow_table,
+                         expected_schema,
+                         strict=self.config.strict_casting,
+                     )
+                     combined_tables.append(casted_table)
+
+                 # Concatenate all tables
+                 combined_table = pa.concat_tables(combined_tables)
+                 total_rows = len(combined_table)
+
+                 logger.info(
+                     f"Combined {len(file_data)} files into {total_rows} rows "
+                     f"for table {identifier}"
+                 )
+
+                 # ATOMIC COMMIT: Write all data in one transaction
+                 if write_disposition == "replace":
+                     logger.info(f"Overwriting table {identifier}")
+                     iceberg_table.overwrite(combined_table)
+                 elif write_disposition == "append":
+                     logger.info(f"Appending to table {identifier}")
+                     iceberg_table.append(combined_table)
+                 elif write_disposition == "merge":
+                     # Get primary keys
+                     primary_keys = table_schema.get("primary_key") or table_schema.get("x-merge-keys")
+
+                     if not primary_keys:
+                         columns = table_schema.get("columns", {})
+                         primary_keys = [
+                             col_name
+                             for col_name, col_def in columns.items()
+                             if col_def.get("primary_key") or col_def.get("x-primary-key")
+                         ]
+
+                     if not primary_keys:
+                         logger.warning(
+                             "Merge disposition requires primary_key, falling back to append"
+                         )
+                         iceberg_table.append(combined_table)
+                     else:
+                         logger.info(f"Merging into table {identifier} on keys {primary_keys}")
+                         upsert_result = iceberg_table.upsert(
+                             df=combined_table,
+                             join_cols=primary_keys,
+                             when_matched_update_all=True,
+                             when_not_matched_insert_all=True,
+                         )
+                         logger.info(
+                             f"Upsert completed: {upsert_result.rows_updated} updated, "
+                             f"{upsert_result.rows_inserted} inserted"
+                         )
+                 else:
+                     raise ValueError(f"Unknown write disposition: {write_disposition}")
+
+                 logger.info(
+                     f"Successfully committed {len(file_data)} files "
+                     f"({total_rows} rows) to {identifier}"
+                 )
+                 return  # Success
+
+             except (CastingError, SchemaEvolutionError) as e:
+                 # Non-retryable errors
+                 log_error_with_context(
+                     e,
+                     operation=f"commit files to {identifier}",
+                     table_name=table_name,
+                     include_traceback=True,
+                 )
+                 raise
+
+             except Exception as e:
+                 # Check if retryable
+                 retryable = is_retryable_error(e)
+
+                 log_error_with_context(
+                     e,
+                     operation=f"commit files to {identifier}",
+                     table_name=table_name,
+                     attempt=attempt + 1,
+                     max_attempts=self.config.max_retries,
+                     include_traceback=not retryable,
+                 )
+
+                 if not retryable:
+                     error_msg = get_user_friendly_error_message(
+                         e, f"commit files to {identifier}"
+                     )
+                     raise RuntimeError(error_msg) from e
+
+                 if attempt >= self.config.max_retries - 1:
+                     error_msg = get_user_friendly_error_message(
+                         e,
+                         f"commit files to {identifier} after {self.config.max_retries} attempts",
+                     )
+                     raise RuntimeError(error_msg) from e
+
+                 # Retry with exponential backoff
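+                 # With the defaults (retry_backoff_base=2.0, max_retries=5) this waits 1s, 2s, 4s, then 8s between attempts.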
+                 sleep_time = self.config.retry_backoff_base ** attempt
+                 logger.info(
+                     f"Retrying after {sleep_time}s (attempt {attempt + 2}/{self.config.max_retries})"
+                 )
+                 time.sleep(sleep_time)
+
+     def __enter__(self) -> "IcebergRestClient":
+         return self
+
+     def __exit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_val: Optional[BaseException],
+         exc_tb: Optional[TracebackType],
+     ) -> None:
+         # Cleanup catalog connection if needed
+         if self._catalog is not None:
+             # PyIceberg doesn't have explicit close, but clear reference
+             self._catalog = None
+
+
+ class iceberg_rest_class_based(Destination[IcebergRestConfiguration, "IcebergRestClient"]):
+     """
+     Iceberg REST destination with atomic multi-file commits.
+
+     This uses the class-based destination API to provide full lifecycle hooks,
+     enabling atomic commits of multiple files per table.
+
+     Usage:
+         pipeline = dlt.pipeline(
+             destination=iceberg_rest_class_based(
+                 catalog_uri="http://localhost:19120/iceberg/v1/main",
+                 warehouse="s3://bucket/warehouse",
+                 namespace="default",
+             )
+         )
+     """
+
+     spec = IcebergRestConfiguration
+
+     def _raw_capabilities(self) -> DestinationCapabilitiesContext:
+         """Define capabilities for Iceberg REST destination."""
+         caps = DestinationCapabilitiesContext()
+
+         # File formats
+         caps.preferred_loader_file_format = "parquet"
+         caps.supported_loader_file_formats = ["parquet"]
+         caps.preferred_staging_file_format = None
+         caps.supported_staging_file_formats = []
+
+         # Merge strategies (we handle upsert ourselves)
+         caps.supported_merge_strategies = ["upsert"]
+
+         # Replace strategies
+         caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"]
+
+         # Identifiers
+         caps.escape_identifier = lambda x: f"`{x}`"
+         caps.escape_literal = lambda x: f"'{x}'"
+         caps.casefold_identifier = str.lower
+         caps.has_case_sensitive_identifiers = True
+
+         # Precision
+         caps.decimal_precision = (38, 9)
+         caps.wei_precision = (38, 0)
+         caps.timestamp_precision = 6
+
+         # Limits
+         caps.max_identifier_length = 255
+         caps.max_column_identifier_length = 255
+         caps.max_query_length = 1024 * 1024
+         caps.is_max_query_length_in_bytes = True
+         caps.max_text_data_type_length = 1024 * 1024 * 1024
+         caps.is_max_text_data_type_length_in_bytes = True
+
+         # Transactions (Iceberg handles its own)
+         caps.supports_ddl_transactions = False
+         caps.supports_transactions = False
+         caps.supports_multiple_statements = False
+
+         return caps
+
+     @property
+     def client_class(self) -> Type["IcebergRestClient"]:
+         return IcebergRestClient