dlt-iceberg 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dlt_iceberg/__init__.py +6 -0
- dlt_iceberg/adapter.py +276 -0
- dlt_iceberg/destination.py +117 -16
- dlt_iceberg/destination_client.py +455 -38
- dlt_iceberg/partition_builder.py +12 -6
- dlt_iceberg/schema_converter.py +4 -1
- dlt_iceberg/sql_client.py +222 -0
- dlt_iceberg-0.2.0.dist-info/METADATA +442 -0
- dlt_iceberg-0.2.0.dist-info/RECORD +14 -0
- {dlt_iceberg-0.1.3.dist-info → dlt_iceberg-0.2.0.dist-info}/WHEEL +1 -1
- dlt_iceberg-0.1.3.dist-info/METADATA +0 -279
- dlt_iceberg-0.1.3.dist-info/RECORD +0 -12
- {dlt_iceberg-0.1.3.dist-info → dlt_iceberg-0.2.0.dist-info}/licenses/LICENSE +0 -0
dlt_iceberg/destination_client.py
CHANGED

@@ -10,7 +10,7 @@ import time
 import threading
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Optional, Iterable, Tuple, Type
+from typing import Any, Dict, List, Optional, Iterable, Tuple, Type
 from types import TracebackType

 import pyarrow as pa
@@ -22,7 +22,10 @@ from dlt.common.destination.client import (
     LoadJob,
     RunnableLoadJob,
     DestinationClientConfiguration,
+    SupportsOpenTables,
 )
+from dlt.common.schema.typing import TTableFormat
+from dlt.destinations.sql_client import WithSqlClient, SqlClientBase
 from dlt.common.schema import Schema, TTableSchema
 from dlt.common.schema.typing import TTableSchema as PreparedTableSchema
 from pyiceberg.catalog import load_catalog
@@ -94,7 +97,19 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     strict_casting: bool = False

     # Merge batch size (for upsert operations to avoid memory issues)
-    merge_batch_size: int =
+    merge_batch_size: int = 500000
+
+    # Table location layout - controls directory structure for table files
+    # Supports patterns: {namespace}, {dataset_name}, {table_name}
+    # Example: "{namespace}/{table_name}" or "warehouse/{dataset_name}/{table_name}"
+    table_location_layout: Optional[str] = None
+
+    # Register tables found in storage but missing from catalog (backward compatibility)
+    register_new_tables: bool = False
+
+    # Hard delete column name - rows with this column set will be deleted during merge
+    # Set to None to disable hard delete
+    hard_delete_column: Optional[str] = "_dlt_deleted_at"


 class IcebergRestLoadJob(RunnableLoadJob):
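As an aside on the new configuration fields: a minimal sketch of how they might be passed to the destination factory. The import path and the kwargs-forwarding behavior are assumptions; only the field names and defaults come from IcebergRestConfiguration above.

    # hypothetical wiring; field names taken from the config class above
    from dlt_iceberg import iceberg_rest_class_based  # import path assumed

    dest = iceberg_rest_class_based(
        table_location_layout="{namespace}/{table_name}",  # relative layouts get the warehouse prepended
        register_new_tables=True,                          # pick up tables already present in storage
        hard_delete_column="_dlt_deleted_at",              # rows with this column set are deleted during merge
        merge_batch_size=500_000,
    )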
@@ -144,11 +159,12 @@ class IcebergRestLoadJob(RunnableLoadJob):
             raise


-class IcebergRestClient(JobClientBase):
+class IcebergRestClient(JobClientBase, WithSqlClient, SupportsOpenTables):
     """
     Class-based Iceberg REST destination with atomic multi-file commits.

     Accumulates files during load and commits them atomically in complete_load().
+    Implements WithSqlClient and SupportsOpenTables for pipeline.dataset() support.
     """

     def __init__(
@@ -162,6 +178,77 @@ class IcebergRestClient(JobClientBase):

         # Catalog instance (created lazily)
         self._catalog = None
+        # SQL client instance (created lazily)
+        self._sql_client = None
+
+    # ---- WithSqlClient interface ----
+
+    @property
+    def sql_client(self) -> SqlClientBase:
+        """Get or create the DuckDB SQL client for dataset access."""
+        if self._sql_client is None:
+            from .sql_client import IcebergSqlClient
+            self._sql_client = IcebergSqlClient(
+                remote_client=self,
+                dataset_name=self.config.namespace,
+            )
+        return self._sql_client
+
+    @property
+    def sql_client_class(self) -> Type[SqlClientBase]:
+        """Return the SQL client class."""
+        from .sql_client import IcebergSqlClient
+        return IcebergSqlClient
+
+    # ---- SupportsOpenTables interface ----
+
+    def get_open_table_catalog(self, table_format: TTableFormat, catalog_name: str = None) -> Any:
+        """Get the PyIceberg catalog for accessing table metadata."""
+        if table_format != "iceberg":
+            raise ValueError(f"Unsupported table format: {table_format}")
+        return self._get_catalog()
+
+    def get_open_table_location(self, table_format: TTableFormat, table_name: str) -> str:
+        """Get the storage location for an Iceberg table."""
+        if table_format != "iceberg":
+            raise ValueError(f"Unsupported table format: {table_format}")
+
+        # Try to get location from catalog
+        try:
+            catalog = self._get_catalog()
+            identifier = f"{self.config.namespace}.{table_name}"
+            iceberg_table = catalog.load_table(identifier)
+            return iceberg_table.location()
+        except NoSuchTableError:
+            # Table doesn't exist yet, compute expected location
+            location = self._get_table_location(table_name)
+            if location:
+                return location
+            # Fallback to default warehouse location
+            warehouse = self.config.warehouse or ""
+            if warehouse and not warehouse.endswith("/"):
+                warehouse += "/"
+            return f"{warehouse}{self.config.namespace}/{table_name}"
+
+    def load_open_table(self, table_format: TTableFormat, table_name: str, **kwargs: Any) -> Any:
+        """Load and return a PyIceberg Table object."""
+        if table_format != "iceberg":
+            raise ValueError(f"Unsupported table format: {table_format}")
+
+        from dlt.common.destination.exceptions import DestinationUndefinedEntity
+
+        catalog = self._get_catalog()
+        identifier = f"{self.config.namespace}.{table_name}"
+
+        try:
+            return catalog.load_table(identifier)
+        except NoSuchTableError as e:
+            raise DestinationUndefinedEntity(table_name) from e
+
+    def is_open_table(self, table_format: TTableFormat, table_name: str) -> bool:
+        """Check if a table uses the specified open table format."""
+        # All tables in this destination are Iceberg tables
+        return table_format == "iceberg"

     def _get_catalog(self):
         """Get or create catalog connection."""
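A sketch of what the two interfaces enable from the pipeline side, assuming a pipeline already configured against this destination; "pipeline" and "my_table" are placeholders and the accessor names reflect typical dlt usage rather than code from this package.

    dataset = pipeline.dataset()              # served by the DuckDB-backed sql_client
    df = dataset["my_table"].df()

    client = pipeline.destination_client()
    tbl = client.load_open_table("iceberg", "my_table")  # PyIceberg Table via SupportsOpenTables
    print(tbl.location())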
@@ -214,6 +301,123 @@ class IcebergRestClient(JobClientBase):
         self._catalog = load_catalog("dlt_catalog", **catalog_config)
         return self._catalog

+    def _get_table_location(self, table_name: str) -> Optional[str]:
+        """
+        Get the table location based on table_location_layout configuration.
+
+        Args:
+            table_name: Name of the table
+
+        Returns:
+            Table location string or None to use catalog default
+        """
+        if not self.config.table_location_layout:
+            return None
+
+        # Get warehouse base from config
+        warehouse = self.config.warehouse or ""
+
+        # Build location from layout pattern
+        location = self.config.table_location_layout.format(
+            namespace=self.config.namespace,
+            dataset_name=self.config.namespace,  # In dlt, dataset_name maps to namespace
+            table_name=table_name,
+        )
+
+        # If layout is relative (doesn't start with protocol), prepend warehouse
+        if not location.startswith(("s3://", "gs://", "az://", "file://", "hdfs://")):
+            # Ensure warehouse ends with / for proper joining
+            if warehouse and not warehouse.endswith("/"):
+                warehouse += "/"
+            location = f"{warehouse}{location}"
+
+        return location
+
+    def _register_tables_from_storage(self, catalog, namespace: str) -> None:
+        """
+        Register tables found in storage but missing from catalog.
+
+        Scans the warehouse directory for Iceberg metadata files and registers
+        any tables not already in the catalog. This provides backward compatibility
+        when tables exist in storage but haven't been registered.
+        """
+        if not self.config.register_new_tables:
+            return
+
+        if not self.config.warehouse:
+            logger.warning("Cannot register tables: no warehouse configured")
+            return
+
+        import os
+        from urllib.parse import urlparse
+
+        warehouse = self.config.warehouse
+
+        # Only support local filesystem for now
+        parsed = urlparse(warehouse)
+        if parsed.scheme and parsed.scheme != "file":
+            logger.info(
+                f"register_new_tables only supported for local filesystem, "
+                f"skipping for {parsed.scheme}"
+            )
+            return
+
+        # Get local path
+        local_path = parsed.path if parsed.scheme == "file" else warehouse
+        namespace_path = os.path.join(local_path, namespace)
+
+        if not os.path.exists(namespace_path):
+            logger.info(f"Namespace path {namespace_path} doesn't exist, nothing to register")
+            return
+
+        # Get existing tables in catalog
+        try:
+            existing_tables = {t[1] for t in catalog.list_tables(namespace)}
+        except NoSuchNamespaceError:
+            existing_tables = set()
+
+        # Scan for table directories with metadata
+        registered_count = 0
+        for item in os.listdir(namespace_path):
+            table_path = os.path.join(namespace_path, item)
+            if not os.path.isdir(table_path):
+                continue
+
+            # Check if it's an Iceberg table (has metadata directory)
+            metadata_path = os.path.join(table_path, "metadata")
+            if not os.path.exists(metadata_path):
+                continue
+
+            table_name = item
+            if table_name in existing_tables:
+                continue
+
+            # Find latest metadata file
+            metadata_files = [
+                f for f in os.listdir(metadata_path)
+                if f.endswith(".metadata.json")
+            ]
+            if not metadata_files:
+                continue
+
+            # Sort to get latest (by version number in filename)
+            metadata_files.sort(reverse=True)
+            latest_metadata = os.path.join(metadata_path, metadata_files[0])
+
+            try:
+                identifier = f"{namespace}.{table_name}"
+                catalog.register_table(
+                    identifier=identifier,
+                    metadata_location=f"file://{latest_metadata}",
+                )
+                logger.info(f"Registered table {identifier} from storage")
+                registered_count += 1
+            except Exception as e:
+                logger.warning(f"Failed to register table {table_name}: {e}")
+
+        if registered_count > 0:
+            logger.info(f"Registered {registered_count} tables from storage")
+
     def initialize_storage(self, truncate_tables: Optional[Iterable[str]] = None) -> None:
         """Create Iceberg namespace if it doesn't exist."""
         catalog = self._get_catalog()
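To illustrate the layout resolution in _get_table_location with made-up values (warehouse "s3://lake/wh", namespace "analytics"):

    # table_location_layout = "{namespace}/{table_name}"
    # _get_table_location("events") -> "analytics/events"               (after .format())
    #                               -> "s3://lake/wh/analytics/events"  (warehouse prepended, layout has no scheme)
    # a layout that already carries a scheme, e.g. "gs://other-bucket/{table_name}", is returned as-is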
@@ -230,6 +434,9 @@ class IcebergRestClient(JobClientBase):
             logger.error(f"Failed to initialize storage: {e}")
             raise

+        # Register tables from storage if enabled
+        self._register_tables_from_storage(catalog, namespace)
+
         # Handle truncation if requested
         if truncate_tables:
             for table_name in truncate_tables:
@@ -351,6 +558,168 @@ class IcebergRestClient(JobClientBase):

         logger.info(f"Load {load_id} completed successfully")

+    def _get_merge_strategy(self, table_schema: TTableSchema) -> str:
+        """Extract merge strategy from table schema.
+
+        write_disposition can be:
+        - "merge" (string) -> use upsert (backward compatible)
+        - {"disposition": "merge", "strategy": "delete-insert"} -> explicit strategy
+
+        Returns:
+            Merge strategy: "upsert" or "delete-insert"
+        """
+        write_disposition = table_schema.get("write_disposition", "append")
+
+        if isinstance(write_disposition, dict):
+            return write_disposition.get("strategy", "delete-insert")
+
+        # String "merge" - use upsert as our default (backward compatible)
+        return "upsert"
+
+    def _execute_delete_insert(
+        self,
+        iceberg_table,
+        combined_table: pa.Table,
+        primary_keys: List[str],
+        identifier: str,
+        hard_delete_filter=None,
+    ) -> Tuple[int, int, int]:
+        """Execute delete-insert merge strategy with optional hard deletes.
+
+        Deletes rows matching primary keys in incoming data, then appends new data.
+        Uses PyIceberg transaction for atomic hard-delete + delete + append.
+
+        Args:
+            iceberg_table: PyIceberg table object
+            combined_table: Arrow table with data to merge
+            primary_keys: List of primary key column names
+            identifier: Table identifier for logging
+            hard_delete_filter: Optional filter for hard deletes (rows to permanently remove)
+
+        Returns:
+            Tuple of (rows_deleted_estimate, rows_inserted, hard_deleted)
+        """
+        from pyiceberg.expressions import In, And, EqualTo, Or
+
+        # Build delete filter from primary key values in incoming data
+        if len(primary_keys) == 1:
+            pk_col = primary_keys[0]
+            pk_values = combined_table.column(pk_col).to_pylist()
+            # Deduplicate values
+            unique_pk_values = list(set(pk_values))
+            delete_filter = In(pk_col, unique_pk_values)
+            deleted_estimate = len(unique_pk_values)
+        else:
+            # Composite primary key - build OR of AND conditions
+            pk_tuples = set()
+            for i in range(len(combined_table)):
+                pk_tuple = tuple(
+                    combined_table.column(pk).to_pylist()[i] for pk in primary_keys
+                )
+                pk_tuples.add(pk_tuple)
+
+            conditions = []
+            for pk_tuple in pk_tuples:
+                and_conditions = [
+                    EqualTo(pk, val) for pk, val in zip(primary_keys, pk_tuple)
+                ]
+                if len(and_conditions) == 1:
+                    conditions.append(and_conditions[0])
+                else:
+                    conditions.append(And(*and_conditions))
+
+            if len(conditions) == 1:
+                delete_filter = conditions[0]
+            else:
+                delete_filter = Or(*conditions)
+            deleted_estimate = len(pk_tuples)
+
+        logger.info(
+            f"Delete-insert for {identifier}: deleting up to {deleted_estimate} "
+            f"matching rows, inserting {len(combined_table)} rows"
+        )
+
+        # Execute atomic hard-delete + delete + append using single transaction
+        with iceberg_table.transaction() as txn:
+            # Hard deletes first (permanent removal)
+            if hard_delete_filter is not None:
+                txn.delete(hard_delete_filter)
+            # Then delete-insert for merge
+            txn.delete(delete_filter)
+            txn.append(combined_table)
+
+        return (deleted_estimate, len(combined_table), 1 if hard_delete_filter else 0)
+
+    def _prepare_hard_deletes(
+        self,
+        combined_table: pa.Table,
+        primary_keys: List[str],
+    ) -> Tuple[pa.Table, Optional[Any], int]:
+        """
+        Prepare hard deletes from incoming data (does not execute).
+
+        Rows with the hard_delete_column set (non-null) will be deleted.
+        Returns the filter expression to use in a transaction.
+
+        Args:
+            combined_table: Arrow table with data including possible delete markers
+            primary_keys: List of primary key column names
+
+        Returns:
+            Tuple of (remaining_rows, delete_filter_or_none, num_to_delete)
+        """
+        hard_delete_col = self.config.hard_delete_column
+
+        # Check if hard delete column exists in data
+        if not hard_delete_col or hard_delete_col not in combined_table.column_names:
+            return combined_table, None, 0
+
+        from pyiceberg.expressions import In, And, EqualTo, Or
+        import pyarrow.compute as pc
+
+        # Get the delete marker column
+        delete_col = combined_table.column(hard_delete_col)
+
+        # Find rows marked for deletion (non-null values)
+        delete_mask = pc.is_valid(delete_col)
+        rows_to_delete = combined_table.filter(delete_mask)
+        rows_to_keep = combined_table.filter(pc.invert(delete_mask))
+
+        if len(rows_to_delete) == 0:
+            return rows_to_keep, None, 0
+
+        # Build delete filter from primary keys of rows to delete
+        if len(primary_keys) == 1:
+            pk_col = primary_keys[0]
+            pk_values = rows_to_delete.column(pk_col).to_pylist()
+            unique_pk_values = list(set(pk_values))
+            delete_filter = In(pk_col, unique_pk_values)
+        else:
+            # Composite primary key
+            pk_tuples = set()
+            for i in range(len(rows_to_delete)):
+                pk_tuple = tuple(
+                    rows_to_delete.column(pk).to_pylist()[i] for pk in primary_keys
+                )
+                pk_tuples.add(pk_tuple)
+
+            conditions = []
+            for pk_tuple in pk_tuples:
+                and_conditions = [
+                    EqualTo(pk, val) for pk, val in zip(primary_keys, pk_tuple)
+                ]
+                if len(and_conditions) == 1:
+                    conditions.append(and_conditions[0])
+                else:
+                    conditions.append(And(*and_conditions))
+
+            if len(conditions) == 1:
+                delete_filter = conditions[0]
+            else:
+                delete_filter = Or(*conditions)
+
+        return rows_to_keep, delete_filter, len(rows_to_delete)
+
     def _commit_table_files(
         self,
         catalog,
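For reference, the delete filters built above reduce to PyIceberg expressions like these (keys and values are made up for illustration):

    from pyiceberg.expressions import And, EqualTo, In, Or

    # single primary key "id" with incoming ids {1, 2, 3}
    In("id", [1, 2, 3])

    # composite key ("customer_id", "order_id") with two incoming rows
    Or(
        And(EqualTo("customer_id", 1), EqualTo("order_id", 10)),
        And(EqualTo("customer_id", 2), EqualTo("order_id", 20)),
    )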
@@ -401,11 +770,19 @@ class IcebergRestClient(JobClientBase):

             # Create table
             logger.info(f"Creating table {identifier}")
-
-
-
-
-
+
+            # Get custom location if configured
+            table_location = self._get_table_location(table_name)
+            create_kwargs = {
+                "identifier": identifier,
+                "schema": iceberg_schema,
+                "partition_spec": partition_spec,
+            }
+            if table_location:
+                create_kwargs["location"] = table_location
+                logger.info(f"Using custom location: {table_location}")
+
+            iceberg_table = catalog.create_table(**create_kwargs)
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
@@ -449,13 +826,18 @@ class IcebergRestClient(JobClientBase):
         )

         # ATOMIC COMMIT: Write all data in one transaction
-
+        # Handle both string and dict write_disposition
+        disposition_type = write_disposition
+        if isinstance(write_disposition, dict):
+            disposition_type = write_disposition.get("disposition", "append")
+
+        if disposition_type == "replace":
             logger.info(f"Overwriting table {identifier}")
             iceberg_table.overwrite(combined_table)
-        elif
+        elif disposition_type == "append":
             logger.info(f"Appending to table {identifier}")
             iceberg_table.append(combined_table)
-        elif
+        elif disposition_type == "merge":
             # Get primary keys
             primary_keys = table_schema.get("primary_key") or table_schema.get("x-merge-keys")

@@ -473,38 +855,73 @@ class IcebergRestClient(JobClientBase):
                 )
                 iceberg_table.append(combined_table)
             else:
-
-
-
-
-
-
-
-
-
-
+                # Prepare hard deletes (rows marked for deletion)
+                remaining_rows, hard_delete_filter, num_hard_deletes = self._prepare_hard_deletes(
+                    combined_table, primary_keys
+                )
+                if num_hard_deletes > 0:
+                    logger.info(f"Prepared {num_hard_deletes} rows for hard delete")
+
+                # If all rows were hard deletes, just execute the delete
+                if len(remaining_rows) == 0:
+                    if hard_delete_filter is not None:
+                        iceberg_table.delete(hard_delete_filter)
+                        logger.info(f"Executed {num_hard_deletes} hard deletes (no merge needed)")
+                    return
+
+                # Get merge strategy
+                merge_strategy = self._get_merge_strategy(table_schema)
+                logger.info(
+                    f"Merging into table {identifier} on keys {primary_keys} "
+                    f"using strategy: {merge_strategy}"
+                )

+                if merge_strategy == "delete-insert":
+                    # Atomic hard-delete + delete + insert in single transaction
+                    deleted, inserted, _ = self._execute_delete_insert(
+                        iceberg_table, remaining_rows, primary_keys, identifier,
+                        hard_delete_filter=hard_delete_filter
+                    )
                     logger.info(
-                        f"
-                        f"
+                        f"Delete-insert completed: ~{deleted} deleted, "
+                        f"{inserted} inserted"
                     )
+                else:
+                    # Default: upsert strategy
+                    # Execute hard deletes first (separate transaction since upsert is atomic)
+                    if hard_delete_filter is not None:
+                        iceberg_table.delete(hard_delete_filter)
+                        logger.info(f"Executed {num_hard_deletes} hard deletes before upsert")
+
+                    batch_size = self.config.merge_batch_size
+                    total_updated = 0
+                    total_inserted = 0
+
+                    for batch_start in range(0, len(remaining_rows), batch_size):
+                        batch_end = min(batch_start + batch_size, len(remaining_rows))
+                        batch = remaining_rows.slice(batch_start, batch_end - batch_start)
+
+                        logger.info(
+                            f"Upserting batch {batch_start//batch_size + 1}: "
+                            f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                        )
+
+                        upsert_result = iceberg_table.upsert(
+                            df=batch,
+                            join_cols=primary_keys,
+                            when_matched_update_all=True,
+                            when_not_matched_insert_all=True,
+                        )
+
+                        total_updated += upsert_result.rows_updated
+                        total_inserted += upsert_result.rows_inserted

-
-
-
-                        when_matched_update_all=True,
-                        when_not_matched_insert_all=True,
+                    logger.info(
+                        f"Upsert completed: {total_updated} updated, "
+                        f"{total_inserted} inserted across {(len(remaining_rows) + batch_size - 1) // batch_size} batches"
                     )
-
-                    total_updated += upsert_result.rows_updated
-                    total_inserted += upsert_result.rows_inserted
-
-                    logger.info(
-                        f"Upsert completed: {total_updated} updated, "
-                        f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
-                    )
         else:
-            raise ValueError(f"Unknown write disposition: {
+            raise ValueError(f"Unknown write disposition: {disposition_type}")

         logger.info(
             f"Successfully committed {len(file_data)} files "
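On the pipeline side, the dict form of write_disposition handled by this dispatch is declared on a dlt resource; a minimal sketch (resource name and rows are placeholders):

    import dlt

    @dlt.resource(
        primary_key="id",
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    )
    def users():
        yield [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]

    # plain write_disposition="merge" keeps the backward-compatible upsert path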
@@ -600,7 +1017,7 @@ class iceberg_rest_class_based(Destination[IcebergRestConfiguration, "IcebergRes
         caps.supported_staging_file_formats = []

         # Merge strategies (we handle upsert ourselves)
-        caps.supported_merge_strategies = ["upsert"]
+        caps.supported_merge_strategies = ["delete-insert", "upsert"]

         # Replace strategies
         caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"]
dlt_iceberg/partition_builder.py
CHANGED

@@ -35,6 +35,7 @@ from pyiceberg.transforms import (
 )
 from pyiceberg.types import (
     TimestampType,
+    TimestamptzType,
     DateType,
     StringType,
     IntegerType,
@@ -92,14 +93,14 @@ def validate_transform_for_type(
     Raises:
         ValueError: If transform is invalid for the field type
     """
-    # Temporal transforms only for timestamp/date
+    # Temporal transforms only for timestamp/timestamptz/date
    temporal_transforms = {"year", "month", "day", "hour"}
    if transform_type in temporal_transforms:
-        if not isinstance(field_type, (TimestampType, DateType)):
+        if not isinstance(field_type, (TimestampType, TimestamptzType, DateType)):
            raise ValueError(
                f"Temporal transform '{transform_type}' cannot be applied to "
                f"column '{col_name}' with type {field_type}. "
-                f"Use timestamp or date types for temporal transforms."
+                f"Use timestamp, timestamptz, or date types for temporal transforms."
            )

    # Bucket transform validation
@@ -181,16 +182,21 @@ def build_partition_spec(
             continue

         # Choose transform based on data type
+        col_hints = dlt_columns.get(col_name, {})
         transform = choose_partition_transform(
-            iceberg_field.field_type, col_name,
+            iceberg_field.field_type, col_name, col_hints
         )

+        # Get custom partition field name or generate default
+        custom_name = col_hints.get("x-partition-name") or col_hints.get("partition_name")
+        partition_name = custom_name or f"{col_name}_{get_transform_name(transform)}"
+
         # Create partition field
         partition_field = PartitionField(
             source_id=iceberg_field.field_id,
             field_id=1000 + len(partition_fields),  # Start partition IDs at 1000
             transform=transform,
-            name=
+            name=partition_name,
         )
         partition_fields.append(partition_field)

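The custom partition name is read from dlt column hints; how those hints are attached is up to the adapter (see adapter.py in the file list), so the dict below is only an assumed shape of dlt_columns as consumed by build_partition_spec:

    # hypothetical column hints; only the "x-partition-name" key is taken from the code above
    dlt_columns = {
        "created_at": {
            "data_type": "timestamp",
            "x-partition-name": "created_month",  # overrides the default "<column>_<transform>" field name
        }
    }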
@@ -266,7 +272,7 @@ def choose_partition_transform(field_type, col_name: str, col_hints: dict):
         )

     # No hint specified - use defaults based on type
-    if isinstance(field_type, (TimestampType, DateType)):
+    if isinstance(field_type, (TimestampType, TimestamptzType, DateType)):
         # Default to month for timestamps/dates
         return MonthTransform()
     elif isinstance(field_type, (StringType, IntegerType, LongType)):
dlt_iceberg/schema_converter.py
CHANGED

@@ -18,6 +18,7 @@ from pyiceberg.types import (
     StringType,
     BinaryType,
     TimestampType,
+    TimestamptzType,
     DateType,
     TimeType,
     ListType,
@@ -125,6 +126,8 @@ def convert_arrow_to_iceberg_type(arrow_type: pa.DataType):

     # Temporal types
     elif pa.types.is_timestamp(arrow_type):
+        if arrow_type.tz is not None:
+            return TimestamptzType()
         return TimestampType()
     elif pa.types.is_date(arrow_type):
         return DateType()
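A quick check of the new timezone branch (assuming convert_arrow_to_iceberg_type is importable from dlt_iceberg.schema_converter):

    import pyarrow as pa
    from dlt_iceberg.schema_converter import convert_arrow_to_iceberg_type

    convert_arrow_to_iceberg_type(pa.timestamp("us", tz="UTC"))  # -> TimestamptzType()
    convert_arrow_to_iceberg_type(pa.timestamp("us"))            # -> TimestampType()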
@@ -197,7 +200,7 @@ def convert_dlt_type_to_iceberg_type(dlt_type: str):
         "bool": BooleanType(),
         "boolean": BooleanType(),
         "timestamp": TimestampType(),
-        "timestamptz":
+        "timestamptz": TimestamptzType(),
         "date": DateType(),
         "time": TimeType(),
         "binary": BinaryType(),