dlt-iceberg 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

@@ -10,7 +10,7 @@ import time
  import threading
  from collections import defaultdict
  from pathlib import Path
- from typing import Dict, List, Optional, Iterable, Tuple, Type
+ from typing import Any, Dict, List, Optional, Iterable, Tuple, Type
  from types import TracebackType

  import pyarrow as pa
@@ -22,7 +22,10 @@ from dlt.common.destination.client import (
  LoadJob,
  RunnableLoadJob,
  DestinationClientConfiguration,
+ SupportsOpenTables,
  )
+ from dlt.common.schema.typing import TTableFormat
+ from dlt.destinations.sql_client import WithSqlClient, SqlClientBase
  from dlt.common.schema import Schema, TTableSchema
  from dlt.common.schema.typing import TTableSchema as PreparedTableSchema
  from pyiceberg.catalog import load_catalog
@@ -94,7 +97,19 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
  strict_casting: bool = False

  # Merge batch size (for upsert operations to avoid memory issues)
- merge_batch_size: int = 100000
+ merge_batch_size: int = 500000
+
+ # Table location layout - controls directory structure for table files
+ # Supports patterns: {namespace}, {dataset_name}, {table_name}
+ # Example: "{namespace}/{table_name}" or "warehouse/{dataset_name}/{table_name}"
+ table_location_layout: Optional[str] = None
+
+ # Register tables found in storage but missing from catalog (backward compatibility)
+ register_new_tables: bool = False
+
+ # Hard delete column name - rows with this column set will be deleted during merge
+ # Set to None to disable hard delete
+ hard_delete_column: Optional[str] = "_dlt_deleted_at"


  class IcebergRestLoadJob(RunnableLoadJob):
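Note: the hunk above is the new 0.2.0 configuration surface. A minimal sketch of overriding these options on an IcebergRestConfiguration instance, assuming the configspec can be instantiated without arguments; the remaining catalog fields (warehouse, namespace, credentials) come from the surrounding code and are omitted here:

    config = IcebergRestConfiguration()
    config.merge_batch_size = 500_000                            # upsert batch size, raised from 100_000 in 0.1.3
    config.table_location_layout = "{namespace}/{table_name}"    # relative layouts are joined onto the warehouse root
    config.register_new_tables = True                            # opt in to registering tables found in storage
    config.hard_delete_column = "_dlt_deleted_at"                 # set to None to disable hard deletes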
@@ -144,11 +159,12 @@ class IcebergRestLoadJob(RunnableLoadJob):
  raise


- class IcebergRestClient(JobClientBase):
+ class IcebergRestClient(JobClientBase, WithSqlClient, SupportsOpenTables):
  """
  Class-based Iceberg REST destination with atomic multi-file commits.

  Accumulates files during load and commits them atomically in complete_load().
+ Implements WithSqlClient and SupportsOpenTables for pipeline.dataset() support.
  """

  def __init__(
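The new base classes are what enable dataset access from dlt. A hedged sketch of how this is typically consumed, assuming the standard pipeline.dataset() and destination_client() accessors apply to this destination; pipeline, dataset, table names and factory arguments are placeholders not taken from this diff:

    import dlt

    pipeline = dlt.pipeline("demo", destination=iceberg_rest_class_based(), dataset_name="analytics")
    dataset = pipeline.dataset()                      # served through the IcebergSqlClient added in the next hunk
    df = dataset["my_table"].df()                     # read a table as a pandas DataFrame
    client = pipeline.destination_client()            # IcebergRestClient instance
    tbl = client.load_open_table("iceberg", "my_table")   # raw pyiceberg Table object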
@@ -162,6 +178,77 @@ class IcebergRestClient(JobClientBase):

  # Catalog instance (created lazily)
  self._catalog = None
+ # SQL client instance (created lazily)
+ self._sql_client = None
+
+ # ---- WithSqlClient interface ----
+
+ @property
+ def sql_client(self) -> SqlClientBase:
+ """Get or create the DuckDB SQL client for dataset access."""
+ if self._sql_client is None:
+ from .sql_client import IcebergSqlClient
+ self._sql_client = IcebergSqlClient(
+ remote_client=self,
+ dataset_name=self.config.namespace,
+ )
+ return self._sql_client
+
+ @property
+ def sql_client_class(self) -> Type[SqlClientBase]:
+ """Return the SQL client class."""
+ from .sql_client import IcebergSqlClient
+ return IcebergSqlClient
+
+ # ---- SupportsOpenTables interface ----
+
+ def get_open_table_catalog(self, table_format: TTableFormat, catalog_name: str = None) -> Any:
+ """Get the PyIceberg catalog for accessing table metadata."""
+ if table_format != "iceberg":
+ raise ValueError(f"Unsupported table format: {table_format}")
+ return self._get_catalog()
+
+ def get_open_table_location(self, table_format: TTableFormat, table_name: str) -> str:
+ """Get the storage location for an Iceberg table."""
+ if table_format != "iceberg":
+ raise ValueError(f"Unsupported table format: {table_format}")
+
+ # Try to get location from catalog
+ try:
+ catalog = self._get_catalog()
+ identifier = f"{self.config.namespace}.{table_name}"
+ iceberg_table = catalog.load_table(identifier)
+ return iceberg_table.location()
+ except NoSuchTableError:
+ # Table doesn't exist yet, compute expected location
+ location = self._get_table_location(table_name)
+ if location:
+ return location
+ # Fallback to default warehouse location
+ warehouse = self.config.warehouse or ""
+ if warehouse and not warehouse.endswith("/"):
+ warehouse += "/"
+ return f"{warehouse}{self.config.namespace}/{table_name}"
+
+ def load_open_table(self, table_format: TTableFormat, table_name: str, **kwargs: Any) -> Any:
+ """Load and return a PyIceberg Table object."""
+ if table_format != "iceberg":
+ raise ValueError(f"Unsupported table format: {table_format}")
+
+ from dlt.common.destination.exceptions import DestinationUndefinedEntity
+
+ catalog = self._get_catalog()
+ identifier = f"{self.config.namespace}.{table_name}"
+
+ try:
+ return catalog.load_table(identifier)
+ except NoSuchTableError as e:
+ raise DestinationUndefinedEntity(table_name) from e
+
+ def is_open_table(self, table_format: TTableFormat, table_name: str) -> bool:
+ """Check if a table uses the specified open table format."""
+ # All tables in this destination are Iceberg tables
+ return table_format == "iceberg"

  def _get_catalog(self):
  """Get or create catalog connection."""
@@ -214,6 +301,123 @@ class IcebergRestClient(JobClientBase):
  self._catalog = load_catalog("dlt_catalog", **catalog_config)
  return self._catalog

+ def _get_table_location(self, table_name: str) -> Optional[str]:
+ """
+ Get the table location based on table_location_layout configuration.
+
+ Args:
+ table_name: Name of the table
+
+ Returns:
+ Table location string or None to use catalog default
+ """
+ if not self.config.table_location_layout:
+ return None
+
+ # Get warehouse base from config
+ warehouse = self.config.warehouse or ""
+
+ # Build location from layout pattern
+ location = self.config.table_location_layout.format(
+ namespace=self.config.namespace,
+ dataset_name=self.config.namespace, # In dlt, dataset_name maps to namespace
+ table_name=table_name,
+ )
+
+ # If layout is relative (doesn't start with protocol), prepend warehouse
+ if not location.startswith(("s3://", "gs://", "az://", "file://", "hdfs://")):
+ # Ensure warehouse ends with / for proper joining
+ if warehouse and not warehouse.endswith("/"):
+ warehouse += "/"
+ location = f"{warehouse}{location}"
+
+ return location
+
+ def _register_tables_from_storage(self, catalog, namespace: str) -> None:
+ """
+ Register tables found in storage but missing from catalog.
+
+ Scans the warehouse directory for Iceberg metadata files and registers
+ any tables not already in the catalog. This provides backward compatibility
+ when tables exist in storage but haven't been registered.
+ """
+ if not self.config.register_new_tables:
+ return
+
+ if not self.config.warehouse:
+ logger.warning("Cannot register tables: no warehouse configured")
+ return
+
+ import os
+ from urllib.parse import urlparse
+
+ warehouse = self.config.warehouse
+
+ # Only support local filesystem for now
+ parsed = urlparse(warehouse)
+ if parsed.scheme and parsed.scheme != "file":
+ logger.info(
+ f"register_new_tables only supported for local filesystem, "
+ f"skipping for {parsed.scheme}"
+ )
+ return
+
+ # Get local path
+ local_path = parsed.path if parsed.scheme == "file" else warehouse
+ namespace_path = os.path.join(local_path, namespace)
+
+ if not os.path.exists(namespace_path):
+ logger.info(f"Namespace path {namespace_path} doesn't exist, nothing to register")
+ return
+
+ # Get existing tables in catalog
+ try:
+ existing_tables = {t[1] for t in catalog.list_tables(namespace)}
+ except NoSuchNamespaceError:
+ existing_tables = set()
+
+ # Scan for table directories with metadata
+ registered_count = 0
+ for item in os.listdir(namespace_path):
+ table_path = os.path.join(namespace_path, item)
+ if not os.path.isdir(table_path):
+ continue
+
+ # Check if it's an Iceberg table (has metadata directory)
+ metadata_path = os.path.join(table_path, "metadata")
+ if not os.path.exists(metadata_path):
+ continue
+
+ table_name = item
+ if table_name in existing_tables:
+ continue
+
+ # Find latest metadata file
+ metadata_files = [
+ f for f in os.listdir(metadata_path)
+ if f.endswith(".metadata.json")
+ ]
+ if not metadata_files:
+ continue
+
+ # Sort to get latest (by version number in filename)
+ metadata_files.sort(reverse=True)
+ latest_metadata = os.path.join(metadata_path, metadata_files[0])
+
+ try:
+ identifier = f"{namespace}.{table_name}"
+ catalog.register_table(
+ identifier=identifier,
+ metadata_location=f"file://{latest_metadata}",
+ )
+ logger.info(f"Registered table {identifier} from storage")
+ registered_count += 1
+ except Exception as e:
+ logger.warning(f"Failed to register table {table_name}: {e}")
+
+ if registered_count > 0:
+ logger.info(f"Registered {registered_count} tables from storage")
+
  def initialize_storage(self, truncate_tables: Optional[Iterable[str]] = None) -> None:
  """Create Iceberg namespace if it doesn't exist."""
  catalog = self._get_catalog()
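For clarity, how _get_table_location resolves a relative layout; the values below are made up, and only str.format and the scheme check from the code above are involved:

    layout = "{namespace}/{table_name}"
    warehouse = "s3://my-bucket/warehouse"        # hypothetical warehouse root from the config
    location = layout.format(namespace="analytics", dataset_name="analytics", table_name="events")
    # "analytics/events" carries no URI scheme, so the warehouse root is prepended:
    # -> "s3://my-bucket/warehouse/analytics/events"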
@@ -230,6 +434,9 @@
  logger.error(f"Failed to initialize storage: {e}")
  raise

+ # Register tables from storage if enabled
+ self._register_tables_from_storage(catalog, namespace)
+
  # Handle truncation if requested
  if truncate_tables:
  for table_name in truncate_tables:
@@ -351,6 +558,168 @@

  logger.info(f"Load {load_id} completed successfully")

+ def _get_merge_strategy(self, table_schema: TTableSchema) -> str:
+ """Extract merge strategy from table schema.
+
+ write_disposition can be:
+ - "merge" (string) -> use upsert (backward compatible)
+ - {"disposition": "merge", "strategy": "delete-insert"} -> explicit strategy
+
+ Returns:
+ Merge strategy: "upsert" or "delete-insert"
+ """
+ write_disposition = table_schema.get("write_disposition", "append")
+
+ if isinstance(write_disposition, dict):
+ return write_disposition.get("strategy", "delete-insert")
+
+ # String "merge" - use upsert as our default (backward compatible)
+ return "upsert"
+
+ def _execute_delete_insert(
+ self,
+ iceberg_table,
+ combined_table: pa.Table,
+ primary_keys: List[str],
+ identifier: str,
+ hard_delete_filter=None,
+ ) -> Tuple[int, int, int]:
+ """Execute delete-insert merge strategy with optional hard deletes.
+
+ Deletes rows matching primary keys in incoming data, then appends new data.
+ Uses PyIceberg transaction for atomic hard-delete + delete + append.
+
+ Args:
+ iceberg_table: PyIceberg table object
+ combined_table: Arrow table with data to merge
+ primary_keys: List of primary key column names
+ identifier: Table identifier for logging
+ hard_delete_filter: Optional filter for hard deletes (rows to permanently remove)
+
+ Returns:
+ Tuple of (rows_deleted_estimate, rows_inserted, hard_deleted)
+ """
+ from pyiceberg.expressions import In, And, EqualTo, Or
+
+ # Build delete filter from primary key values in incoming data
+ if len(primary_keys) == 1:
+ pk_col = primary_keys[0]
+ pk_values = combined_table.column(pk_col).to_pylist()
+ # Deduplicate values
+ unique_pk_values = list(set(pk_values))
+ delete_filter = In(pk_col, unique_pk_values)
+ deleted_estimate = len(unique_pk_values)
+ else:
+ # Composite primary key - build OR of AND conditions
+ pk_tuples = set()
+ for i in range(len(combined_table)):
+ pk_tuple = tuple(
+ combined_table.column(pk).to_pylist()[i] for pk in primary_keys
+ )
+ pk_tuples.add(pk_tuple)
+
+ conditions = []
+ for pk_tuple in pk_tuples:
+ and_conditions = [
+ EqualTo(pk, val) for pk, val in zip(primary_keys, pk_tuple)
+ ]
+ if len(and_conditions) == 1:
+ conditions.append(and_conditions[0])
+ else:
+ conditions.append(And(*and_conditions))
+
+ if len(conditions) == 1:
+ delete_filter = conditions[0]
+ else:
+ delete_filter = Or(*conditions)
+ deleted_estimate = len(pk_tuples)
+
+ logger.info(
+ f"Delete-insert for {identifier}: deleting up to {deleted_estimate} "
+ f"matching rows, inserting {len(combined_table)} rows"
+ )
+
+ # Execute atomic hard-delete + delete + append using single transaction
+ with iceberg_table.transaction() as txn:
+ # Hard deletes first (permanent removal)
+ if hard_delete_filter is not None:
+ txn.delete(hard_delete_filter)
+ # Then delete-insert for merge
+ txn.delete(delete_filter)
+ txn.append(combined_table)
+
+ return (deleted_estimate, len(combined_table), 1 if hard_delete_filter else 0)
+
+ def _prepare_hard_deletes(
+ self,
+ combined_table: pa.Table,
+ primary_keys: List[str],
+ ) -> Tuple[pa.Table, Optional[Any], int]:
+ """
+ Prepare hard deletes from incoming data (does not execute).
+
+ Rows with the hard_delete_column set (non-null) will be deleted.
+ Returns the filter expression to use in a transaction.
+
+ Args:
+ combined_table: Arrow table with data including possible delete markers
+ primary_keys: List of primary key column names
+
+ Returns:
+ Tuple of (remaining_rows, delete_filter_or_none, num_to_delete)
+ """
+ hard_delete_col = self.config.hard_delete_column
+
+ # Check if hard delete column exists in data
+ if not hard_delete_col or hard_delete_col not in combined_table.column_names:
+ return combined_table, None, 0
+
+ from pyiceberg.expressions import In, And, EqualTo, Or
+ import pyarrow.compute as pc
+
+ # Get the delete marker column
+ delete_col = combined_table.column(hard_delete_col)
+
+ # Find rows marked for deletion (non-null values)
+ delete_mask = pc.is_valid(delete_col)
+ rows_to_delete = combined_table.filter(delete_mask)
+ rows_to_keep = combined_table.filter(pc.invert(delete_mask))
+
+ if len(rows_to_delete) == 0:
+ return rows_to_keep, None, 0
+
+ # Build delete filter from primary keys of rows to delete
+ if len(primary_keys) == 1:
+ pk_col = primary_keys[0]
+ pk_values = rows_to_delete.column(pk_col).to_pylist()
+ unique_pk_values = list(set(pk_values))
+ delete_filter = In(pk_col, unique_pk_values)
+ else:
+ # Composite primary key
+ pk_tuples = set()
+ for i in range(len(rows_to_delete)):
+ pk_tuple = tuple(
+ rows_to_delete.column(pk).to_pylist()[i] for pk in primary_keys
+ )
+ pk_tuples.add(pk_tuple)
+
+ conditions = []
+ for pk_tuple in pk_tuples:
+ and_conditions = [
+ EqualTo(pk, val) for pk, val in zip(primary_keys, pk_tuple)
+ ]
+ if len(and_conditions) == 1:
+ conditions.append(and_conditions[0])
+ else:
+ conditions.append(And(*and_conditions))
+
+ if len(conditions) == 1:
+ delete_filter = conditions[0]
+ else:
+ delete_filter = Or(*conditions)
+
+ return rows_to_keep, delete_filter, len(rows_to_delete)
+
  def _commit_table_files(
  self,
  catalog,
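A standalone sketch of the delete-filter construction used by _execute_delete_insert and _prepare_hard_deletes, shown here for a composite primary key; the key values are invented, the expression classes are standard pyiceberg.expressions:

    from pyiceberg.expressions import And, EqualTo, In, Or

    pk_tuples = [("us", 1), ("us", 2), ("eu", 1)]          # unique (region, id) pairs from the incoming batch
    conditions = [And(EqualTo("region", r), EqualTo("id", i)) for r, i in pk_tuples]
    delete_filter = conditions[0] if len(conditions) == 1 else Or(*conditions)

    single_key_filter = In("id", [1, 2, 3])                # the single-key path uses IN instead
    # the resulting filter is passed to txn.delete(...) inside iceberg_table.transaction()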
@@ -401,11 +770,19 @@

  # Create table
  logger.info(f"Creating table {identifier}")
- iceberg_table = catalog.create_table(
- identifier=identifier,
- schema=iceberg_schema,
- partition_spec=partition_spec,
- )
+
+ # Get custom location if configured
+ table_location = self._get_table_location(table_name)
+ create_kwargs = {
+ "identifier": identifier,
+ "schema": iceberg_schema,
+ "partition_spec": partition_spec,
+ }
+ if table_location:
+ create_kwargs["location"] = table_location
+ logger.info(f"Using custom location: {table_location}")
+
+ iceberg_table = catalog.create_table(**create_kwargs)
  logger.info(f"Created table {identifier} at {iceberg_table.location()}")
  else:
  # Table exists - check if schema evolution is needed
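For reference, the call this resolves to when a layout is configured; identifier and location are example values, and location is a standard argument of PyIceberg's Catalog.create_table:

    iceberg_table = catalog.create_table(
        identifier="analytics.events",
        schema=iceberg_schema,
        partition_spec=partition_spec,
        location="s3://my-bucket/warehouse/analytics/events",  # only set when table_location_layout is configured
    )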
@@ -449,13 +826,18 @@
  )

  # ATOMIC COMMIT: Write all data in one transaction
- if write_disposition == "replace":
+ # Handle both string and dict write_disposition
+ disposition_type = write_disposition
+ if isinstance(write_disposition, dict):
+ disposition_type = write_disposition.get("disposition", "append")
+
+ if disposition_type == "replace":
  logger.info(f"Overwriting table {identifier}")
  iceberg_table.overwrite(combined_table)
- elif write_disposition == "append":
+ elif disposition_type == "append":
  logger.info(f"Appending to table {identifier}")
  iceberg_table.append(combined_table)
- elif write_disposition == "merge":
+ elif disposition_type == "merge":
  # Get primary keys
  primary_keys = table_schema.get("primary_key") or table_schema.get("x-merge-keys")

@@ -473,38 +855,73 @@
  )
  iceberg_table.append(combined_table)
  else:
- logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-
- # Batch upserts to avoid memory issues on large datasets
- batch_size = self.config.merge_batch_size
- total_updated = 0
- total_inserted = 0
-
- for batch_start in range(0, len(combined_table), batch_size):
- batch_end = min(batch_start + batch_size, len(combined_table))
- batch = combined_table.slice(batch_start, batch_end - batch_start)
+ # Prepare hard deletes (rows marked for deletion)
+ remaining_rows, hard_delete_filter, num_hard_deletes = self._prepare_hard_deletes(
+ combined_table, primary_keys
+ )
+ if num_hard_deletes > 0:
+ logger.info(f"Prepared {num_hard_deletes} rows for hard delete")
+
+ # If all rows were hard deletes, just execute the delete
+ if len(remaining_rows) == 0:
+ if hard_delete_filter is not None:
+ iceberg_table.delete(hard_delete_filter)
+ logger.info(f"Executed {num_hard_deletes} hard deletes (no merge needed)")
+ return
+
+ # Get merge strategy
+ merge_strategy = self._get_merge_strategy(table_schema)
+ logger.info(
+ f"Merging into table {identifier} on keys {primary_keys} "
+ f"using strategy: {merge_strategy}"
+ )

+ if merge_strategy == "delete-insert":
+ # Atomic hard-delete + delete + insert in single transaction
+ deleted, inserted, _ = self._execute_delete_insert(
+ iceberg_table, remaining_rows, primary_keys, identifier,
+ hard_delete_filter=hard_delete_filter
+ )
  logger.info(
- f"Upserting batch {batch_start//batch_size + 1}: "
- f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+ f"Delete-insert completed: ~{deleted} deleted, "
+ f"{inserted} inserted"
  )
+ else:
+ # Default: upsert strategy
+ # Execute hard deletes first (separate transaction since upsert is atomic)
+ if hard_delete_filter is not None:
+ iceberg_table.delete(hard_delete_filter)
+ logger.info(f"Executed {num_hard_deletes} hard deletes before upsert")
+
+ batch_size = self.config.merge_batch_size
+ total_updated = 0
+ total_inserted = 0
+
+ for batch_start in range(0, len(remaining_rows), batch_size):
+ batch_end = min(batch_start + batch_size, len(remaining_rows))
+ batch = remaining_rows.slice(batch_start, batch_end - batch_start)
+
+ logger.info(
+ f"Upserting batch {batch_start//batch_size + 1}: "
+ f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+ )
+
+ upsert_result = iceberg_table.upsert(
+ df=batch,
+ join_cols=primary_keys,
+ when_matched_update_all=True,
+ when_not_matched_insert_all=True,
+ )
+
+ total_updated += upsert_result.rows_updated
+ total_inserted += upsert_result.rows_inserted

- upsert_result = iceberg_table.upsert(
- df=batch,
- join_cols=primary_keys,
- when_matched_update_all=True,
- when_not_matched_insert_all=True,
+ logger.info(
+ f"Upsert completed: {total_updated} updated, "
+ f"{total_inserted} inserted across {(len(remaining_rows) + batch_size - 1) // batch_size} batches"
  )
-
- total_updated += upsert_result.rows_updated
- total_inserted += upsert_result.rows_inserted
-
- logger.info(
- f"Upsert completed: {total_updated} updated, "
- f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
- )
  else:
- raise ValueError(f"Unknown write disposition: {write_disposition}")
+ raise ValueError(f"Unknown write disposition: {disposition_type}")

  logger.info(
  f"Successfully committed {len(file_data)} files "
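A small sketch of the hard-delete split that _prepare_hard_deletes performs before this branch runs; the data is made up and the column name is the default hard_delete_column:

    import pyarrow as pa
    import pyarrow.compute as pc

    batch = pa.table({
        "id": [1, 2, 3],
        "_dlt_deleted_at": [None, "2024-01-01T00:00:00Z", None],  # non-null marks a hard delete
    })
    delete_mask = pc.is_valid(batch.column("_dlt_deleted_at"))
    rows_to_delete = batch.filter(delete_mask)            # id == 2 becomes a delete filter on the primary key
    rows_to_keep = batch.filter(pc.invert(delete_mask))   # ids 1 and 3 continue into the merge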
@@ -600,7 +1017,7 @@ class iceberg_rest_class_based(Destination[IcebergRestConfiguration, "IcebergRes
  caps.supported_staging_file_formats = []

  # Merge strategies (we handle upsert ourselves)
- caps.supported_merge_strategies = ["upsert"]
+ caps.supported_merge_strategies = ["delete-insert", "upsert"]

  # Replace strategies
  caps.supported_replace_strategies = ["truncate-and-insert", "insert-from-staging"]
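With "delete-insert" now advertised as a supported merge strategy, a strategy can be requested per resource through dlt's dict form of write_disposition; a hedged usage sketch with example resource and key names:

    import dlt

    @dlt.resource(
        primary_key="id",
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    )
    def events():
        yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]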
@@ -35,6 +35,7 @@ from pyiceberg.transforms import (
  )
  from pyiceberg.types import (
  TimestampType,
+ TimestamptzType,
  DateType,
  StringType,
  IntegerType,
@@ -92,14 +93,14 @@ def validate_transform_for_type(
  Raises:
  ValueError: If transform is invalid for the field type
  """
- # Temporal transforms only for timestamp/date
+ # Temporal transforms only for timestamp/timestamptz/date
  temporal_transforms = {"year", "month", "day", "hour"}
  if transform_type in temporal_transforms:
- if not isinstance(field_type, (TimestampType, DateType)):
+ if not isinstance(field_type, (TimestampType, TimestamptzType, DateType)):
  raise ValueError(
  f"Temporal transform '{transform_type}' cannot be applied to "
  f"column '{col_name}' with type {field_type}. "
- f"Use timestamp or date types for temporal transforms."
+ f"Use timestamp, timestamptz, or date types for temporal transforms."
  )

  # Bucket transform validation
@@ -181,16 +182,21 @@ def build_partition_spec(
  continue

  # Choose transform based on data type
+ col_hints = dlt_columns.get(col_name, {})
  transform = choose_partition_transform(
- iceberg_field.field_type, col_name, dlt_columns.get(col_name, {})
+ iceberg_field.field_type, col_name, col_hints
  )

+ # Get custom partition field name or generate default
+ custom_name = col_hints.get("x-partition-name") or col_hints.get("partition_name")
+ partition_name = custom_name or f"{col_name}_{get_transform_name(transform)}"
+
  # Create partition field
  partition_field = PartitionField(
  source_id=iceberg_field.field_id,
  field_id=1000 + len(partition_fields), # Start partition IDs at 1000
  transform=transform,
- name=f"{col_name}_{get_transform_name(transform)}",
+ name=partition_name,
  )
  partition_fields.append(partition_field)

@@ -266,7 +272,7 @@ def choose_partition_transform(field_type, col_name: str, col_hints: dict):
  )

  # No hint specified - use defaults based on type
- if isinstance(field_type, (TimestampType, DateType)):
+ if isinstance(field_type, (TimestampType, TimestamptzType, DateType)):
  # Default to month for timestamps/dates
  return MonthTransform()
  elif isinstance(field_type, (StringType, IntegerType, LongType)):
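A hedged sketch of the new partition-name hint consumed by build_partition_spec above: it reads "x-partition-name" (or "partition_name") from the dlt column hints dict; how a column is flagged for partitioning in the first place is defined elsewhere in the package and not shown in this diff:

    dlt_columns = {
        "created_at": {
            "data_type": "timestamp",
            "x-partition-name": "created_month",   # overrides the generated default "created_at_month"
        }
    }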
@@ -18,6 +18,7 @@ from pyiceberg.types import (
  StringType,
  BinaryType,
  TimestampType,
+ TimestamptzType,
  DateType,
  TimeType,
  ListType,
@@ -125,6 +126,8 @@ def convert_arrow_to_iceberg_type(arrow_type: pa.DataType):

  # Temporal types
  elif pa.types.is_timestamp(arrow_type):
+ if arrow_type.tz is not None:
+ return TimestamptzType()
  return TimestampType()
  elif pa.types.is_date(arrow_type):
  return DateType()
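A quick illustration of the new timezone-aware mapping (pure PyArrow/PyIceberg types; the dlt-level "timestamptz" mapping in the next hunk changes accordingly):

    import pyarrow as pa
    from pyiceberg.types import TimestampType, TimestamptzType

    assert pa.timestamp("us", tz="UTC").tz == "UTC"   # convert_arrow_to_iceberg_type now returns TimestamptzType()
    assert pa.timestamp("us").tz is None              # naive timestamps still map to TimestampType()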
@@ -197,7 +200,7 @@ def convert_dlt_type_to_iceberg_type(dlt_type: str):
  "bool": BooleanType(),
  "boolean": BooleanType(),
  "timestamp": TimestampType(),
- "timestamptz": TimestampType(),
+ "timestamptz": TimestamptzType(),
  "date": DateType(),
  "time": TimeType(),
  "binary": BinaryType(),