dlt-iceberg 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between the publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
dlt_iceberg/__init__.py CHANGED
@@ -13,6 +13,9 @@ from .destination import iceberg_rest as iceberg_rest_function_based
 # Export the class-based version as the primary destination
 iceberg_rest = iceberg_rest_class_based
 
+# Adapter for Iceberg-specific hints
+from .adapter import iceberg_adapter, iceberg_partition, PartitionTransform
+
 # Errors
 from .schema_casting import CastingError
 from .schema_evolution import SchemaEvolutionError
@@ -23,6 +26,9 @@ __all__ = [
     "iceberg_rest_function_based",
     "IcebergRestClient",
     "IcebergRestConfiguration",
+    "iceberg_adapter",
+    "iceberg_partition",
+    "PartitionTransform",
     "CastingError",
     "SchemaEvolutionError",
 ]
dlt_iceberg/adapter.py ADDED
@@ -0,0 +1,276 @@
+"""
+Iceberg adapter for dlt resources.
+
+Provides a way to add Iceberg-specific hints to dlt resources, following
+the adapter pattern used by BigQuery, Databricks, and other destinations.
+
+Usage:
+    from dlt_iceberg import iceberg_adapter, iceberg_partition
+
+    @dlt.resource(name="events")
+    def my_events():
+        yield {"event_date": "2024-01-01", "user_id": 123}
+
+    # Partition by month on event_date and bucket user_id
+    adapted = iceberg_adapter(
+        my_events,
+        partition=[
+            iceberg_partition.month("event_date"),
+            iceberg_partition.bucket(10, "user_id"),
+        ]
+    )
+"""
+
+import logging
+from typing import Any, List, Optional, Union, cast
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class PartitionTransform:
+    """Represents an Iceberg partition transform for a column.
+
+    Attributes:
+        column: Column name to partition on
+        transform: Transform type (identity, year, month, day, hour, bucket, truncate)
+        param: Optional parameter for bucket[N] or truncate[N]
+        name: Optional custom name for the partition field
+    """
+
+    column: str
+    transform: str
+    param: Optional[int] = None
+    name: Optional[str] = None
+
+    def to_hint_value(self) -> str:
+        """Convert to partition_transform hint value."""
+        if self.param is not None:
+            return f"{self.transform}[{self.param}]"
+        return self.transform
+
+
+class iceberg_partition:
+    """Factory for Iceberg partition transforms.
+
+    Provides static methods to create partition specifications:
+
+    - identity(column, name=None): No transformation, use value as-is
+    - year(column, name=None): Extract year from timestamp/date
+    - month(column, name=None): Extract year-month from timestamp/date
+    - day(column, name=None): Extract date from timestamp/date
+    - hour(column, name=None): Extract date-hour from timestamp
+    - bucket(num_buckets, column, name=None): Hash partition into n buckets
+    - truncate(width, column, name=None): Truncate string/number to width
+
+    Examples:
+        iceberg_partition.month("created_at")
+        iceberg_partition.month("created_at", "month_created")
+        iceberg_partition.bucket(10, "user_id")
+        iceberg_partition.bucket(10, "user_id", "user_bucket")
+        iceberg_partition.truncate(4, "email")
+    """
+
+    @staticmethod
+    def identity(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Identity transform - use column value as-is for partitioning.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="identity", name=name)
+
+    @staticmethod
+    def year(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Year transform - partition by year extracted from timestamp/date.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="year", name=name)
+
+    @staticmethod
+    def month(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Month transform - partition by year-month extracted from timestamp/date.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="month", name=name)
+
+    @staticmethod
+    def day(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Day transform - partition by date extracted from timestamp/date.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="day", name=name)
+
+    @staticmethod
+    def hour(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Hour transform - partition by date-hour extracted from timestamp.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="hour", name=name)
+
+    @staticmethod
+    def bucket(num_buckets: int, column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Bucket transform - hash partition into n buckets.
+
+        Args:
+            num_buckets: Number of buckets (must be positive)
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+
+        Raises:
+            ValueError: If num_buckets is not positive
+        """
+        if num_buckets <= 0:
+            raise ValueError(f"num_buckets must be positive, got {num_buckets}")
+        return PartitionTransform(column=column, transform="bucket", param=num_buckets, name=name)
+
+    @staticmethod
+    def truncate(width: int, column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Truncate transform - truncate string/number to width.
+
+        Args:
+            width: Truncation width (must be positive)
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+
+        Raises:
+            ValueError: If width is not positive
+        """
+        if width <= 0:
+            raise ValueError(f"width must be positive, got {width}")
+        return PartitionTransform(column=column, transform="truncate", param=width, name=name)
+
+
+def _get_resource_for_adapter(data: Any):
+    """Get or create a DltResource from data.
+
+    Follows the pattern from dlt.destinations.utils.get_resource_for_adapter.
+    """
+    import dlt
+    from dlt.extract.resource import DltResource
+    from dlt.extract.source import DltSource
+
+    if isinstance(data, DltResource):
+        return data
+
+    if isinstance(data, DltSource):
+        if len(data.selected_resources.keys()) == 1:
+            return list(data.selected_resources.values())[0]
+        else:
+            raise ValueError(
+                "You are trying to use iceberg_adapter on a DltSource with "
+                "multiple resources. You can only use adapters on: pure data, "
+                "a DltResource, or a DltSource with a single DltResource."
+            )
+
+    resource_name = None
+    if not hasattr(data, "__name__"):
+        logger.info("Setting default resource name to 'content' for adapted resource.")
+        resource_name = "content"
+
+    return cast("DltResource", dlt.resource(data, name=resource_name))
+
+
+def iceberg_adapter(
+    data: Any,
+    partition: Optional[Union[str, PartitionTransform, List[Union[str, PartitionTransform]]]] = None,
+):
+    """
+    Apply Iceberg-specific hints to a dlt resource.
+
+    This adapter prepares data for loading into Iceberg tables by setting
+    partition specifications using Iceberg's native transforms.
+
+    Args:
+        data: A dlt resource, source (with single resource), or raw data
+        partition: Partition specification(s). Can be:
+            - A column name string (uses identity transform)
+            - A single PartitionTransform
+            - A list of column names and/or PartitionTransform objects
+            Use iceberg_partition helpers to create transforms.
+
+    Returns:
+        DltResource with Iceberg-specific hints applied
+
+    Examples:
+        # Simple identity partition by column name
+        iceberg_adapter(my_resource, partition="region")
+        iceberg_adapter(my_resource, partition=["region", "category"])
+
+        # Single partition column with month transform
+        iceberg_adapter(my_resource, partition=iceberg_partition.month("created_at"))
+
+        # Multiple partition columns with mixed specs
+        iceberg_adapter(
+            my_resource,
+            partition=[
+                iceberg_partition.day("event_date"),
+                "region",  # identity partition
+                iceberg_partition.bucket(10, "user_id"),
+            ]
+        )
+
+        # Works with raw data too
+        data = [{"id": 1, "ts": "2024-01-01"}]
+        iceberg_adapter(data, partition=iceberg_partition.month("ts"))
+    """
+    resource = _get_resource_for_adapter(data)
+
+    if partition is None:
+        return resource
+
+    # Normalize to list
+    if isinstance(partition, (str, PartitionTransform)):
+        partition_list = [partition]
+    else:
+        partition_list = partition
+
+    if not partition_list:
+        return resource
+
+    # Convert strings to identity PartitionTransforms
+    partitions: List[PartitionTransform] = []
+    for p in partition_list:
+        if isinstance(p, str):
+            partitions.append(iceberg_partition.identity(p))
+        else:
+            partitions.append(p)
+
+    # Build column hints for partitioning
+    column_hints = {}
+
+    for p in partitions:
+        if p.column not in column_hints:
+            column_hints[p.column] = {"name": p.column}
+
+        # Set partition flag using x-partition (custom hint prefix)
+        column_hints[p.column]["x-partition"] = True
+
+        # Set transform (identity is handled as default in partition_builder)
+        if p.transform != "identity":
+            column_hints[p.column]["x-partition-transform"] = p.to_hint_value()
+
+        # Set custom partition field name if provided
+        if p.name:
+            column_hints[p.column]["x-partition-name"] = p.name
+
+    # Apply hints to resource
+    resource.apply_hints(columns=column_hints)
+
+    logger.info(f"Applied Iceberg partition hints: {[p.column for p in partitions]}")
+
+    return resource
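
For orientation, the adapter above is meant to compose with an ordinary dlt resource, as in this minimal sketch (resource and column names are illustrative; the Iceberg catalog and destination configuration are assumed to be set up separately):

    import dlt
    from dlt_iceberg import iceberg_adapter, iceberg_partition

    @dlt.resource(name="events")
    def events():
        yield {"event_date": "2024-01-01", "user_id": 123, "region": "eu"}

    # Identity-partition on region, month-partition on event_date, bucket user_id into 10 buckets
    events = iceberg_adapter(
        events,
        partition=[
            "region",
            iceberg_partition.month("event_date"),
            iceberg_partition.bucket(10, "user_id"),
        ],
    )

The adapter only records the x-partition, x-partition-transform and x-partition-name column hints; the destination's partition builder is expected to translate them into the actual Iceberg partition spec.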
dlt_iceberg/destination.py CHANGED
@@ -37,6 +37,87 @@ from pyiceberg.io.pyarrow import schema_to_pyarrow
 logger = logging.getLogger(__name__)
 
 
+def _get_merge_strategy(table_schema: TTableSchema) -> str:
+    """Extract merge strategy from table schema.
+
+    write_disposition can be:
+    - "merge" (string) -> use upsert (backward compatible)
+    - {"disposition": "merge", "strategy": "delete-insert"} -> explicit strategy
+
+    Returns:
+        Merge strategy: "upsert" or "delete-insert"
+    """
+    write_disposition = table_schema.get("write_disposition", "append")
+
+    if isinstance(write_disposition, dict):
+        return write_disposition.get("strategy", "delete-insert")
+
+    # String "merge" - use upsert as our default (backward compatible)
+    return "upsert"
+
+
+def _execute_delete_insert(iceberg_table, arrow_table, primary_keys: list, identifier: str):
+    """Execute delete-insert merge strategy.
+
+    Deletes rows matching primary keys in incoming data, then appends new data.
+    Uses PyIceberg transaction for atomic delete + append.
+
+    Args:
+        iceberg_table: PyIceberg table object
+        arrow_table: Arrow table with data to merge
+        primary_keys: List of primary key column names
+        identifier: Table identifier for logging
+
+    Returns:
+        Tuple of (rows_deleted_estimate, rows_inserted)
+    """
+    from pyiceberg.expressions import In, And, EqualTo, Or
+
+    # Build delete filter from primary key values in incoming data
+    if len(primary_keys) == 1:
+        pk_col = primary_keys[0]
+        pk_values = arrow_table.column(pk_col).to_pylist()
+        unique_pk_values = list(set(pk_values))
+        delete_filter = In(pk_col, unique_pk_values)
+        deleted_estimate = len(unique_pk_values)
+    else:
+        # Composite primary key - build OR of AND conditions
+        pk_tuples = set()
+        for i in range(len(arrow_table)):
+            pk_tuple = tuple(
+                arrow_table.column(pk).to_pylist()[i] for pk in primary_keys
+            )
+            pk_tuples.add(pk_tuple)
+
+        conditions = []
+        for pk_tuple in pk_tuples:
+            and_conditions = [
+                EqualTo(pk, val) for pk, val in zip(primary_keys, pk_tuple)
+            ]
+            if len(and_conditions) == 1:
+                conditions.append(and_conditions[0])
+            else:
+                conditions.append(And(*and_conditions))
+
+        if len(conditions) == 1:
+            delete_filter = conditions[0]
+        else:
+            delete_filter = Or(*conditions)
+        deleted_estimate = len(pk_tuples)
+
+    logger.info(
+        f"Delete-insert for {identifier}: deleting up to {deleted_estimate} "
+        f"matching rows, inserting {len(arrow_table)} rows"
+    )
+
+    # Execute atomic delete + append using transaction
+    with iceberg_table.transaction() as txn:
+        txn.delete(delete_filter)
+        txn.append(arrow_table)
+
+    return (deleted_estimate, len(arrow_table))
+
+
 def _iceberg_rest_handler(
     items: str,  # File path when batch_size=0
     table: TTableSchema,
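
On the resource side, the strategy read by _get_merge_strategy above is the standard dlt write_disposition hint. A hedged sketch of a resource opting into the new delete-insert strategy (table and key names are illustrative):

    import dlt

    @dlt.resource(
        name="orders",
        primary_key="order_id",
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    )
    def orders():
        yield {"order_id": 1, "status": "shipped"}

A plain write_disposition="merge" keeps the previous upsert behaviour, as the helper's docstring notes.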
@@ -270,13 +351,18 @@ def _iceberg_rest_handler(
         raise
 
     # Write data based on disposition
-    if write_disposition == "replace":
+    # Handle both string and dict write_disposition
+    disposition_type = write_disposition
+    if isinstance(write_disposition, dict):
+        disposition_type = write_disposition.get("disposition", "append")
+
+    if disposition_type == "replace":
         logger.info(f"Overwriting table {identifier}")
         iceberg_table.overwrite(arrow_table)
-    elif write_disposition == "append":
+    elif disposition_type == "append":
         logger.info(f"Appending to table {identifier}")
         iceberg_table.append(arrow_table)
-    elif write_disposition == "merge":
+    elif disposition_type == "merge":
         # For merge, we need primary keys
         # Try multiple ways to get primary keys from dlt table schema
         primary_keys = table.get("primary_key") or table.get("x-merge-keys")
@@ -296,21 +382,36 @@ def _iceberg_rest_handler(
             )
             iceberg_table.append(arrow_table)
         else:
-            logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-            # Use PyIceberg's upsert API to update existing rows and insert new ones
-            # PyIceberg will automatically match rows based on join_cols (primary keys)
-            upsert_result = iceberg_table.upsert(
-                df=arrow_table,
-                join_cols=primary_keys,
-                when_matched_update_all=True,
-                when_not_matched_insert_all=True,
-            )
+            # Get merge strategy
+            merge_strategy = _get_merge_strategy(table)
             logger.info(
-                f"Upsert completed: {upsert_result.rows_updated} updated, "
-                f"{upsert_result.rows_inserted} inserted"
+                f"Merging into table {identifier} on keys {primary_keys} "
+                f"using strategy: {merge_strategy}"
             )
+
+            if merge_strategy == "delete-insert":
+                # Atomic delete + insert
+                deleted, inserted = _execute_delete_insert(
+                    iceberg_table, arrow_table, primary_keys, identifier
+                )
+                logger.info(
+                    f"Delete-insert completed: ~{deleted} deleted, "
+                    f"{inserted} inserted"
+                )
+            else:
+                # Default: upsert strategy
+                upsert_result = iceberg_table.upsert(
+                    df=arrow_table,
+                    join_cols=primary_keys,
+                    when_matched_update_all=True,
+                    when_not_matched_insert_all=True,
+                )
+                logger.info(
+                    f"Upsert completed: {upsert_result.rows_updated} updated, "
+                    f"{upsert_result.rows_inserted} inserted"
+                )
     else:
-        raise ValueError(f"Unknown write disposition: {write_disposition}")
+        raise ValueError(f"Unknown write disposition: {disposition_type}")
 
     logger.info(f"Successfully wrote {len(arrow_table)} rows to {identifier}")
     return  # Success
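
Note the asymmetry in the dispatch above: the legacy string form "merge" stays on the upsert path, while a dict without an explicit strategy falls back to delete-insert. A small self-contained sketch of the mapping implemented by _get_merge_strategy (plain values stand in for the dlt table schema):

    # Mirrors the strategy selection in _get_merge_strategy above.
    def pick_strategy(write_disposition):
        if isinstance(write_disposition, dict):
            return write_disposition.get("strategy", "delete-insert")
        return "upsert"

    assert pick_strategy("merge") == "upsert"  # legacy string form
    assert pick_strategy({"disposition": "merge"}) == "delete-insert"  # dict default
    assert pick_strategy({"disposition": "merge", "strategy": "upsert"}) == "upsert"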
@@ -391,7 +492,7 @@ def iceberg_rest(**kwargs):
     def _raw_capabilities_with_merge():
         """Add merge support to the destination capabilities."""
         caps = original_raw_capabilities()
-        caps.supported_merge_strategies = ["upsert"]
+        caps.supported_merge_strategies = ["delete-insert", "upsert"]
         return caps
 
     # Bind the new method to the instance
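
Putting the pieces together, a rough end-to-end sketch (pipeline, resource, and column names are illustrative, and catalog credentials for the iceberg_rest destination are assumed to come from dlt config/secrets rather than being shown here):

    import dlt
    from dlt_iceberg import iceberg_adapter, iceberg_partition, iceberg_rest

    @dlt.resource(
        name="orders",
        primary_key="order_id",
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    )
    def orders():
        yield {"order_id": 1, "order_date": "2024-01-01", "status": "shipped"}

    pipeline = dlt.pipeline(pipeline_name="orders_to_iceberg", destination=iceberg_rest)
    load_info = pipeline.run(
        iceberg_adapter(orders, partition=iceberg_partition.month("order_date"))
    )
    print(load_info)

Whether the destination is passed as the class itself or as a configured instance depends on the package's configuration API, which this diff does not show.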