dlt-iceberg 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dlt_iceberg/__init__.py +6 -0
- dlt_iceberg/adapter.py +276 -0
- dlt_iceberg/destination.py +117 -16
- dlt_iceberg/destination_client.py +455 -38
- dlt_iceberg/partition_builder.py +12 -6
- dlt_iceberg/schema_converter.py +4 -1
- dlt_iceberg/sql_client.py +222 -0
- dlt_iceberg-0.2.0.dist-info/METADATA +442 -0
- dlt_iceberg-0.2.0.dist-info/RECORD +14 -0
- {dlt_iceberg-0.1.4.dist-info → dlt_iceberg-0.2.0.dist-info}/WHEEL +1 -1
- dlt_iceberg-0.1.4.dist-info/METADATA +0 -314
- dlt_iceberg-0.1.4.dist-info/RECORD +0 -12
- {dlt_iceberg-0.1.4.dist-info → dlt_iceberg-0.2.0.dist-info}/licenses/LICENSE +0 -0
dlt_iceberg/sql_client.py (new file)
@@ -0,0 +1,222 @@
"""
DuckDB SQL client for Iceberg tables.

Provides queryable access to Iceberg tables via DuckDB views using iceberg_scan().
This enables pipeline.dataset() to work with the Iceberg destination.
"""

import logging
from typing import TYPE_CHECKING, Any, List, Tuple

import duckdb
from packaging.version import Version

from dlt.common.destination.exceptions import DestinationUndefinedEntity
from dlt.common.destination.typing import PreparedTableSchema
from dlt.destinations.impl.duckdb.sql_client import WithTableScanners
from dlt.destinations.impl.duckdb.factory import DuckDbCredentials
from dlt.destinations.sql_client import raise_database_error

if TYPE_CHECKING:
    from dlt_iceberg.destination_client import IcebergRestClient
else:
    IcebergRestClient = Any

logger = logging.getLogger(__name__)


class IcebergSqlClient(WithTableScanners):
    """SQL client that maps Iceberg tables as DuckDB views.

    Creates DuckDB views using iceberg_scan() that point to the Iceberg
    table metadata, enabling SQL queries over Iceberg tables.
    """

    def __init__(
        self,
        remote_client: "IcebergRestClient",
        dataset_name: str,
        cache_db: DuckDbCredentials = None,
        persist_secrets: bool = False,
    ) -> None:
        super().__init__(remote_client, dataset_name, cache_db, persist_secrets=persist_secrets)
        self.remote_client: "IcebergRestClient" = remote_client
        self.iceberg_initialized = False

    def can_create_view(self, table_schema: PreparedTableSchema) -> bool:
        """Check if a view can be created for this table."""
        # All Iceberg tables can have views created
        return True

    def should_replace_view(self, view_name: str, table_schema: PreparedTableSchema) -> bool:
        """Determine if view should be replaced to get fresh data."""
        # Always replace to get latest snapshot
        # TODO: Could optimize with configuration option
        return True

    @raise_database_error
    def open_connection(self) -> duckdb.DuckDBPyConnection:
        """Open DuckDB connection and set up for Iceberg access."""
        with self.credentials.conn_pool._conn_lock:
            first_connection = self.credentials.conn_pool.never_borrowed
            super().open_connection()

            if first_connection:
                # Set up storage credentials if available
                self._setup_storage_credentials()

        return self._conn

    def _setup_storage_credentials(self) -> None:
        """Set up DuckDB secrets for storage access."""
        config = self.remote_client.config

        # S3 credentials
        if config.s3_access_key_id and config.s3_secret_access_key:
            secret_sql = f"""
                CREATE SECRET IF NOT EXISTS iceberg_s3_secret (
                    TYPE S3,
                    KEY_ID '{config.s3_access_key_id}',
                    SECRET '{config.s3_secret_access_key}'
            """
            if config.s3_region:
                secret_sql += f", REGION '{config.s3_region}'"
            if config.s3_endpoint:
                # Handle endpoint URL
                endpoint = config.s3_endpoint
                if endpoint.startswith("http://"):
                    secret_sql += f", ENDPOINT '{endpoint[7:]}', USE_SSL false"
                elif endpoint.startswith("https://"):
                    secret_sql += f", ENDPOINT '{endpoint[8:]}'"
                else:
                    secret_sql += f", ENDPOINT '{endpoint}'"
            secret_sql += ")"

            try:
                self._conn.execute(secret_sql)
                logger.info("Created DuckDB S3 secret for Iceberg access")
            except Exception as e:
                logger.warning(f"Failed to create S3 secret: {e}")

    @raise_database_error
    def create_view(self, view_name: str, table_schema: PreparedTableSchema) -> None:
        """Create a DuckDB view for an Iceberg table using iceberg_scan()."""
        table_name = table_schema["name"]

        # Get table location from catalog
        try:
            table_location = self.remote_client.get_open_table_location("iceberg", table_name)
        except Exception as e:
            raise DestinationUndefinedEntity(table_name) from e

        if not table_location:
            raise DestinationUndefinedEntity(table_name)

        # Ensure iceberg extension is loaded
        if not self.iceberg_initialized:
            self._setup_iceberg(self._conn)
            self.iceberg_initialized = True

        # Get metadata file path
        metadata_file = self._get_metadata_file(table_location)
        if not metadata_file:
            raise DestinationUndefinedEntity(table_name)

        # Check for gzip compression
        compression = ""
        if ".gz." in metadata_file:
            compression = ", metadata_compression_codec = 'gzip'"

        # Scanner options based on DuckDB version
        if Version(duckdb.__version__) > Version("1.3.0"):
            scanner_options = "union_by_name=true"
        else:
            scanner_options = "skip_schema_inference=false"

        # Build column selection from schema
        columns = list(self.schema.get_table_columns(table_name).keys())
        escaped_columns = [self.escape_column_name(c) for c in columns]
        columns_sql = ", ".join(escaped_columns) if columns else "*"

        # Create the view
        view_sql = f"""
            CREATE OR REPLACE VIEW {self.escape_column_name(view_name)} AS
            SELECT {columns_sql}
            FROM iceberg_scan('{metadata_file}'{compression}, {scanner_options})
        """

        logger.info(f"Creating view {view_name} for Iceberg table {table_name}")
        self._conn.execute(view_sql)

    def _get_metadata_file(self, table_location: str) -> str:
        """Get the latest metadata file path for an Iceberg table.

        Args:
            table_location: Base location of the Iceberg table

        Returns:
            Path to the latest metadata JSON file
        """
        # Try to get metadata from the catalog directly
        try:
            # Load the table through PyIceberg to get metadata location
            catalog = self.remote_client._get_catalog()
            namespace = self.remote_client.config.namespace

            # Extract table name from location
            table_name = table_location.rstrip("/").split("/")[-1]
            identifier = f"{namespace}.{table_name}"

            iceberg_table = catalog.load_table(identifier)
            metadata_location = iceberg_table.metadata_location

            if metadata_location:
                return metadata_location
        except Exception as e:
            logger.debug(f"Could not get metadata from catalog: {e}")

        # Fallback: scan metadata directory
        metadata_path = f"{table_location.rstrip('/')}/metadata"
        return self._find_latest_metadata(metadata_path)

    def _find_latest_metadata(self, metadata_path: str) -> str:
        """Find the latest metadata file in a metadata directory.

        Args:
            metadata_path: Path to the metadata directory

        Returns:
            Path to the latest metadata file or empty string
        """
        import os
        from urllib.parse import urlparse

        parsed = urlparse(metadata_path)

        # Only support local filesystem for fallback scan
        if parsed.scheme and parsed.scheme not in ("file", ""):
            logger.warning(
                f"Cannot scan metadata directory for {parsed.scheme} storage, "
                "use catalog for metadata location"
            )
            return ""

        local_path = parsed.path if parsed.scheme == "file" else metadata_path

        if not os.path.exists(local_path):
            return ""

        # Find latest metadata file
        metadata_files = [
            f for f in os.listdir(local_path)
            if f.endswith(".metadata.json")
        ]

        if not metadata_files:
            return ""

        # Sort by version number (format: v1.metadata.json, v2.metadata.json, etc.)
        # or by timestamp for UUID-based names
        metadata_files.sort(reverse=True)

        return f"{metadata_path}/{metadata_files[0]}"
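For context, the view DDL that `create_view()` assembles looks roughly like the standalone sketch below when run against DuckDB directly, outside of dlt. The warehouse path, view name, and column list here are hypothetical, and the scanner option corresponds to the DuckDB > 1.3.0 branch above.

```python
# Illustrative only: roughly the SQL that create_view() builds for one table.
# The metadata path and column names are hypothetical stand-ins.
import duckdb

con = duckdb.connect()
con.execute("INSTALL iceberg")
con.execute("LOAD iceberg")

metadata_file = "/warehouse/analytics/events/metadata/v2.metadata.json"  # hypothetical
con.execute(
    f"""
    CREATE OR REPLACE VIEW events AS
    SELECT event_id, value
    FROM iceberg_scan('{metadata_file}', union_by_name=true)
    """
)
print(con.execute("SELECT count(*) FROM events").fetchone())
```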
dlt_iceberg-0.2.0.dist-info/METADATA (new file)
@@ -0,0 +1,442 @@
Metadata-Version: 2.4
Name: dlt-iceberg
Version: 0.2.0
Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
Project-URL: Issues, https://github.com/sidequery/dlt-iceberg/issues
Author-email: Sidequery <hello@sidequery.com>
License: MIT
License-File: LICENSE
Keywords: data-engineering,data-pipeline,dlt,elt,etl,iceberg
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Database
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.11
Requires-Dist: boto3>=1.40.50
Requires-Dist: dlt>=1.17.1
Requires-Dist: duckdb>=1.4.3
Requires-Dist: pandas>=2.3.3
Requires-Dist: pyarrow>=21.0.0
Requires-Dist: pydantic<2.11
Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
Requires-Dist: requests>=2.32.5
Requires-Dist: s3fs>=0.4.2
Requires-Dist: sqlalchemy>=2.0.44
Description-Content-Type: text/markdown

# dlt-iceberg

A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.apache.org/) tables using REST catalogs.

## Features

- **Atomic Multi-File Commits**: Multiple parquet files committed as a single Iceberg snapshot per table
- **REST Catalog Support**: Works with Nessie, Polaris, AWS Glue, Unity Catalog
- **Credential Vending**: Most REST catalogs vend storage credentials automatically
- **Partitioning**: Full support for Iceberg partition transforms via `iceberg_adapter()`
- **Merge Strategies**: Delete-insert and upsert with hard delete support
- **DuckDB Integration**: Query loaded data via `pipeline.dataset()`
- **Schema Evolution**: Automatic schema updates when adding columns

## Installation

```bash
pip install dlt-iceberg
```

Or with uv:

```bash
uv add dlt-iceberg
```

## Quick Start

```python
import dlt
from dlt_iceberg import iceberg_rest

@dlt.resource(name="events", write_disposition="append")
def generate_events():
    yield {"event_id": 1, "value": 100}

pipeline = dlt.pipeline(
    pipeline_name="my_pipeline",
    destination=iceberg_rest(
        catalog_uri="https://my-catalog.example.com/api/catalog",
        namespace="analytics",
        warehouse="my_warehouse",
        credential="client-id:client-secret",
        oauth2_server_uri="https://my-catalog.example.com/oauth/tokens",
    ),
)

pipeline.run(generate_events())
```

### Query Loaded Data

```python
# Query data via DuckDB
dataset = pipeline.dataset()

# Access as dataframe
df = dataset["events"].df()

# Run SQL queries
result = dataset.query("SELECT * FROM events WHERE value > 50").fetchall()

# Get Arrow table
arrow_table = dataset["events"].arrow()
```

### Merge/Upsert

```python
@dlt.resource(
    name="users",
    write_disposition="merge",
    primary_key="user_id"
)
def generate_users():
    yield {"user_id": 1, "name": "Alice", "status": "active"}

pipeline.run(generate_users())
```

## Configuration

### Required Options

```python
iceberg_rest(
    catalog_uri="...",  # REST catalog endpoint (or sqlite:// for local)
    namespace="...",    # Iceberg namespace (database)
)
```

### Authentication

Choose based on your catalog:

| Catalog | Auth Method |
|---------|-------------|
| Polaris, Lakekeeper | `credential` + `oauth2_server_uri` |
| Unity Catalog | `token` |
| AWS Glue | `sigv4_enabled` + `signing_region` |
| Local SQLite | None needed |

Most REST catalogs (Polaris, Lakekeeper, etc.) **vend storage credentials automatically** via the catalog API. You typically don't need to configure S3/GCS/Azure credentials manually.

<details>
<summary><b>Advanced Options</b></summary>

```python
iceberg_rest(
    # ... required options ...

    # Manual storage credentials (usually not needed with credential vending)
    s3_endpoint="...",
    s3_access_key_id="...",
    s3_secret_access_key="...",
    s3_region="...",

    # Performance tuning
    max_retries=5,            # Retry attempts for transient failures
    retry_backoff_base=2.0,   # Exponential backoff multiplier
    merge_batch_size=500000,  # Rows per batch for merge operations
    strict_casting=False,     # Fail on potential data loss

    # Table management
    table_location_layout=None,            # Custom table location pattern
    register_new_tables=False,             # Register tables found in storage
    hard_delete_column="_dlt_deleted_at",  # Column for hard deletes
)
```

</details>

## Catalog Examples

<details>
<summary><b>Polaris / Lakekeeper</b></summary>

```python
iceberg_rest(
    catalog_uri="https://polaris.example.com/api/catalog",
    warehouse="my_warehouse",
    namespace="production",
    credential="client-id:client-secret",
    oauth2_server_uri="https://polaris.example.com/api/catalog/v1/oauth/tokens",
)
```

Storage credentials are vended automatically by the catalog.

</details>

<details>
<summary><b>Unity Catalog (Databricks)</b></summary>

```python
iceberg_rest(
    catalog_uri="https://<workspace>.cloud.databricks.com/api/2.1/unity-catalog/iceberg-rest",
    warehouse="<catalog-name>",
    namespace="<schema-name>",
    token="<databricks-token>",
)
```

</details>

<details>
<summary><b>AWS Glue</b></summary>

```python
iceberg_rest(
    catalog_uri="https://glue.us-east-1.amazonaws.com/iceberg",
    warehouse="<account-id>:s3tablescatalog/<bucket>",
    namespace="my_database",
    sigv4_enabled=True,
    signing_region="us-east-1",
)
```

Requires AWS credentials in environment (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`).

</details>

<details>
<summary><b>Local SQLite Catalog</b></summary>

```python
iceberg_rest(
    catalog_uri="sqlite:///catalog.db",
    warehouse="file:///path/to/warehouse",
    namespace="my_namespace",
)
```

Great for local development and testing.

</details>

<details>
<summary><b>Nessie (Docker)</b></summary>

```python
iceberg_rest(
    catalog_uri="http://localhost:19120/iceberg/main",
    namespace="my_namespace",
    s3_endpoint="http://localhost:9000",
    s3_access_key_id="minioadmin",
    s3_secret_access_key="minioadmin",
    s3_region="us-east-1",
)
```

Start Nessie + MinIO with `docker compose up -d` (see docker-compose.yml in repo).

</details>

## Partitioning

### Using iceberg_adapter (Recommended)

The `iceberg_adapter` function provides a clean API for configuring Iceberg partitioning:

```python
from dlt_iceberg import iceberg_adapter, iceberg_partition

@dlt.resource(name="events")
def events():
    yield {"event_date": "2024-01-01", "user_id": 123, "region": "US"}

# Single partition
adapted = iceberg_adapter(events, partition="region")

# Multiple partitions with transforms
adapted = iceberg_adapter(
    events,
    partition=[
        iceberg_partition.month("event_date"),
        iceberg_partition.bucket(10, "user_id"),
        "region",  # identity partition
    ]
)

pipeline.run(adapted)
```

### Partition Transforms

```python
# Temporal transforms (for timestamp/date columns)
iceberg_partition.year("created_at")
iceberg_partition.month("created_at")
iceberg_partition.day("created_at")
iceberg_partition.hour("created_at")

# Identity (no transformation)
iceberg_partition.identity("region")

# Bucket (hash into N buckets)
iceberg_partition.bucket(10, "user_id")

# Truncate (truncate to width)
iceberg_partition.truncate(4, "email")

# Custom partition field names
iceberg_partition.month("created_at", "event_month")
iceberg_partition.bucket(8, "user_id", "user_bucket")
```

### Using Column Hints

You can also use dlt column hints for partitioning:

```python
@dlt.resource(
    name="events",
    columns={
        "event_date": {
            "data_type": "date",
            "partition": True,
            "partition_transform": "day",
        },
        "user_id": {
            "data_type": "bigint",
            "partition": True,
            "partition_transform": "bucket[10]",
        }
    }
)
def events():
    ...
```

## Write Dispositions

### Append
```python
write_disposition="append"
```
Adds new data without modifying existing rows.

### Replace
```python
write_disposition="replace"
```
Truncates the table and inserts new data.

### Merge

#### Delete-Insert Strategy (Default)
```python
@dlt.resource(
    write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    primary_key="user_id"
)
```
Deletes matching rows, then inserts new data, in a single atomic transaction.

#### Upsert Strategy
```python
@dlt.resource(
    write_disposition={"disposition": "merge", "strategy": "upsert"},
    primary_key="user_id"
)
```
Updates existing rows, inserts new rows.

#### Hard Deletes

Mark rows for deletion by setting the `_dlt_deleted_at` column:

```python
@dlt.resource(
    write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    primary_key="user_id"
)
def users_with_deletes():
    from datetime import datetime
    yield {"user_id": 1, "name": "alice", "_dlt_deleted_at": None}           # Keep
    yield {"user_id": 2, "name": "bob", "_dlt_deleted_at": datetime.now()}   # Delete
```

## Development

### Run Tests

```bash
# Start Docker services (for Nessie tests)
docker compose up -d

# Run all tests
uv run pytest tests/ -v

# Run only unit tests (no Docker required)
uv run pytest tests/ --ignore=tests/nessie -v

# Run Nessie integration tests
uv run pytest tests/nessie/ -v
```

### Project Structure

```
dlt-iceberg/
├── src/dlt_iceberg/
│   ├── __init__.py              # Public API
│   ├── destination_client.py    # Class-based destination (atomic commits)
│   ├── destination.py           # Function-based destination (legacy)
│   ├── adapter.py               # iceberg_adapter() for partitioning
│   ├── sql_client.py            # DuckDB integration for dataset()
│   ├── schema_converter.py      # dlt → Iceberg schema conversion
│   ├── schema_casting.py        # Arrow table casting
│   ├── schema_evolution.py      # Schema updates
│   ├── partition_builder.py     # Partition specs
│   └── error_handling.py        # Retry logic
├── tests/
│   ├── test_adapter.py          # iceberg_adapter tests
│   ├── test_capabilities.py     # Hard delete, partition names tests
│   ├── test_dataset.py          # DuckDB integration tests
│   ├── test_merge_disposition.py
│   ├── test_schema_evolution.py
│   └── ...
├── examples/
│   ├── incremental_load.py      # CSV incremental loading
│   ├── merge_load.py            # CSV merge/upsert
│   └── data/                    # Sample CSV files
└── docker-compose.yml           # Nessie + MinIO for testing
```

## How It Works

The class-based destination uses dlt's `JobClientBase` interface to accumulate parquet files during a load and commit them atomically in `complete_load()`:

1. dlt extracts data and writes parquet files
2. Each file is registered in module-level global state
3. After all files complete, `complete_load()` is called
4. All files for a table are combined and committed as a single Iceberg snapshot
5. Each table gets one snapshot per load

This ensures atomic commits even though dlt creates multiple client instances.
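The sketch below illustrates this accumulate-then-commit idea using PyIceberg directly; the `pending_files` registry and both helper functions are hypothetical stand-ins for the bookkeeping in `destination_client.py`.

```python
# A minimal sketch of the accumulate-then-commit flow, assuming a REST catalog.
# Names here are illustrative, not the package's actual implementation.
from collections import defaultdict

from pyiceberg.catalog import load_catalog

# Steps 1-2: each finished parquet file is recorded per table during the load.
pending_files: dict[str, list[str]] = defaultdict(list)


def register_file(table_name: str, parquet_path: str) -> None:
    pending_files[table_name].append(parquet_path)


# Steps 3-5: at completion, every file for a table is committed as one snapshot.
def commit_pending(catalog_uri: str, namespace: str) -> None:
    catalog = load_catalog("rest", **{"type": "rest", "uri": catalog_uri})
    for table_name, files in pending_files.items():
        table = catalog.load_table(f"{namespace}.{table_name}")
        table.add_files(file_paths=files)  # one atomic Iceberg snapshot per table
    pending_files.clear()
```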

## License

MIT License - see LICENSE file

## Resources

- [dlt Documentation](https://dlthub.com/docs)
- [Apache Iceberg](https://iceberg.apache.org/)
- [PyIceberg](https://py.iceberg.apache.org/)
- [Iceberg REST Spec](https://iceberg.apache.org/rest-catalog-spec/)
dlt_iceberg-0.2.0.dist-info/RECORD (new file)
@@ -0,0 +1,14 @@
dlt_iceberg/__init__.py,sha256=lwTBzF5tUzh1inDwqqutwcniQzgaK0_H27oRxCiRh5Y,968
dlt_iceberg/adapter.py,sha256=mpwnBz07B84yX2jbixv33-LJOt3XZi6lt_8yP3X6LTM,9674
dlt_iceberg/destination.py,sha256=oE0J8mcuCmR9KX0E6nEJ9kOTLPwPb6lq3ZHVorMlOAc,19409
dlt_iceberg/destination_client.py,sha256=Br-uCIM7QGumRbgw1ehNraRUjeNcbZRlE0uP5KvSwVI,41060
dlt_iceberg/error_handling.py,sha256=k6Kkldi9BDRsXQ63VEBMMSw1xx2-b1BMjsgRFKI2iB0,7852
dlt_iceberg/partition_builder.py,sha256=ERAewxVXbqXh0XX92KXt4a6h9bnKmf4D-uTFSoExbm8,10401
dlt_iceberg/schema_casting.py,sha256=oSQrnOcCMFcinMS65N8YQ1uzrqnQmN50mCCuQyE3794,15247
dlt_iceberg/schema_converter.py,sha256=ImpxvUY4oEietOgycqQZaJJ0mISqVyH4IkQ-fQ_lf6Y,5717
dlt_iceberg/schema_evolution.py,sha256=ieOkCA9ngQdJ5lbZLYQ09deTLZEW8whxDn2arpoH-aM,8326
dlt_iceberg/sql_client.py,sha256=EIHpsH0k4XoEffLbzobm4NJvr0Se6fA7pkc97DQqT88,8202
dlt_iceberg-0.2.0.dist-info/METADATA,sha256=O1oR3OQkMoYdnJBH5iUJgvoYqPA-h1PkLCarsNo7zRI,11747
dlt_iceberg-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
dlt_iceberg-0.2.0.dist-info/licenses/LICENSE,sha256=0amGlcH0msYju3WUhlsuUxO4aj3ZODkkIZ0MKOq9fQ4,1066
dlt_iceberg-0.2.0.dist-info/RECORD,,