dlt-iceberg 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/PKG-INFO +1 -1
  2. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/pyproject.toml +1 -1
  3. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/destination_client.py +43 -13
  4. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_casting.py +64 -1
  5. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/uv.lock +1 -1
  6. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.github/workflows/publish.yml +0 -0
  7. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.github/workflows/test.yml +0 -0
  8. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.gitignore +0 -0
  9. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.python-version +0 -0
  10. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/LICENSE +0 -0
  11. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/README.md +0 -0
  12. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/TESTING.md +0 -0
  13. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/docker-compose.yml +0 -0
  14. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/README.md +0 -0
  15. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/customers_initial.csv +0 -0
  16. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/customers_updates.csv +0 -0
  17. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/events_batch1.csv +0 -0
  18. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/events_batch2.csv +0 -0
  19. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/incremental_load.py +0 -0
  20. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/merge_load.py +0 -0
  21. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/__init__.py +0 -0
  22. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/destination.py +0 -0
  23. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/error_handling.py +0 -0
  24. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/partition_builder.py +0 -0
  25. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_converter.py +0 -0
  26. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_evolution.py +0 -0
  27. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_class_based_atomic.py +0 -0
  28. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_destination_e2e.py +0 -0
  29. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_destination_rest_catalog.py +0 -0
  30. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_e2e_sqlite_catalog.py +0 -0
  31. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_error_handling.py +0 -0
  32. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_merge_disposition.py +0 -0
  33. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_partition_builder.py +0 -0
  34. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_partitioning_e2e.py +0 -0
  35. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_pyiceberg_append.py +0 -0
  36. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_schema_casting.py +0 -0
  37. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_schema_converter.py +0 -0
  38. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_schema_evolution.py +0 -0
  39. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_smoke.py +0 -0
--- dlt_iceberg-0.1.2/PKG-INFO
+++ dlt_iceberg-0.1.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dlt-iceberg
-Version: 0.1.2
+Version: 0.1.3
 Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
 Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
 Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
--- dlt_iceberg-0.1.2/pyproject.toml
+++ dlt_iceberg-0.1.3/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dlt-iceberg"
-version = "0.1.2"
+version = "0.1.3"
 description = "dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs"
 readme = "README.md"
 requires-python = ">=3.11"
--- dlt_iceberg-0.1.2/src/dlt_iceberg/destination_client.py
+++ dlt_iceberg-0.1.3/src/dlt_iceberg/destination_client.py
@@ -34,7 +34,11 @@ from pyiceberg.exceptions import (
 from .schema_converter import convert_dlt_to_iceberg_schema
 from .partition_builder import build_partition_spec
 from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
-from .schema_casting import cast_table_safe, CastingError
+from .schema_casting import (
+    cast_table_safe,
+    CastingError,
+    ensure_iceberg_compatible_arrow_data,
+)
 from .error_handling import (
     is_retryable_error,
     log_error_with_context,
@@ -89,6 +93,9 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     # Schema casting configuration
     strict_casting: bool = False
 
+    # Merge batch size (for upsert operations to avoid memory issues)
+    merge_batch_size: int = 100000
+
 
 class IcebergRestLoadJob(RunnableLoadJob):
     """
@@ -380,7 +387,8 @@ class IcebergRestClient(JobClientBase):
         # Create table if needed
         if not table_exists:
             # Use first file's Arrow table to generate schema
-            first_arrow_table = file_data[0][2]
+            # Apply Iceberg compatibility first so schema uses compatible types
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             iceberg_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -401,7 +409,7 @@
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
-            first_arrow_table = file_data[0][2]
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             incoming_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -415,12 +423,15 @@
                 logger.info(f"Schema evolved for table {identifier}")
                 iceberg_table = catalog.load_table(identifier)
 
-        # Combine all Arrow tables and cast to match Iceberg schema
+        # Get expected schema (already has Iceberg-compatible types from creation)
         expected_schema = schema_to_pyarrow(iceberg_table.schema())
+
+        # Combine all Arrow tables and cast to match Iceberg schema
         combined_tables = []
 
         for _, file_path, arrow_table in file_data:
-            # Cast each table to match Iceberg schema
+            # Cast to match Iceberg schema
+            # (compatibility conversions already applied when schema was created)
             casted_table = cast_table_safe(
                 arrow_table,
                 expected_schema,
@@ -463,15 +474,34 @@
                 iceberg_table.append(combined_table)
             else:
                 logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-                upsert_result = iceberg_table.upsert(
-                    df=combined_table,
-                    join_cols=primary_keys,
-                    when_matched_update_all=True,
-                    when_not_matched_insert_all=True,
-                )
+
+                # Batch upserts to avoid memory issues on large datasets
+                batch_size = self.config.merge_batch_size
+                total_updated = 0
+                total_inserted = 0
+
+                for batch_start in range(0, len(combined_table), batch_size):
+                    batch_end = min(batch_start + batch_size, len(combined_table))
+                    batch = combined_table.slice(batch_start, batch_end - batch_start)
+
+                    logger.info(
+                        f"Upserting batch {batch_start//batch_size + 1}: "
+                        f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                    )
+
+                    upsert_result = iceberg_table.upsert(
+                        df=batch,
+                        join_cols=primary_keys,
+                        when_matched_update_all=True,
+                        when_not_matched_insert_all=True,
+                    )
+
+                    total_updated += upsert_result.rows_updated
+                    total_inserted += upsert_result.rows_inserted
+
                 logger.info(
-                    f"Upsert completed: {upsert_result.rows_updated} updated, "
-                    f"{upsert_result.rows_inserted} inserted"
+                    f"Upsert completed: {total_updated} updated, "
+                    f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
                 )
         else:
             raise ValueError(f"Unknown write disposition: {write_disposition}")
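
With this change, merge loads no longer pass the entire combined Arrow table to a single upsert() call; the table is sliced into chunks of at most merge_batch_size rows (100,000 by default) and each chunk is upserted separately. A minimal standalone sketch of the slicing arithmetic, using plain pyarrow (the helper name iter_upsert_batches is invented for this illustration and is not part of the package):

import pyarrow as pa

def iter_upsert_batches(table: pa.Table, batch_size: int = 100_000):
    """Yield zero-copy slices of `table`, each at most `batch_size` rows long."""
    for start in range(0, len(table), batch_size):
        # slice(offset, length) never reads past the end of the table
        yield table.slice(start, min(batch_size, len(table) - start))

combined = pa.table({"id": list(range(250_000)), "value": [0.0] * 250_000})
for i, batch in enumerate(iter_upsert_batches(combined), start=1):
    print(f"batch {i}: {len(batch)} rows")  # 100000, 100000, 50000

Each slice is a view over the same underlying buffers, so batching bounds the size of each upsert operation without copying the combined table.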
--- dlt_iceberg-0.1.2/src/dlt_iceberg/schema_casting.py
+++ dlt_iceberg-0.1.3/src/dlt_iceberg/schema_casting.py
@@ -6,12 +6,75 @@ and allow users to control casting behavior.
 """
 
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Callable
 import pyarrow as pa
 
 logger = logging.getLogger(__name__)
 
 
+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+    """
+    Convert Arrow schema to Iceberg-compatible schema.
+
+    Converts types that Iceberg doesn't support:
+    - time32 → time64 (microseconds)
+    - decimal256 → string (Iceberg only supports decimal128)
+    - dictionary → value_type (unwrap dictionary encoding)
+
+    Args:
+        schema: PyArrow schema
+
+    Returns:
+        Iceberg-compatible PyArrow schema
+    """
+    def convert_field(field: pa.Field) -> pa.Field:
+        field_type = field.type
+
+        # time32 → time64(us)
+        if pa.types.is_time32(field_type):
+            return pa.field(field.name, pa.time64("us"), nullable=field.nullable)
+
+        # decimal256 → string (pyarrow doesn't allow downcasting to decimal128)
+        if pa.types.is_decimal256(field_type):
+            logger.warning(
+                f"Converting decimal256 field '{field.name}' to string "
+                f"(Iceberg doesn't support decimal256)"
+            )
+            return pa.field(field.name, pa.string(), nullable=field.nullable)
+
+        # dictionary → value_type (unwrap dictionary encoding)
+        if pa.types.is_dictionary(field_type):
+            return pa.field(field.name, field_type.value_type, nullable=field.nullable)
+
+        # list/struct types - recursively convert nested fields
+        if pa.types.is_list(field_type):
+            value_field = convert_field(pa.field("item", field_type.value_type))
+            return pa.field(field.name, pa.list_(value_field.type), nullable=field.nullable)
+
+        if pa.types.is_struct(field_type):
+            new_fields = [convert_field(f) for f in field_type]
+            return pa.field(field.name, pa.struct(new_fields), nullable=field.nullable)
+
+        return field
+
+    new_fields = [convert_field(field) for field in schema]
+    return pa.schema(new_fields)
+
+
+def ensure_iceberg_compatible_arrow_data(table: pa.Table) -> pa.Table:
+    """
+    Convert Arrow table to Iceberg-compatible schema and cast data.
+
+    Args:
+        table: PyArrow table
+
+    Returns:
+        Table with Iceberg-compatible schema
+    """
+    new_schema = ensure_iceberg_compatible_arrow_schema(table.schema)
+    return table.cast(new_schema)
+
+
 class CastingError(Exception):
     """Raised when a cast would result in data loss in strict mode."""
     pass
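
The new helpers rewrite an Arrow schema before it reaches the catalog, so table creation and schema evolution only ever see types Iceberg can represent. A short usage sketch, assuming the package layout shown above (the sample column names are made up for the example):

import pyarrow as pa
from dlt_iceberg.schema_casting import ensure_iceberg_compatible_arrow_data

# time32 and dictionary-encoded columns are not representable in Iceberg as-is
table = pa.table({
    "event_time": pa.array([1_000, 2_000], type=pa.time32("ms")),
    "category": pa.array(["a", "b"]).dictionary_encode(),
})

compatible = ensure_iceberg_compatible_arrow_data(table)
print(compatible.schema)
# event_time: time64[us]
# category: string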
--- dlt_iceberg-0.1.2/uv.lock
+++ dlt_iceberg-0.1.3/uv.lock
@@ -182,7 +182,7 @@ wheels = [
 
 [[package]]
 name = "dlt-iceberg"
-version = "0.1.1"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
     { name = "boto3" },