dlt-iceberg 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/PKG-INFO +1 -1
  2. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/pyproject.toml +1 -1
  3. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/destination_client.py +43 -13
  4. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_casting.py +64 -1
  5. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/uv.lock +1 -1
  6. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.github/workflows/publish.yml +0 -0
  7. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.github/workflows/test.yml +0 -0
  8. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.gitignore +0 -0
  9. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/.python-version +0 -0
  10. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/LICENSE +0 -0
  11. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/README.md +0 -0
  12. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/TESTING.md +0 -0
  13. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/docker-compose.yml +0 -0
  14. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/README.md +0 -0
  15. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/customers_initial.csv +0 -0
  16. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/customers_updates.csv +0 -0
  17. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/events_batch1.csv +0 -0
  18. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/data/events_batch2.csv +0 -0
  19. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/incremental_load.py +0 -0
  20. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/examples/merge_load.py +0 -0
  21. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/__init__.py +0 -0
  22. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/destination.py +0 -0
  23. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/error_handling.py +0 -0
  24. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/partition_builder.py +0 -0
  25. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_converter.py +0 -0
  26. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_evolution.py +0 -0
  27. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_class_based_atomic.py +0 -0
  28. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_destination_e2e.py +0 -0
  29. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_destination_rest_catalog.py +0 -0
  30. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_e2e_sqlite_catalog.py +0 -0
  31. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_error_handling.py +0 -0
  32. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_merge_disposition.py +0 -0
  33. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_partition_builder.py +0 -0
  34. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_partitioning_e2e.py +0 -0
  35. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_pyiceberg_append.py +0 -0
  36. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_schema_casting.py +0 -0
  37. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_schema_converter.py +0 -0
  38. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_schema_evolution.py +0 -0
  39. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.3}/tests/test_smoke.py +0 -0
--- dlt_iceberg-0.1.2/PKG-INFO
+++ dlt_iceberg-0.1.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dlt-iceberg
-Version: 0.1.2
+Version: 0.1.3
 Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
 Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
 Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
--- dlt_iceberg-0.1.2/pyproject.toml
+++ dlt_iceberg-0.1.3/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dlt-iceberg"
-version = "0.1.2"
+version = "0.1.3"
 description = "dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs"
 readme = "README.md"
 requires-python = ">=3.11"
--- dlt_iceberg-0.1.2/src/dlt_iceberg/destination_client.py
+++ dlt_iceberg-0.1.3/src/dlt_iceberg/destination_client.py
@@ -34,7 +34,11 @@ from pyiceberg.exceptions import (
 from .schema_converter import convert_dlt_to_iceberg_schema
 from .partition_builder import build_partition_spec
 from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
-from .schema_casting import cast_table_safe, CastingError
+from .schema_casting import (
+    cast_table_safe,
+    CastingError,
+    ensure_iceberg_compatible_arrow_data,
+)
 from .error_handling import (
     is_retryable_error,
     log_error_with_context,
@@ -89,6 +93,9 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     # Schema casting configuration
     strict_casting: bool = False
 
+    # Merge batch size (for upsert operations to avoid memory issues)
+    merge_batch_size: int = 100000
+
 
 class IcebergRestLoadJob(RunnableLoadJob):
     """
@@ -380,7 +387,8 @@ class IcebergRestClient(JobClientBase):
         # Create table if needed
         if not table_exists:
             # Use first file's Arrow table to generate schema
-            first_arrow_table = file_data[0][2]
+            # Apply Iceberg compatibility first so schema uses compatible types
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             iceberg_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -401,7 +409,7 @@
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
-            first_arrow_table = file_data[0][2]
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             incoming_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -415,12 +423,15 @@
                 logger.info(f"Schema evolved for table {identifier}")
                 iceberg_table = catalog.load_table(identifier)
 
-        # Combine all Arrow tables and cast to match Iceberg schema
+        # Get expected schema (already has Iceberg-compatible types from creation)
         expected_schema = schema_to_pyarrow(iceberg_table.schema())
+
+        # Combine all Arrow tables and cast to match Iceberg schema
         combined_tables = []
 
         for _, file_path, arrow_table in file_data:
-            # Cast each table to match Iceberg schema
+            # Cast to match Iceberg schema
+            # (compatibility conversions already applied when schema was created)
             casted_table = cast_table_safe(
                 arrow_table,
                 expected_schema,
@@ -463,15 +474,34 @@
                 iceberg_table.append(combined_table)
             else:
                 logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-                upsert_result = iceberg_table.upsert(
-                    df=combined_table,
-                    join_cols=primary_keys,
-                    when_matched_update_all=True,
-                    when_not_matched_insert_all=True,
-                )
+
+                # Batch upserts to avoid memory issues on large datasets
+                batch_size = self.config.merge_batch_size
+                total_updated = 0
+                total_inserted = 0
+
+                for batch_start in range(0, len(combined_table), batch_size):
+                    batch_end = min(batch_start + batch_size, len(combined_table))
+                    batch = combined_table.slice(batch_start, batch_end - batch_start)
+
+                    logger.info(
+                        f"Upserting batch {batch_start//batch_size + 1}: "
+                        f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                    )
+
+                    upsert_result = iceberg_table.upsert(
+                        df=batch,
+                        join_cols=primary_keys,
+                        when_matched_update_all=True,
+                        when_not_matched_insert_all=True,
+                    )
+
+                    total_updated += upsert_result.rows_updated
+                    total_inserted += upsert_result.rows_inserted
+
                 logger.info(
-                    f"Upsert completed: {upsert_result.rows_updated} updated, "
-                    f"{upsert_result.rows_inserted} inserted"
+                    f"Upsert completed: {total_updated} updated, "
+                    f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
                 )
         else:
             raise ValueError(f"Unknown write disposition: {write_disposition}")
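
With this change, merge loads no longer pass the entire combined Arrow table to a single upsert() call; the table is sliced into chunks of at most merge_batch_size rows (100,000 by default) and each chunk is upserted separately. A minimal standalone sketch of the slicing arithmetic, using plain pyarrow (the helper name iter_upsert_batches is invented for this illustration and is not part of the package):

import pyarrow as pa

def iter_upsert_batches(table: pa.Table, batch_size: int = 100_000):
    """Yield zero-copy slices of `table`, each at most `batch_size` rows long."""
    for start in range(0, len(table), batch_size):
        # slice(offset, length) never reads past the end of the table
        yield table.slice(start, min(batch_size, len(table) - start))

combined = pa.table({"id": list(range(250_000)), "value": [0.0] * 250_000})
for i, batch in enumerate(iter_upsert_batches(combined), start=1):
    print(f"batch {i}: {len(batch)} rows")  # 100000, 100000, 50000

Each slice is a view over the same underlying buffers, so batching bounds the size of each upsert operation without copying the combined table.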
--- dlt_iceberg-0.1.2/src/dlt_iceberg/schema_casting.py
+++ dlt_iceberg-0.1.3/src/dlt_iceberg/schema_casting.py
@@ -6,12 +6,75 @@ and allow users to control casting behavior.
 """
 
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Callable
 import pyarrow as pa
 
 logger = logging.getLogger(__name__)
 
 
+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+    """
+    Convert Arrow schema to Iceberg-compatible schema.
+
+    Converts types that Iceberg doesn't support:
+    - time32 → time64 (microseconds)
+    - decimal256 → string (Iceberg only supports decimal128)
+    - dictionary → value_type (unwrap dictionary encoding)
+
+    Args:
+        schema: PyArrow schema
+
+    Returns:
+        Iceberg-compatible PyArrow schema
+    """
+    def convert_field(field: pa.Field) -> pa.Field:
+        field_type = field.type
+
+        # time32 → time64(us)
+        if pa.types.is_time32(field_type):
+            return pa.field(field.name, pa.time64("us"), nullable=field.nullable)
+
+        # decimal256 → string (pyarrow doesn't allow downcasting to decimal128)
+        if pa.types.is_decimal256(field_type):
+            logger.warning(
+                f"Converting decimal256 field '{field.name}' to string "
+                f"(Iceberg doesn't support decimal256)"
+            )
+            return pa.field(field.name, pa.string(), nullable=field.nullable)
+
+        # dictionary → value_type (unwrap dictionary encoding)
+        if pa.types.is_dictionary(field_type):
+            return pa.field(field.name, field_type.value_type, nullable=field.nullable)
+
+        # list/struct types - recursively convert nested fields
+        if pa.types.is_list(field_type):
+            value_field = convert_field(pa.field("item", field_type.value_type))
+            return pa.field(field.name, pa.list_(value_field.type), nullable=field.nullable)
+
+        if pa.types.is_struct(field_type):
+            new_fields = [convert_field(f) for f in field_type]
+            return pa.field(field.name, pa.struct(new_fields), nullable=field.nullable)
+
+        return field
+
+    new_fields = [convert_field(field) for field in schema]
+    return pa.schema(new_fields)
+
+
+def ensure_iceberg_compatible_arrow_data(table: pa.Table) -> pa.Table:
+    """
+    Convert Arrow table to Iceberg-compatible schema and cast data.
+
+    Args:
+        table: PyArrow table
+
+    Returns:
+        Table with Iceberg-compatible schema
+    """
+    new_schema = ensure_iceberg_compatible_arrow_schema(table.schema)
+    return table.cast(new_schema)
+
+
 class CastingError(Exception):
     """Raised when a cast would result in data loss in strict mode."""
     pass
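
The new helpers rewrite an Arrow schema before it reaches the catalog, so table creation and schema evolution only ever see types Iceberg can represent. A short usage sketch, assuming the package layout shown above (the sample column names are made up for the example):

import pyarrow as pa
from dlt_iceberg.schema_casting import ensure_iceberg_compatible_arrow_data

# time32 and dictionary-encoded columns are not representable in Iceberg as-is
table = pa.table({
    "event_time": pa.array([1_000, 2_000], type=pa.time32("ms")),
    "category": pa.array(["a", "b"]).dictionary_encode(),
})

compatible = ensure_iceberg_compatible_arrow_data(table)
print(compatible.schema)
# event_time: time64[us]
# category: string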
--- dlt_iceberg-0.1.2/uv.lock
+++ dlt_iceberg-0.1.3/uv.lock
@@ -182,7 +182,7 @@ wheels = [
 
 [[package]]
 name = "dlt-iceberg"
-version = "0.1.1"
+version = "0.1.2"
 source = { editable = "." }
 dependencies = [
     { name = "boto3" },