dlt-iceberg 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry, and is provided for informational purposes only.
--- a/dlt_iceberg/destination_client.py
+++ b/dlt_iceberg/destination_client.py
@@ -34,7 +34,11 @@ from pyiceberg.exceptions import (
 from .schema_converter import convert_dlt_to_iceberg_schema
 from .partition_builder import build_partition_spec
 from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
-from .schema_casting import cast_table_safe, CastingError
+from .schema_casting import (
+    cast_table_safe,
+    CastingError,
+    ensure_iceberg_compatible_arrow_data,
+)
 from .error_handling import (
     is_retryable_error,
     log_error_with_context,
@@ -89,6 +93,9 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     # Schema casting configuration
     strict_casting: bool = False

+    # Merge batch size (for upsert operations to avoid memory issues)
+    merge_batch_size: int = 100000
+

 class IcebergRestLoadJob(RunnableLoadJob):
     """
@@ -380,7 +387,8 @@ class IcebergRestClient(JobClientBase):
         # Create table if needed
         if not table_exists:
             # Use first file's Arrow table to generate schema
-            first_arrow_table = file_data[0][2]
+            # Apply Iceberg compatibility first so schema uses compatible types
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             iceberg_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -401,7 +409,7 @@ class IcebergRestClient(JobClientBase):
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
-            first_arrow_table = file_data[0][2]
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             incoming_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -415,12 +423,15 @@ class IcebergRestClient(JobClientBase):
                 logger.info(f"Schema evolved for table {identifier}")
             iceberg_table = catalog.load_table(identifier)

-        # Combine all Arrow tables and cast to match Iceberg schema
+        # Get expected schema (already has Iceberg-compatible types from creation)
         expected_schema = schema_to_pyarrow(iceberg_table.schema())
+
+        # Combine all Arrow tables and cast to match Iceberg schema
         combined_tables = []

         for _, file_path, arrow_table in file_data:
-            # Cast each table to match Iceberg schema
+            # Cast to match Iceberg schema
+            # (compatibility conversions already applied when schema was created)
             casted_table = cast_table_safe(
                 arrow_table,
                 expected_schema,
@@ -463,15 +474,34 @@ class IcebergRestClient(JobClientBase):
                 iceberg_table.append(combined_table)
             else:
                 logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-                upsert_result = iceberg_table.upsert(
-                    df=combined_table,
-                    join_cols=primary_keys,
-                    when_matched_update_all=True,
-                    when_not_matched_insert_all=True,
-                )
+
+                # Batch upserts to avoid memory issues on large datasets
+                batch_size = self.config.merge_batch_size
+                total_updated = 0
+                total_inserted = 0
+
+                for batch_start in range(0, len(combined_table), batch_size):
+                    batch_end = min(batch_start + batch_size, len(combined_table))
+                    batch = combined_table.slice(batch_start, batch_end - batch_start)
+
+                    logger.info(
+                        f"Upserting batch {batch_start//batch_size + 1}: "
+                        f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                    )
+
+                    upsert_result = iceberg_table.upsert(
+                        df=batch,
+                        join_cols=primary_keys,
+                        when_matched_update_all=True,
+                        when_not_matched_insert_all=True,
+                    )
+
+                    total_updated += upsert_result.rows_updated
+                    total_inserted += upsert_result.rows_inserted
+
                 logger.info(
-                    f"Upsert completed: {upsert_result.rows_updated} updated, "
-                    f"{upsert_result.rows_inserted} inserted"
+                    f"Upsert completed: {total_updated} updated, "
+                    f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
                 )
         else:
             raise ValueError(f"Unknown write disposition: {write_disposition}")
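
The merge path above now slices the combined Arrow table into fixed-size batches before each `upsert` call. A standalone sketch of the same slicing arithmetic on a throwaway PyArrow table (no catalog involved), showing how many batches a load produces:

```python
import pyarrow as pa

# Throwaway example table standing in for `combined_table` in the load job above.
table = pa.table({"user_id": list(range(250_000)), "value": [0] * 250_000})
batch_size = 100_000  # the new merge_batch_size default

# Ceiling division, as in the summary log line: 250,000 rows -> 3 batches.
num_batches = (len(table) + batch_size - 1) // batch_size
print(f"{len(table)} rows -> {num_batches} batches of up to {batch_size}")

for batch_start in range(0, len(table), batch_size):
    batch_end = min(batch_start + batch_size, len(table))
    batch = table.slice(batch_start, batch_end - batch_start)  # zero-copy view
    # In the real job each `batch` is what gets passed to iceberg_table.upsert(...).
    print(f"batch rows {batch_start}..{batch_end}: {len(batch)} rows")
```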
--- a/dlt_iceberg/schema_casting.py
+++ b/dlt_iceberg/schema_casting.py
@@ -6,12 +6,75 @@ and allow users to control casting behavior.
 """

 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Callable
 import pyarrow as pa

 logger = logging.getLogger(__name__)


+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+    """
+    Convert Arrow schema to Iceberg-compatible schema.
+
+    Converts types that Iceberg doesn't support:
+    - time32 → time64 (microseconds)
+    - decimal256 → string (Iceberg only supports decimal128)
+    - dictionary → value_type (unwrap dictionary encoding)
+
+    Args:
+        schema: PyArrow schema
+
+    Returns:
+        Iceberg-compatible PyArrow schema
+    """
+    def convert_field(field: pa.Field) -> pa.Field:
+        field_type = field.type
+
+        # time32 → time64(us)
+        if pa.types.is_time32(field_type):
+            return pa.field(field.name, pa.time64("us"), nullable=field.nullable)
+
+        # decimal256 → string (pyarrow doesn't allow downcasting to decimal128)
+        if pa.types.is_decimal256(field_type):
+            logger.warning(
+                f"Converting decimal256 field '{field.name}' to string "
+                f"(Iceberg doesn't support decimal256)"
+            )
+            return pa.field(field.name, pa.string(), nullable=field.nullable)
+
+        # dictionary → value_type (unwrap dictionary encoding)
+        if pa.types.is_dictionary(field_type):
+            return pa.field(field.name, field_type.value_type, nullable=field.nullable)
+
+        # list/struct types - recursively convert nested fields
+        if pa.types.is_list(field_type):
+            value_field = convert_field(pa.field("item", field_type.value_type))
+            return pa.field(field.name, pa.list_(value_field.type), nullable=field.nullable)
+
+        if pa.types.is_struct(field_type):
+            new_fields = [convert_field(f) for f in field_type]
+            return pa.field(field.name, pa.struct(new_fields), nullable=field.nullable)
+
+        return field
+
+    new_fields = [convert_field(field) for field in schema]
+    return pa.schema(new_fields)
+
+
+def ensure_iceberg_compatible_arrow_data(table: pa.Table) -> pa.Table:
+    """
+    Convert Arrow table to Iceberg-compatible schema and cast data.
+
+    Args:
+        table: PyArrow table
+
+    Returns:
+        Table with Iceberg-compatible schema
+    """
+    new_schema = ensure_iceberg_compatible_arrow_schema(table.schema)
+    return table.cast(new_schema)
+
+
 class CastingError(Exception):
     """Raised when a cast would result in data loss in strict mode."""
     pass
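
A short usage sketch of the new helper, assuming it is imported from the packaged `dlt_iceberg.schema_casting` module listed in RECORD; the example columns are made up to trigger the time32 and dictionary conversions:

```python
import pyarrow as pa
from dlt_iceberg.schema_casting import ensure_iceberg_compatible_arrow_data

table = pa.table({
    # time32[ms] column -> converted to time64[us]
    "t": pa.array([1_000, 2_000], type=pa.time32("ms")),
    # dictionary-encoded strings -> unwrapped to plain string
    "category": pa.array(["a", "b"]).dictionary_encode(),
})

converted = ensure_iceberg_compatible_arrow_data(table)
print(converted.schema)
# t: time64[us]
# category: string
```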
--- /dev/null
+++ b/dlt_iceberg-0.1.3.dist-info/METADATA
@@ -0,0 +1,279 @@
+Metadata-Version: 2.4
+Name: dlt-iceberg
+Version: 0.1.3
+Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
+Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
+Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
+Project-URL: Issues, https://github.com/sidequery/dlt-iceberg/issues
+Author-email: Sidequery <hello@sidequery.com>
+License: MIT
+License-File: LICENSE
+Keywords: data-engineering,data-pipeline,dlt,elt,etl,iceberg
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.11
+Requires-Dist: boto3>=1.40.50
+Requires-Dist: dlt>=1.17.1
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pydantic<2.11
+Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
+Requires-Dist: requests>=2.32.5
+Requires-Dist: s3fs>=0.4.2
+Requires-Dist: sqlalchemy>=2.0.44
+Description-Content-Type: text/markdown
+
+# dlt-iceberg
+
+A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.apache.org/) tables using REST catalogs.
+
+## Features
+
+- **Atomic Multi-File Commits**: Multiple parquet files committed as single Iceberg snapshot per table
+- **REST Catalog Support**: Works with Nessie, Polaris, AWS Glue, Unity Catalog
+- **Partitioning**: Full support for Iceberg partition transforms (temporal, bucket, truncate, identity)
+- **Authentication**: OAuth2, Bearer token, AWS SigV4
+- **Write Dispositions**: Append, replace, merge (upsert)
+- **Schema Evolution**: Automatic schema updates when adding columns
+- **Retry Logic**: Exponential backoff for transient failures
+
+## Installation
+
+```bash
+git clone https://github.com/sidequery/dlt-iceberg.git
+cd dlt-iceberg
+uv sync
+```
+
+## Quick Start
+
+See [examples/](examples/) directory for working examples.
+
+### Incremental Load
+
+```python
+import dlt
+from dlt_iceberg import iceberg_rest
+
+@dlt.resource(name="events", write_disposition="append")
+def generate_events():
+    yield {"event_id": 1, "value": 100}
+
+pipeline = dlt.pipeline(
+    pipeline_name="my_pipeline",
+    destination=iceberg_rest(
+        catalog_uri="http://localhost:19120/iceberg/main",
+        namespace="analytics",
+        s3_endpoint="http://localhost:9000",
+        s3_access_key_id="minioadmin",
+        s3_secret_access_key="minioadmin",
+        s3_region="us-east-1",
+    ),
+)
+
+pipeline.run(generate_events())
+```
+
+### Merge/Upsert
+
+```python
+@dlt.resource(
+    name="users",
+    write_disposition="merge",
+    primary_key="user_id"
+)
+def generate_users():
+    yield {"user_id": 1, "name": "Alice", "status": "active"}
+
+pipeline.run(generate_users())
+```
+
+## Configuration
+
+### Nessie (Docker)
+
+```python
+iceberg_rest(
+    catalog_uri="http://localhost:19120/iceberg/main",
+    namespace="my_namespace",
+    s3_endpoint="http://localhost:9000",
+    s3_access_key_id="minioadmin",
+    s3_secret_access_key="minioadmin",
+    s3_region="us-east-1",
+)
+```
+
+Start services: `docker compose up -d`
+
+### AWS Glue
+
+```python
+iceberg_rest(
+    catalog_uri="https://glue.us-east-1.amazonaws.com/iceberg",
+    warehouse="<account-id>:s3tablescatalog/<bucket>",
+    namespace="my_database",
+    sigv4_enabled=True,
+    signing_region="us-east-1",
+)
+```
+
+AWS credentials via environment variables.
+
+### Polaris
+
+```python
+iceberg_rest(
+    catalog_uri="https://polaris.example.com/api/catalog",
+    warehouse="s3://bucket/warehouse",
+    namespace="production",
+    credential="client-id:client-secret",
+    oauth2_server_uri="https://polaris.example.com/api/catalog/v1/oauth/tokens",
+)
+```
+
+### Unity Catalog
+
+```python
+iceberg_rest(
+    catalog_uri="https://<workspace>.cloud.databricks.com/api/2.1/unity-catalog/iceberg-rest",
+    warehouse="<catalog-name>",
+    namespace="<schema-name>",
+    token="<databricks-token>",
+)
+```
+
+## Partitioning
+
+Mark columns for partitioning using dlt column hints:
+
+```python
+@dlt.resource(
+    name="events",
+    columns={
+        "event_date": {
+            "data_type": "date",
+            "partition": True,
+            "partition_transform": "day",  # Optional: year, month, day, hour
+        },
+        "region": {
+            "data_type": "text",
+            "partition": True,  # Uses identity transform for strings
+        },
+        "user_id": {
+            "data_type": "bigint",
+            "partition": True,
+            "partition_transform": "bucket[10]",  # Hash into 10 buckets
+        }
+    }
+)
+def events():
+    ...
+```
+
+### Available Transforms
+
+- **Temporal**: `year`, `month`, `day`, `hour` (for timestamp/date columns)
+- **Identity**: No transformation (default for string/integer)
+- **Bucket**: `bucket[N]` - Hash-based partitioning into N buckets
+- **Truncate**: `truncate[N]` - Truncate strings/integers to N width
+
+### Default Behavior
+
+If `partition_transform` is not specified:
+- Timestamp/date columns default to `month`
+- String/integer columns default to `identity`
+
+## Write Dispositions
+
+### Append
+```python
+write_disposition="append"
+```
+Adds new data without modifying existing rows.
+
+### Replace
+```python
+write_disposition="replace"
+```
+Truncates table and inserts new data.
+
+### Merge
+```python
+write_disposition="merge"
+primary_key="user_id"
+```
+Updates existing rows by primary key, inserts new rows.
+
+## Development
+
+### Run Tests
+
+```bash
+# Start Docker services
+docker compose up -d
+
+# Run all tests
+uv run pytest tests/ -v
+
+# Run only unit tests
+uv run pytest tests/ -v -m "not integration"
+
+# Run only integration tests
+uv run pytest tests/ -v -m integration
+```
+
+### Project Structure
+
+```
+dlt-iceberg/
+├── src/dlt_iceberg/
+│   ├── __init__.py              # Public API
+│   ├── destination_client.py    # Class-based destination (atomic commits)
+│   ├── destination.py           # Function-based destination (legacy)
+│   ├── schema_converter.py      # dlt → Iceberg schema conversion
+│   ├── schema_casting.py        # Arrow table casting
+│   ├── schema_evolution.py      # Schema updates
+│   ├── partition_builder.py     # Partition specs
+│   └── error_handling.py        # Retry logic
+├── tests/
+│   ├── test_destination_rest_catalog.py  # Integration tests (Docker)
+│   ├── test_class_based_atomic.py        # Atomic commit tests
+│   ├── test_merge_disposition.py
+│   ├── test_schema_evolution.py
+│   └── ...
+├── examples/
+│   ├── incremental_load.py      # CSV incremental loading
+│   ├── merge_load.py            # CSV merge/upsert
+│   └── data/                    # Sample CSV files
+└── docker-compose.yml           # Nessie + MinIO for testing
+```
+
+## How It Works
+
+The class-based destination uses dlt's `JobClientBase` interface to accumulate parquet files during a load and commit them atomically in `complete_load()`:
+
+1. dlt extracts data and writes parquet files
+2. Each file is registered in module-level global state
+3. After all files complete, `complete_load()` is called
+4. All files for a table are combined and committed as single Iceberg snapshot
+5. Each table gets one snapshot per load
+
+This ensures atomic commits even though dlt creates multiple client instances.
+
+## License
+
+MIT License - see LICENSE file
+
+## Resources
+
+- [dlt Documentation](https://dlthub.com/docs)
+- [Apache Iceberg](https://iceberg.apache.org/)
+- [PyIceberg](https://py.iceberg.apache.org/)
+- [Iceberg REST Spec](https://iceberg.apache.org/rest-catalog-spec/)
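
The "How It Works" notes above describe parquet files being registered in module-level state and committed together in `complete_load()`. A rough illustration of that accumulation pattern, with all dlt and pyiceberg specifics omitted and every name hypothetical:

```python
from collections import defaultdict

# Module-level registry: survives across the multiple client instances dlt creates.
_PENDING_FILES: dict[str, list[str]] = defaultdict(list)  # table name -> parquet paths

def register_file(table_name: str, parquet_path: str) -> None:
    # Called as each load job finishes writing a parquet file.
    _PENDING_FILES[table_name].append(parquet_path)

def complete_load() -> None:
    # One combined commit per table, so each table gets a single snapshot per load.
    for table_name, paths in _PENDING_FILES.items():
        print(f"committing {len(paths)} file(s) to {table_name} as one snapshot")
    _PENDING_FILES.clear()
```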
--- a/dlt_iceberg-0.1.1.dist-info/RECORD
+++ b/dlt_iceberg-0.1.3.dist-info/RECORD
@@ -1,12 +1,12 @@
 dlt_iceberg/__init__.py,sha256=ONy6E-sGcCvvqia8_fGaYp8da4n4wdjox9W42tmQPK0,780
 dlt_iceberg/destination.py,sha256=F8QJXsQeosOA32Xm1140DL485WQmxbuhiA2QZ6zpVSU,15737
-dlt_iceberg/destination_client.py,sha256=dyJtHHy2Ow0GIFVj17LePC76rKw6MiVJnrS-y28OctQ,22341
+dlt_iceberg/destination_client.py,sha256=l1q8GYvIJ_tBgoQ979IS3VtUQNmg2-hYv80XZkAVFKs,23786
 dlt_iceberg/error_handling.py,sha256=k6Kkldi9BDRsXQ63VEBMMSw1xx2-b1BMjsgRFKI2iB0,7852
 dlt_iceberg/partition_builder.py,sha256=l9YNAh2t6gk2xqsPSOs8ymTDLk9BOEZWVOtVni7ONNU,10081
-dlt_iceberg/schema_casting.py,sha256=Qn4sarRnyJM04lKvKonEjvlvVdizUOGI65J_AmzbEAs,12997
+dlt_iceberg/schema_casting.py,sha256=oSQrnOcCMFcinMS65N8YQ1uzrqnQmN50mCCuQyE3794,15247
 dlt_iceberg/schema_converter.py,sha256=e_eqXQz2cpABOGEAxVwcGbiOdVmv9kaZanRnU83lzXk,5619
 dlt_iceberg/schema_evolution.py,sha256=ieOkCA9ngQdJ5lbZLYQ09deTLZEW8whxDn2arpoH-aM,8326
-dlt_iceberg-0.1.1.dist-info/METADATA,sha256=hhtEkMwpG_rQBUULTeyoMsSevGGEIhCqOjTJJgCw8qY,466
-dlt_iceberg-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-dlt_iceberg-0.1.1.dist-info/licenses/LICENSE,sha256=0amGlcH0msYju3WUhlsuUxO4aj3ZODkkIZ0MKOq9fQ4,1066
-dlt_iceberg-0.1.1.dist-info/RECORD,,
+dlt_iceberg-0.1.3.dist-info/METADATA,sha256=p7kTtuGvCXpuUT37AlgOh1Y7HcaCq9x6n_KDb5Ccnxk,7797
+dlt_iceberg-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dlt_iceberg-0.1.3.dist-info/licenses/LICENSE,sha256=0amGlcH0msYju3WUhlsuUxO4aj3ZODkkIZ0MKOq9fQ4,1066
+dlt_iceberg-0.1.3.dist-info/RECORD,,
--- a/dlt_iceberg-0.1.1.dist-info/METADATA
+++ /dev/null
@@ -1,15 +0,0 @@
-Metadata-Version: 2.4
-Name: dlt-iceberg
-Version: 0.1.1
-Summary: dlt custom destination for Apache Iceberg with REST catalog support
-License-File: LICENSE
-Requires-Python: >=3.11
-Requires-Dist: boto3>=1.40.50
-Requires-Dist: dlt>=1.17.1
-Requires-Dist: pandas>=2.3.3
-Requires-Dist: pyarrow>=21.0.0
-Requires-Dist: pydantic<2.11
-Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
-Requires-Dist: requests>=2.32.5
-Requires-Dist: s3fs>=0.4.2
-Requires-Dist: sqlalchemy>=2.0.44