dlt-iceberg 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dlt_iceberg-0.1.3/PKG-INFO +279 -0
- dlt_iceberg-0.1.3/pyproject.toml +58 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/destination_client.py +43 -13
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_casting.py +64 -1
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/uv.lock +753 -752
- dlt_iceberg-0.1.1/PKG-INFO +0 -15
- dlt_iceberg-0.1.1/pyproject.toml +0 -36
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/.github/workflows/publish.yml +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/.github/workflows/test.yml +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/.gitignore +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/.python-version +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/LICENSE +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/README.md +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/TESTING.md +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/docker-compose.yml +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/README.md +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/data/customers_initial.csv +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/data/customers_updates.csv +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/data/events_batch1.csv +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/data/events_batch2.csv +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/incremental_load.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/examples/merge_load.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/__init__.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/destination.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/error_handling.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/partition_builder.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_converter.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/src/dlt_iceberg/schema_evolution.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_class_based_atomic.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_destination_e2e.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_destination_rest_catalog.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_e2e_sqlite_catalog.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_error_handling.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_merge_disposition.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_partition_builder.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_partitioning_e2e.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_pyiceberg_append.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_schema_casting.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_schema_converter.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_schema_evolution.py +0 -0
- {dlt_iceberg-0.1.1 → dlt_iceberg-0.1.3}/tests/test_smoke.py +0 -0
--- /dev/null
+++ dlt_iceberg-0.1.3/PKG-INFO
@@ -0,0 +1,279 @@
+Metadata-Version: 2.4
+Name: dlt-iceberg
+Version: 0.1.3
+Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
+Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
+Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
+Project-URL: Issues, https://github.com/sidequery/dlt-iceberg/issues
+Author-email: Sidequery <hello@sidequery.com>
+License: MIT
+License-File: LICENSE
+Keywords: data-engineering,data-pipeline,dlt,elt,etl,iceberg
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.11
+Requires-Dist: boto3>=1.40.50
+Requires-Dist: dlt>=1.17.1
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pydantic<2.11
+Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
+Requires-Dist: requests>=2.32.5
+Requires-Dist: s3fs>=0.4.2
+Requires-Dist: sqlalchemy>=2.0.44
+Description-Content-Type: text/markdown
+
+# dlt-iceberg
+
+A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.apache.org/) tables using REST catalogs.
+
+## Features
+
+- **Atomic Multi-File Commits**: Multiple parquet files committed as single Iceberg snapshot per table
+- **REST Catalog Support**: Works with Nessie, Polaris, AWS Glue, Unity Catalog
+- **Partitioning**: Full support for Iceberg partition transforms (temporal, bucket, truncate, identity)
+- **Authentication**: OAuth2, Bearer token, AWS SigV4
+- **Write Dispositions**: Append, replace, merge (upsert)
+- **Schema Evolution**: Automatic schema updates when adding columns
+- **Retry Logic**: Exponential backoff for transient failures
+
+## Installation
+
+```bash
+git clone https://github.com/sidequery/dlt-iceberg.git
+cd dlt-iceberg
+uv sync
+```
+
+## Quick Start
+
+See [examples/](examples/) directory for working examples.
+
+### Incremental Load
+
+```python
+import dlt
+from dlt_iceberg import iceberg_rest
+
+@dlt.resource(name="events", write_disposition="append")
+def generate_events():
+    yield {"event_id": 1, "value": 100}
+
+pipeline = dlt.pipeline(
+    pipeline_name="my_pipeline",
+    destination=iceberg_rest(
+        catalog_uri="http://localhost:19120/iceberg/main",
+        namespace="analytics",
+        s3_endpoint="http://localhost:9000",
+        s3_access_key_id="minioadmin",
+        s3_secret_access_key="minioadmin",
+        s3_region="us-east-1",
+    ),
+)
+
+pipeline.run(generate_events())
+```
+
+### Merge/Upsert
+
+```python
+@dlt.resource(
+    name="users",
+    write_disposition="merge",
+    primary_key="user_id"
+)
+def generate_users():
+    yield {"user_id": 1, "name": "Alice", "status": "active"}
+
+pipeline.run(generate_users())
+```
+
+## Configuration
+
+### Nessie (Docker)
+
+```python
+iceberg_rest(
+    catalog_uri="http://localhost:19120/iceberg/main",
+    namespace="my_namespace",
+    s3_endpoint="http://localhost:9000",
+    s3_access_key_id="minioadmin",
+    s3_secret_access_key="minioadmin",
+    s3_region="us-east-1",
+)
+```
+
+Start services: `docker compose up -d`
+
+### AWS Glue
+
+```python
+iceberg_rest(
+    catalog_uri="https://glue.us-east-1.amazonaws.com/iceberg",
+    warehouse="<account-id>:s3tablescatalog/<bucket>",
+    namespace="my_database",
+    sigv4_enabled=True,
+    signing_region="us-east-1",
+)
+```
+
+AWS credentials via environment variables.
+
+### Polaris
+
+```python
+iceberg_rest(
+    catalog_uri="https://polaris.example.com/api/catalog",
+    warehouse="s3://bucket/warehouse",
+    namespace="production",
+    credential="client-id:client-secret",
+    oauth2_server_uri="https://polaris.example.com/api/catalog/v1/oauth/tokens",
+)
+```
+
+### Unity Catalog
+
+```python
+iceberg_rest(
+    catalog_uri="https://<workspace>.cloud.databricks.com/api/2.1/unity-catalog/iceberg-rest",
+    warehouse="<catalog-name>",
+    namespace="<schema-name>",
+    token="<databricks-token>",
+)
+```
+
+## Partitioning
+
+Mark columns for partitioning using dlt column hints:
+
+```python
+@dlt.resource(
+    name="events",
+    columns={
+        "event_date": {
+            "data_type": "date",
+            "partition": True,
+            "partition_transform": "day",  # Optional: year, month, day, hour
+        },
+        "region": {
+            "data_type": "text",
+            "partition": True,  # Uses identity transform for strings
+        },
+        "user_id": {
+            "data_type": "bigint",
+            "partition": True,
+            "partition_transform": "bucket[10]",  # Hash into 10 buckets
+        }
+    }
+)
+def events():
+    ...
+```
+
+### Available Transforms
+
+- **Temporal**: `year`, `month`, `day`, `hour` (for timestamp/date columns)
+- **Identity**: No transformation (default for string/integer)
+- **Bucket**: `bucket[N]` - Hash-based partitioning into N buckets
+- **Truncate**: `truncate[N]` - Truncate strings/integers to N width
+
+### Default Behavior
+
+If `partition_transform` is not specified:
+- Timestamp/date columns default to `month`
+- String/integer columns default to `identity`
+
+## Write Dispositions
+
+### Append
+```python
+write_disposition="append"
+```
+Adds new data without modifying existing rows.
+
+### Replace
+```python
+write_disposition="replace"
+```
+Truncates table and inserts new data.
+
+### Merge
+```python
+write_disposition="merge"
+primary_key="user_id"
+```
+Updates existing rows by primary key, inserts new rows.
+
+## Development
+
+### Run Tests
+
+```bash
+# Start Docker services
+docker compose up -d
+
+# Run all tests
+uv run pytest tests/ -v
+
+# Run only unit tests
+uv run pytest tests/ -v -m "not integration"
+
+# Run only integration tests
+uv run pytest tests/ -v -m integration
+```
+
+### Project Structure
+
+```
+dlt-iceberg/
+├── src/dlt_iceberg/
+│   ├── __init__.py              # Public API
+│   ├── destination_client.py    # Class-based destination (atomic commits)
+│   ├── destination.py           # Function-based destination (legacy)
+│   ├── schema_converter.py      # dlt → Iceberg schema conversion
+│   ├── schema_casting.py        # Arrow table casting
+│   ├── schema_evolution.py      # Schema updates
+│   ├── partition_builder.py     # Partition specs
+│   └── error_handling.py        # Retry logic
+├── tests/
+│   ├── test_destination_rest_catalog.py  # Integration tests (Docker)
+│   ├── test_class_based_atomic.py        # Atomic commit tests
+│   ├── test_merge_disposition.py
+│   ├── test_schema_evolution.py
+│   └── ...
+├── examples/
+│   ├── incremental_load.py      # CSV incremental loading
+│   ├── merge_load.py            # CSV merge/upsert
+│   └── data/                    # Sample CSV files
+└── docker-compose.yml           # Nessie + MinIO for testing
+```
+
+## How It Works
+
+The class-based destination uses dlt's `JobClientBase` interface to accumulate parquet files during a load and commit them atomically in `complete_load()`:
+
+1. dlt extracts data and writes parquet files
+2. Each file is registered in module-level global state
+3. After all files complete, `complete_load()` is called
+4. All files for a table are combined and committed as single Iceberg snapshot
+5. Each table gets one snapshot per load
+
+This ensures atomic commits even though dlt creates multiple client instances.
+
+## License
+
+MIT License - see LICENSE file
+
+## Resources
+
+- [dlt Documentation](https://dlthub.com/docs)
+- [Apache Iceberg](https://iceberg.apache.org/)
+- [PyIceberg](https://py.iceberg.apache.org/)
+- [Iceberg REST Spec](https://iceberg.apache.org/rest-catalog-spec/)
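The "How It Works" section above combines every parquet file for a table and commits the result as one Iceberg snapshot. As a rough sketch of that commit pattern with PyIceberg alone (outside of dlt), using placeholder catalog settings and table names rather than anything defined by this package:

```python
# Sketch only: combine several parquet files and commit them in a single
# Iceberg snapshot. Catalog properties and the table identifier are
# illustrative placeholders, not values shipped with dlt-iceberg.
import pyarrow as pa
import pyarrow.parquet as pq
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "demo",
    **{
        "uri": "http://localhost:19120/iceberg/main",  # e.g. the Nessie REST endpoint
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": "minioadmin",
        "s3.secret-access-key": "minioadmin",
    },
)

# One load may produce several parquet files for the same table.
files = ["load-part-1.parquet", "load-part-2.parquet", "load-part-3.parquet"]
tables = [pq.read_table(path) for path in files]

# Concatenate first, then append once: one snapshot per table per load,
# instead of one snapshot per file.
combined = pa.concat_tables(tables)
events = catalog.load_table("analytics.events")
events.append(combined)
```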
--- /dev/null
+++ dlt_iceberg-0.1.3/pyproject.toml
@@ -0,0 +1,58 @@
+[project]
+name = "dlt-iceberg"
+version = "0.1.3"
+description = "dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs"
+readme = "README.md"
+requires-python = ">=3.11"
+license = {text = "MIT"}
+authors = [
+    {name = "Sidequery", email = "hello@sidequery.com"}
+]
+keywords = ["dlt", "iceberg", "data-engineering", "etl", "elt", "data-pipeline"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Database",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "boto3>=1.40.50",
+    "dlt>=1.17.1",
+    "pandas>=2.3.3",
+    "pyarrow>=21.0.0",
+    "pydantic<2.11",
+    "pyiceberg[pyiceberg-core]>=0.10.0",
+    "requests>=2.32.5",
+    "s3fs>=0.4.2",
+    "sqlalchemy>=2.0.44",
+]
+
+[project.urls]
+Homepage = "https://github.com/sidequery/dlt-iceberg"
+Repository = "https://github.com/sidequery/dlt-iceberg"
+Issues = "https://github.com/sidequery/dlt-iceberg/issues"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/dlt_iceberg"]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.2",
+    "pytest-mock>=3.15.1",
+]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
+testpaths = ["tests"]
+markers = [
+    "integration: marks tests as integration tests that require external services (docker compose)",
+]
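The `[tool.pytest.ini_options]` table above registers an `integration` marker for tests that need the docker compose services. For reference, a test guarded by that marker might look like the following (the test itself is a made-up placeholder):

```python
# Hypothetical test showing how the "integration" marker declared in
# pyproject.toml is used; the body is a placeholder.
import pytest


@pytest.mark.integration
def test_talks_to_docker_services():
    # Selected by:  uv run pytest tests/ -v -m integration
    # Excluded by:  uv run pytest tests/ -v -m "not integration"
    assert True  # a real test would hit the Nessie/MinIO containers
```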
--- dlt_iceberg-0.1.1/src/dlt_iceberg/destination_client.py
+++ dlt_iceberg-0.1.3/src/dlt_iceberg/destination_client.py
@@ -34,7 +34,11 @@ from pyiceberg.exceptions import (
 from .schema_converter import convert_dlt_to_iceberg_schema
 from .partition_builder import build_partition_spec
 from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
-from .schema_casting import
+from .schema_casting import (
+    cast_table_safe,
+    CastingError,
+    ensure_iceberg_compatible_arrow_data,
+)
 from .error_handling import (
     is_retryable_error,
     log_error_with_context,
@@ -89,6 +93,9 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     # Schema casting configuration
     strict_casting: bool = False

+    # Merge batch size (for upsert operations to avoid memory issues)
+    merge_batch_size: int = 100000
+

 class IcebergRestLoadJob(RunnableLoadJob):
     """
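The new `merge_batch_size` field on `IcebergRestConfiguration` defaults to 100000 rows and controls how the upsert path below slices its input. Assuming the `iceberg_rest` factory forwards configuration fields the same way it forwards the options shown in the README (the factory itself is not part of this diff), overriding the default might look like:

```python
# Assumption: iceberg_rest() accepts and forwards merge_batch_size to
# IcebergRestConfiguration; the factory signature is not shown in this diff.
from dlt_iceberg import iceberg_rest

destination = iceberg_rest(
    catalog_uri="http://localhost:19120/iceberg/main",
    namespace="analytics",
    merge_batch_size=50_000,  # upsert in 50k-row batches instead of the 100k default
)
```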
@@ -380,7 +387,8 @@ class IcebergRestClient(JobClientBase):
         # Create table if needed
         if not table_exists:
             # Use first file's Arrow table to generate schema
-            first_arrow_table = file_data[0][2]
+            # Apply Iceberg compatibility first so schema uses compatible types
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             iceberg_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -401,7 +409,7 @@ class IcebergRestClient(JobClientBase):
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
-            first_arrow_table = file_data[0][2]
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             incoming_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -415,12 +423,15 @@ class IcebergRestClient(JobClientBase):
             logger.info(f"Schema evolved for table {identifier}")
             iceberg_table = catalog.load_table(identifier)

-        #
+        # Get expected schema (already has Iceberg-compatible types from creation)
         expected_schema = schema_to_pyarrow(iceberg_table.schema())
+
+        # Combine all Arrow tables and cast to match Iceberg schema
         combined_tables = []

         for _, file_path, arrow_table in file_data:
-            # Cast
+            # Cast to match Iceberg schema
+            # (compatibility conversions already applied when schema was created)
             casted_table = cast_table_safe(
                 arrow_table,
                 expected_schema,
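The hunk above fetches the table's expected Arrow schema with `schema_to_pyarrow` and casts every incoming file to it. A standalone sketch of that conversion step, using a hand-built Iceberg schema instead of one loaded from a catalog and a plain `cast` in place of this package's `cast_table_safe` wrapper:

```python
# Sketch of the expected-schema cast; cast_table_safe in dlt-iceberg adds
# data-loss checks and strict-mode handling on top of this plain cast.
import pyarrow as pa
from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

# Stand-in for iceberg_table.schema().
iceberg_schema = Schema(
    NestedField(field_id=1, name="user_id", field_type=LongType(), required=False),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)
expected_schema = schema_to_pyarrow(iceberg_schema)

# Incoming file with a narrower integer type than the table stores.
incoming = pa.table({
    "user_id": pa.array([1, 2], type=pa.int32()),
    "name": ["alice", "bob"],
})

casted = incoming.cast(expected_schema)  # int32 → int64, column names preserved
print(casted.schema)
```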
@@ -463,15 +474,34 @@
                 iceberg_table.append(combined_table)
             else:
                 logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-
-
-
-
-
-
+
+                # Batch upserts to avoid memory issues on large datasets
+                batch_size = self.config.merge_batch_size
+                total_updated = 0
+                total_inserted = 0
+
+                for batch_start in range(0, len(combined_table), batch_size):
+                    batch_end = min(batch_start + batch_size, len(combined_table))
+                    batch = combined_table.slice(batch_start, batch_end - batch_start)
+
+                    logger.info(
+                        f"Upserting batch {batch_start//batch_size + 1}: "
+                        f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                    )
+
+                    upsert_result = iceberg_table.upsert(
+                        df=batch,
+                        join_cols=primary_keys,
+                        when_matched_update_all=True,
+                        when_not_matched_insert_all=True,
+                    )
+
+                    total_updated += upsert_result.rows_updated
+                    total_inserted += upsert_result.rows_inserted
+
                 logger.info(
-                    f"Upsert completed: {
-                    f"{
+                    f"Upsert completed: {total_updated} updated, "
+                    f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
                 )
         else:
             raise ValueError(f"Unknown write disposition: {write_disposition}")
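The batching loop above walks the combined Arrow table in `merge_batch_size`-row slices and upserts each slice, and the final log line reports the batch count with ceiling division. A small self-contained illustration of that slicing arithmetic (the `iceberg_table.upsert(...)` call is left out because it needs a live catalog):

```python
# Illustration of the slice boundaries and batch count used by the new
# merge path; values are toy-sized stand-ins for merge_batch_size.
import pyarrow as pa

batch_size = 3  # stands in for self.config.merge_batch_size (default 100000)
combined_table = pa.table({
    "user_id": list(range(8)),
    "value": [v * 10 for v in range(8)],
})

total_rows = len(combined_table)
num_batches = (total_rows + batch_size - 1) // batch_size  # ceiling division, as in the log line

for batch_start in range(0, total_rows, batch_size):
    batch_end = min(batch_start + batch_size, total_rows)
    batch = combined_table.slice(batch_start, batch_end - batch_start)
    # destination_client.py passes each `batch` to iceberg_table.upsert(
    #     df=batch, join_cols=primary_keys,
    #     when_matched_update_all=True, when_not_matched_insert_all=True)
    print(f"rows {batch_start} to {batch_end}: {len(batch)} rows")

print(f"{num_batches} batches")  # 8 rows with batch_size=3 → 3 batches (3 + 3 + 2)
```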
--- dlt_iceberg-0.1.1/src/dlt_iceberg/schema_casting.py
+++ dlt_iceberg-0.1.3/src/dlt_iceberg/schema_casting.py
@@ -6,12 +6,75 @@ and allow users to control casting behavior.
 """

 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Callable
 import pyarrow as pa

 logger = logging.getLogger(__name__)


+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+    """
+    Convert Arrow schema to Iceberg-compatible schema.
+
+    Converts types that Iceberg doesn't support:
+    - time32 → time64 (microseconds)
+    - decimal256 → string (Iceberg only supports decimal128)
+    - dictionary → value_type (unwrap dictionary encoding)
+
+    Args:
+        schema: PyArrow schema
+
+    Returns:
+        Iceberg-compatible PyArrow schema
+    """
+    def convert_field(field: pa.Field) -> pa.Field:
+        field_type = field.type
+
+        # time32 → time64(us)
+        if pa.types.is_time32(field_type):
+            return pa.field(field.name, pa.time64("us"), nullable=field.nullable)
+
+        # decimal256 → string (pyarrow doesn't allow downcasting to decimal128)
+        if pa.types.is_decimal256(field_type):
+            logger.warning(
+                f"Converting decimal256 field '{field.name}' to string "
+                f"(Iceberg doesn't support decimal256)"
+            )
+            return pa.field(field.name, pa.string(), nullable=field.nullable)
+
+        # dictionary → value_type (unwrap dictionary encoding)
+        if pa.types.is_dictionary(field_type):
+            return pa.field(field.name, field_type.value_type, nullable=field.nullable)
+
+        # list/struct types - recursively convert nested fields
+        if pa.types.is_list(field_type):
+            value_field = convert_field(pa.field("item", field_type.value_type))
+            return pa.field(field.name, pa.list_(value_field.type), nullable=field.nullable)
+
+        if pa.types.is_struct(field_type):
+            new_fields = [convert_field(f) for f in field_type]
+            return pa.field(field.name, pa.struct(new_fields), nullable=field.nullable)
+
+        return field
+
+    new_fields = [convert_field(field) for field in schema]
+    return pa.schema(new_fields)
+
+
+def ensure_iceberg_compatible_arrow_data(table: pa.Table) -> pa.Table:
+    """
+    Convert Arrow table to Iceberg-compatible schema and cast data.
+
+    Args:
+        table: PyArrow table
+
+    Returns:
+        Table with Iceberg-compatible schema
+    """
+    new_schema = ensure_iceberg_compatible_arrow_schema(table.schema)
+    return table.cast(new_schema)
+
+
 class CastingError(Exception):
     """Raised when a cast would result in data loss in strict mode."""
     pass
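For a quick check of the new helper, the snippet below builds a table with two of the types the docstring calls out (`time32` and a dictionary-encoded string column) and runs it through `ensure_iceberg_compatible_arrow_data`; only the import path comes from this diff, the sample data is made up:

```python
import pyarrow as pa
from dlt_iceberg.schema_casting import ensure_iceberg_compatible_arrow_data

table = pa.table({
    # time32 (seconds) is not an Iceberg type; expect time64[us] after conversion.
    "event_time": pa.array([3600, 7200], type=pa.time32("s")),
    # dictionary encoding is unwrapped to the underlying string type.
    "category": pa.array(["a", "b"]).dictionary_encode(),
})

converted = ensure_iceberg_compatible_arrow_data(table)
print(converted.schema)
# event_time: time64[us]
# category: string
```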