dlt-iceberg 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. dlt_iceberg-0.1.1/.github/workflows/publish.yml +99 -0
  2. dlt_iceberg-0.1.1/.github/workflows/test.yml +47 -0
  3. dlt_iceberg-0.1.1/.gitignore +29 -0
  4. dlt_iceberg-0.1.1/.python-version +1 -0
  5. dlt_iceberg-0.1.1/LICENSE +21 -0
  6. dlt_iceberg-0.1.1/PKG-INFO +15 -0
  7. dlt_iceberg-0.1.1/README.md +247 -0
  8. dlt_iceberg-0.1.1/TESTING.md +284 -0
  9. dlt_iceberg-0.1.1/docker-compose.yml +93 -0
  10. dlt_iceberg-0.1.1/examples/README.md +66 -0
  11. dlt_iceberg-0.1.1/examples/data/customers_initial.csv +6 -0
  12. dlt_iceberg-0.1.1/examples/data/customers_updates.csv +5 -0
  13. dlt_iceberg-0.1.1/examples/data/events_batch1.csv +6 -0
  14. dlt_iceberg-0.1.1/examples/data/events_batch2.csv +6 -0
  15. dlt_iceberg-0.1.1/examples/incremental_load.py +95 -0
  16. dlt_iceberg-0.1.1/examples/merge_load.py +105 -0
  17. dlt_iceberg-0.1.1/pyproject.toml +36 -0
  18. dlt_iceberg-0.1.1/src/dlt_iceberg/__init__.py +28 -0
  19. dlt_iceberg-0.1.1/src/dlt_iceberg/destination.py +400 -0
  20. dlt_iceberg-0.1.1/src/dlt_iceberg/destination_client.py +606 -0
  21. dlt_iceberg-0.1.1/src/dlt_iceberg/error_handling.py +224 -0
  22. dlt_iceberg-0.1.1/src/dlt_iceberg/partition_builder.py +308 -0
  23. dlt_iceberg-0.1.1/src/dlt_iceberg/schema_casting.py +381 -0
  24. dlt_iceberg-0.1.1/src/dlt_iceberg/schema_converter.py +207 -0
  25. dlt_iceberg-0.1.1/src/dlt_iceberg/schema_evolution.py +261 -0
  26. dlt_iceberg-0.1.1/tests/test_class_based_atomic.py +297 -0
  27. dlt_iceberg-0.1.1/tests/test_destination_e2e.py +147 -0
  28. dlt_iceberg-0.1.1/tests/test_destination_rest_catalog.py +681 -0
  29. dlt_iceberg-0.1.1/tests/test_e2e_sqlite_catalog.py +156 -0
  30. dlt_iceberg-0.1.1/tests/test_error_handling.py +375 -0
  31. dlt_iceberg-0.1.1/tests/test_merge_disposition.py +254 -0
  32. dlt_iceberg-0.1.1/tests/test_partition_builder.py +459 -0
  33. dlt_iceberg-0.1.1/tests/test_partitioning_e2e.py +303 -0
  34. dlt_iceberg-0.1.1/tests/test_pyiceberg_append.py +149 -0
  35. dlt_iceberg-0.1.1/tests/test_schema_casting.py +458 -0
  36. dlt_iceberg-0.1.1/tests/test_schema_converter.py +103 -0
  37. dlt_iceberg-0.1.1/tests/test_schema_evolution.py +381 -0
  38. dlt_iceberg-0.1.1/tests/test_smoke.py +128 -0
  39. dlt_iceberg-0.1.1/uv.lock +1386 -0
dlt_iceberg-0.1.1/.github/workflows/publish.yml
@@ -0,0 +1,99 @@
+ name: Publish to PyPI
+
+ on:
+   workflow_dispatch:
+     inputs:
+       version_bump:
+         description: 'Version bump type'
+         required: true
+         type: choice
+         options:
+           - patch
+           - minor
+           - major
+
+ jobs:
+   publish:
+     runs-on: ubuntu-latest
+     permissions:
+       contents: write
+       id-token: write  # For PyPI trusted publishing
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           token: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.11'
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v3
+
+       - name: Get current version
+         id: current_version
+         run: |
+           CURRENT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])")
+           echo "version=$CURRENT_VERSION" >> $GITHUB_OUTPUT
+           echo "Current version: $CURRENT_VERSION"
+
+       - name: Bump version
+         id: bump_version
+         run: |
+           CURRENT="${{ steps.current_version.outputs.version }}"
+           IFS='.' read -r major minor patch <<< "$CURRENT"
+
+           case "${{ github.event.inputs.version_bump }}" in
+             major)
+               NEW_VERSION="$((major + 1)).0.0"
+               ;;
+             minor)
+               NEW_VERSION="${major}.$((minor + 1)).0"
+               ;;
+             patch)
+               NEW_VERSION="${major}.${minor}.$((patch + 1))"
+               ;;
+           esac
+
+           echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT
+           echo "Bumping version: $CURRENT -> $NEW_VERSION"
+
+       - name: Update version in pyproject.toml
+         run: |
+           sed -i 's/^version = ".*"/version = "${{ steps.bump_version.outputs.new_version }}"/' pyproject.toml
+           cat pyproject.toml | grep "^version"
+
+       - name: Build package
+         run: |
+           uv build
+
+       - name: Publish to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_TOKEN }}
+           print-hash: true
+
+       - name: Commit version bump
+         run: |
+           git config user.name "github-actions[bot]"
+           git config user.email "github-actions[bot]@users.noreply.github.com"
+           git add pyproject.toml
+           git commit -m "Bump version to ${{ steps.bump_version.outputs.new_version }}"
+           git push
+
+       - name: Create GitHub Release
+         uses: actions/create-release@v1
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+         with:
+           tag_name: v${{ steps.bump_version.outputs.new_version }}
+           release_name: v${{ steps.bump_version.outputs.new_version }}
+           body: |
+             Release version ${{ steps.bump_version.outputs.new_version }}
+
+             Published to PyPI: https://pypi.org/project/dlt-iceberg/${{ steps.bump_version.outputs.new_version }}/
+           draft: false
+           prerelease: false
dlt_iceberg-0.1.1/.github/workflows/test.yml
@@ -0,0 +1,47 @@
+ name: Tests
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Start Docker services
+         run: docker compose up -d
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+         with:
+           version: "latest"
+
+       - name: Set up Python
+         run: uv python install 3.11
+
+       - name: Install dependencies
+         run: uv sync
+
+       - name: Wait for services
+         run: |
+           echo "Waiting for services to be healthy..."
+           timeout 60 bash -c 'until docker compose ps | grep -q "healthy"; do sleep 2; done'
+           echo "Services are healthy"
+
+       - name: Run unit tests
+         run: uv run pytest tests/ -m "not integration" -v
+
+       - name: Run integration tests
+         run: uv run pytest tests/ -m integration -v -s
+
+       - name: Run all tests
+         run: uv run pytest tests/ -v
+
+       - name: Stop Docker services
+         if: always()
+         run: docker compose down
dlt_iceberg-0.1.1/.gitignore
@@ -0,0 +1,29 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ # dlt
+ .dlt/secrets.toml
+ .dlt/.sources
+ .dlt/pipeline_state/
+ *.duckdb
+ *.duckdb.wal
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+
+ # Testing
+ .pytest_cache/
+ .coverage
+
+ # OS
+ .DS_Store
dlt_iceberg-0.1.1/.python-version
@@ -0,0 +1 @@
+ 3.13
dlt_iceberg-0.1.1/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Sidequery
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
dlt_iceberg-0.1.1/PKG-INFO
@@ -0,0 +1,15 @@
+ Metadata-Version: 2.4
+ Name: dlt-iceberg
+ Version: 0.1.1
+ Summary: dlt custom destination for Apache Iceberg with REST catalog support
+ License-File: LICENSE
+ Requires-Python: >=3.11
+ Requires-Dist: boto3>=1.40.50
+ Requires-Dist: dlt>=1.17.1
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: pyarrow>=21.0.0
+ Requires-Dist: pydantic<2.11
+ Requires-Dist: pyiceberg[pyiceberg-core]>=0.10.0
+ Requires-Dist: requests>=2.32.5
+ Requires-Dist: s3fs>=0.4.2
+ Requires-Dist: sqlalchemy>=2.0.44
dlt_iceberg-0.1.1/README.md
@@ -0,0 +1,247 @@
+ # dlt-iceberg
+
+ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.apache.org/) tables using REST catalogs.
+
+ ## Features
+
+ - **Atomic Multi-File Commits**: Multiple Parquet files committed as a single Iceberg snapshot per table
+ - **REST Catalog Support**: Works with Nessie, Polaris, AWS Glue, Unity Catalog
+ - **Partitioning**: Full support for Iceberg partition transforms (temporal, bucket, truncate, identity)
+ - **Authentication**: OAuth2, Bearer token, AWS SigV4
+ - **Write Dispositions**: Append, replace, merge (upsert)
+ - **Schema Evolution**: Automatic schema updates when adding columns
+ - **Retry Logic**: Exponential backoff for transient failures
+
+ ## Installation
+
+ ```bash
+ git clone https://github.com/sidequery/dlt-iceberg.git
+ cd dlt-iceberg
+ uv sync
+ ```
+
+ ## Quick Start
+
+ See the [examples/](examples/) directory for working examples.
+
+ ### Incremental Load
+
+ ```python
+ import dlt
+ from dlt_iceberg import iceberg_rest
+
+ @dlt.resource(name="events", write_disposition="append")
+ def generate_events():
+     yield {"event_id": 1, "value": 100}
+
+ pipeline = dlt.pipeline(
+     pipeline_name="my_pipeline",
+     destination=iceberg_rest(
+         catalog_uri="http://localhost:19120/iceberg/main",
+         namespace="analytics",
+         s3_endpoint="http://localhost:9000",
+         s3_access_key_id="minioadmin",
+         s3_secret_access_key="minioadmin",
+         s3_region="us-east-1",
+     ),
+ )
+
+ pipeline.run(generate_events())
+ ```
+
+ ### Merge/Upsert
+
+ ```python
+ @dlt.resource(
+     name="users",
+     write_disposition="merge",
+     primary_key="user_id"
+ )
+ def generate_users():
+     yield {"user_id": 1, "name": "Alice", "status": "active"}
+
+ pipeline.run(generate_users())
+ ```
+
+ ## Configuration
+
+ ### Nessie (Docker)
+
+ ```python
+ iceberg_rest(
+     catalog_uri="http://localhost:19120/iceberg/main",
+     namespace="my_namespace",
+     s3_endpoint="http://localhost:9000",
+     s3_access_key_id="minioadmin",
+     s3_secret_access_key="minioadmin",
+     s3_region="us-east-1",
+ )
+ ```
+
+ Start services: `docker compose up -d`
+
+ ### AWS Glue
+
+ ```python
+ iceberg_rest(
+     catalog_uri="https://glue.us-east-1.amazonaws.com/iceberg",
+     warehouse="<account-id>:s3tablescatalog/<bucket>",
+     namespace="my_database",
+     sigv4_enabled=True,
+     signing_region="us-east-1",
+ )
+ ```
+
+ AWS credentials are supplied via environment variables.
+
+ ### Polaris
+
+ ```python
+ iceberg_rest(
+     catalog_uri="https://polaris.example.com/api/catalog",
+     warehouse="s3://bucket/warehouse",
+     namespace="production",
+     credential="client-id:client-secret",
+     oauth2_server_uri="https://polaris.example.com/api/catalog/v1/oauth/tokens",
+ )
+ ```
+
+ ### Unity Catalog
+
+ ```python
+ iceberg_rest(
+     catalog_uri="https://<workspace>.cloud.databricks.com/api/2.1/unity-catalog/iceberg-rest",
+     warehouse="<catalog-name>",
+     namespace="<schema-name>",
+     token="<databricks-token>",
+ )
+ ```
+
+ ## Partitioning
+
+ Mark columns for partitioning using dlt column hints:
+
+ ```python
+ @dlt.resource(
+     name="events",
+     columns={
+         "event_date": {
+             "data_type": "date",
+             "partition": True,
+             "partition_transform": "day",  # Optional: year, month, day, hour
+         },
+         "region": {
+             "data_type": "text",
+             "partition": True,  # Uses identity transform for strings
+         },
+         "user_id": {
+             "data_type": "bigint",
+             "partition": True,
+             "partition_transform": "bucket[10]",  # Hash into 10 buckets
+         },
+     },
+ )
+ def events():
+     ...
+ ```
+
+ ### Available Transforms
+
+ - **Temporal**: `year`, `month`, `day`, `hour` (for timestamp/date columns)
+ - **Identity**: No transformation (default for string/integer)
+ - **Bucket**: `bucket[N]` - Hash-based partitioning into N buckets
+ - **Truncate**: `truncate[N]` - Truncate strings/integers to width N
+
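+ For orientation, these hint strings correspond to PyIceberg's transform classes. A minimal sketch of such a mapping (illustrative only; `transform_for_hint` is a hypothetical helper, and the package's actual logic lives in `src/dlt_iceberg/partition_builder.py`):
+
+ ```python
+ from pyiceberg.transforms import (
+     BucketTransform,
+     DayTransform,
+     HourTransform,
+     IdentityTransform,
+     MonthTransform,
+     TruncateTransform,
+     YearTransform,
+ )
+
+ def transform_for_hint(hint: str):
+     # Parse parameterized transforms like "bucket[10]" or "truncate[4]".
+     if hint.startswith("bucket[") and hint.endswith("]"):
+         return BucketTransform(num_buckets=int(hint[len("bucket["):-1]))
+     if hint.startswith("truncate[") and hint.endswith("]"):
+         return TruncateTransform(width=int(hint[len("truncate["):-1]))
+     # Plain transforms map directly onto PyIceberg classes.
+     return {
+         "year": YearTransform(),
+         "month": MonthTransform(),
+         "day": DayTransform(),
+         "hour": HourTransform(),
+         "identity": IdentityTransform(),
+     }[hint]
+ ```
+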
+ ### Default Behavior
+
+ If `partition_transform` is not specified:
+ - Timestamp/date columns default to `month`
+ - String/integer columns default to `identity`
+
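+ For example, a resource that relies on these defaults (a sketch; the column hints follow the documentation above, and the column names are illustrative):
+
+ ```python
+ import dlt
+
+ @dlt.resource(
+     name="orders",
+     columns={
+         # No partition_transform: timestamp defaults to "month"
+         "created_at": {"data_type": "timestamp", "partition": True},
+         # No partition_transform: text defaults to "identity"
+         "country": {"data_type": "text", "partition": True},
+     },
+ )
+ def orders():
+     yield {"created_at": "2025-01-15T00:00:00Z", "country": "DE"}
+ ```
+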
+ ## Write Dispositions
+
+ ### Append
+ ```python
+ write_disposition="append"
+ ```
+ Adds new data without modifying existing rows.
+
+ ### Replace
+ ```python
+ write_disposition="replace"
+ ```
+ Truncates the table and inserts new data.
+
+ ### Merge
+ ```python
+ write_disposition="merge"
+ primary_key="user_id"
+ ```
+ Updates existing rows by primary key and inserts new rows.
+
+ ## Development
+
+ ### Run Tests
+
+ ```bash
+ # Start Docker services
+ docker compose up -d
+
+ # Run all tests
+ uv run pytest tests/ -v
+
+ # Run only unit tests
+ uv run pytest tests/ -v -m "not integration"
+
+ # Run only integration tests
+ uv run pytest tests/ -v -m integration
+ ```
+
+ ### Project Structure
+
+ ```
+ dlt-iceberg/
+ ├── src/dlt_iceberg/
+ │   ├── __init__.py              # Public API
+ │   ├── destination_client.py    # Class-based destination (atomic commits)
+ │   ├── destination.py           # Function-based destination (legacy)
+ │   ├── schema_converter.py      # dlt → Iceberg schema conversion
+ │   ├── schema_casting.py        # Arrow table casting
+ │   ├── schema_evolution.py      # Schema updates
+ │   ├── partition_builder.py     # Partition specs
+ │   └── error_handling.py        # Retry logic
+ ├── tests/
+ │   ├── test_destination_rest_catalog.py  # Integration tests (Docker)
+ │   ├── test_class_based_atomic.py        # Atomic commit tests
+ │   ├── test_merge_disposition.py
+ │   ├── test_schema_evolution.py
+ │   └── ...
+ ├── examples/
+ │   ├── incremental_load.py      # CSV incremental loading
+ │   ├── merge_load.py            # CSV merge/upsert
+ │   └── data/                    # Sample CSV files
+ └── docker-compose.yml           # Nessie + MinIO for testing
+ ```
+
+ ## How It Works
+
+ The class-based destination uses dlt's `JobClientBase` interface to accumulate Parquet files during a load and commit them atomically in `complete_load()`:
+
+ 1. dlt extracts data and writes Parquet files
+ 2. Each file is registered in module-level global state
+ 3. After all files complete, `complete_load()` is called
+ 4. All files for a table are combined and committed as a single Iceberg snapshot
+ 5. Each table gets one snapshot per load
+
+ This ensures atomic commits even though dlt creates multiple client instances.
+
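+ A minimal sketch of this accumulate-then-commit pattern (illustrative only; `_PENDING_FILES`, `register_file`, and `commit_pending` are hypothetical names, not this package's actual internals):
+
+ ```python
+ from collections import defaultdict
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ # Module-level state survives across the multiple client instances dlt creates.
+ _PENDING_FILES: dict[str, list[str]] = defaultdict(list)
+
+ def register_file(table_name: str, parquet_path: str) -> None:
+     """Record a completed load job's file; no catalog I/O happens yet."""
+     _PENDING_FILES[table_name].append(parquet_path)
+
+ def commit_pending(catalog, namespace: str) -> None:
+     """Called once from complete_load(): one append -> one snapshot per table."""
+     for table_name, paths in _PENDING_FILES.items():
+         table = catalog.load_table(f"{namespace}.{table_name}")
+         combined = pa.concat_tables([pq.read_table(p) for p in paths])
+         table.append(combined)  # single atomic Iceberg commit
+     _PENDING_FILES.clear()
+ ```
+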
+ ## License
+
+ MIT License - see the LICENSE file
+
+ ## Resources
+
+ - [dlt Documentation](https://dlthub.com/docs)
+ - [Apache Iceberg](https://iceberg.apache.org/)
+ - [PyIceberg](https://py.iceberg.apache.org/)
+ - [Iceberg REST Spec](https://iceberg.apache.org/rest-catalog-spec/)