dlt-iceberg 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/PKG-INFO +40 -5
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/README.md +39 -4
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/README.md +30 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/incremental_load.py +8 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/merge_load.py +9 -0
- dlt_iceberg-0.1.4/examples/usgs_earthquakes.py +234 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/pyproject.toml +1 -1
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/destination_client.py +43 -13
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_casting.py +64 -1
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/uv.lock +1 -1
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.github/workflows/publish.yml +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.github/workflows/test.yml +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.gitignore +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.python-version +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/LICENSE +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/TESTING.md +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/docker-compose.yml +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/customers_initial.csv +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/customers_updates.csv +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/events_batch1.csv +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/events_batch2.csv +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/__init__.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/destination.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/error_handling.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/partition_builder.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_converter.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_evolution.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_class_based_atomic.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_destination_e2e.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_destination_rest_catalog.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_e2e_sqlite_catalog.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_error_handling.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_merge_disposition.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_partition_builder.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_partitioning_e2e.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_pyiceberg_append.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_schema_casting.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_schema_converter.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_schema_evolution.py +0 -0
- {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_smoke.py +0 -0

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dlt-iceberg
-Version: 0.1.2
+Version: 0.1.4
 Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
 Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
 Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
@@ -47,9 +47,13 @@ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.ap
 ## Installation

 ```bash
-
-
-
+pip install dlt-iceberg
+```
+
+Or with uv:
+
+```bash
+uv add dlt-iceberg
 ```

 ## Quick Start
@@ -95,7 +99,38 @@ def generate_users():
 pipeline.run(generate_users())
 ```

-## Configuration
+## Configuration Options
+
+All configuration options can be passed to `iceberg_rest()`:
+
+```python
+iceberg_rest(
+    catalog_uri="...",          # Required: REST catalog URI
+    namespace="...",            # Required: Iceberg namespace (database)
+    warehouse="...",            # Optional: Warehouse location
+
+    # Authentication
+    credential="...",           # OAuth2 client credentials
+    oauth2_server_uri="...",    # OAuth2 token endpoint
+    token="...",                # Bearer token
+
+    # AWS SigV4
+    sigv4_enabled=True,
+    signing_region="us-east-1",
+
+    # S3 configuration
+    s3_endpoint="...",
+    s3_access_key_id="...",
+    s3_secret_access_key="...",
+    s3_region="...",
+
+    # Performance tuning
+    max_retries=5,              # Retry attempts for transient failures
+    retry_backoff_base=2.0,     # Exponential backoff multiplier
+    merge_batch_size=100000,    # Rows per batch for merge operations
+    strict_casting=False,       # Fail on potential data loss
+)
+```

 ### Nessie (Docker)

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/README.md

@@ -15,9 +15,13 @@ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.ap
 ## Installation

 ```bash
-
-
-
+pip install dlt-iceberg
+```
+
+Or with uv:
+
+```bash
+uv add dlt-iceberg
 ```

 ## Quick Start
@@ -63,7 +67,38 @@ def generate_users():
 pipeline.run(generate_users())
 ```

-## Configuration
+## Configuration Options
+
+All configuration options can be passed to `iceberg_rest()`:
+
+```python
+iceberg_rest(
+    catalog_uri="...",          # Required: REST catalog URI
+    namespace="...",            # Required: Iceberg namespace (database)
+    warehouse="...",            # Optional: Warehouse location
+
+    # Authentication
+    credential="...",           # OAuth2 client credentials
+    oauth2_server_uri="...",    # OAuth2 token endpoint
+    token="...",                # Bearer token
+
+    # AWS SigV4
+    sigv4_enabled=True,
+    signing_region="us-east-1",
+
+    # S3 configuration
+    s3_endpoint="...",
+    s3_access_key_id="...",
+    s3_secret_access_key="...",
+    s3_region="...",
+
+    # Performance tuning
+    max_retries=5,              # Retry attempts for transient failures
+    retry_backoff_base=2.0,     # Exponential backoff multiplier
+    merge_batch_size=100000,    # Rows per batch for merge operations
+    strict_casting=False,       # Fail on potential data loss
+)
+```

 ### Nessie (Docker)

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/README.md

@@ -2,6 +2,8 @@

 This directory contains example scripts demonstrating how to use dlt-iceberg with REST catalogs.

+All examples use [PEP 723](https://peps.python.org/pep-0723/) inline script metadata, allowing them to be run directly with `uv run` without installing dependencies separately.
+
 ## Prerequisites

 Start the Docker services (Nessie REST catalog and MinIO):
@@ -48,6 +50,32 @@ Demonstrates merging two CSV files with overlapping customer IDs using merge dis
 uv run examples/merge_load.py
 ```

+### USGS Earthquake Data (`usgs_earthquakes.py`)
+
+Demonstrates loading real-world GeoJSON data from the USGS Earthquake API from 2010 through current date.
+
+- Fetches earthquake data from USGS API (2010-present, ~190 months)
+- Loads data in monthly batches with automatic weekly splitting for high-volume months
+- Handles API's 20,000 result limit by automatically splitting large months into weekly chunks
+- Uses partitioning by month on timestamp column
+- Transforms GeoJSON features into flat records
+- Loads ~2.5 million earthquakes with complete metadata
+
+**Features demonstrated:**
+- Dynamic date range generation using `date.today()`
+- Automatic handling of API result limits with recursive splitting
+- Retry logic with exponential backoff
+- Rate limiting between API requests
+- Large-scale data ingestion (190+ API calls)
+
+**Run:**
+
+```bash
+uv run examples/usgs_earthquakes.py
+```
+
+**Note:** This script takes approximately 15-20 minutes to complete as it makes 190+ API calls with rate limiting.
+
 ## Data Files

 Sample CSV files are in the `data/` directory:
@@ -63,4 +91,6 @@ Sample CSV files are in the `data/` directory:
 - **Incremental loads**: Append new data to existing tables
 - **Merge/Upsert**: Update existing records and insert new ones based on primary key
 - **REST catalog**: All examples use Nessie REST catalog with MinIO storage
+- **Partitioning**: Partition tables by timestamp (month transform)
+- **API integration**: Fetch and transform data from external APIs
 - **Querying**: Direct PyIceberg queries to verify loaded data
dlt_iceberg-0.1.4/examples/usgs_earthquakes.py (new file)

@@ -0,0 +1,234 @@
+#!/usr/bin/env -S uv run
+# /// script
+# dependencies = [
+#     "dlt",
+#     "dlt-iceberg",
+#     "pyiceberg",
+#     "requests",
+#     "python-dateutil",
+# ]
+# ///
+"""
+USGS Earthquake Data Example
+
+Loads earthquake data from USGS GeoJSON API from 2010 through current date
+into an Iceberg table.
+"""
+
+import dlt
+import requests
+import time
+from datetime import datetime, date
+from dateutil.relativedelta import relativedelta
+from dlt_iceberg import iceberg_rest
+
+
+def fetch_earthquakes(start_date: str, end_date: str, split_on_error: bool = True):
+    """
+    Fetch earthquake data from USGS API for a date range.
+
+    If a 400 error occurs (likely due to >20k result limit), automatically
+    splits the range into smaller chunks and retries.
+
+    Args:
+        start_date: Start date in YYYY-MM-DD format
+        end_date: End date in YYYY-MM-DD format
+        split_on_error: If True, split the date range on 400 errors
+    """
+    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
+    params = {
+        "format": "geojson",
+        "starttime": start_date,
+        "endtime": end_date,
+    }
+
+    print(f"Fetching earthquakes from {start_date} to {end_date}...")
+
+    # Retry logic for rate limiting
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(url, params=params, timeout=30)
+            response.raise_for_status()
+            break
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 400 and split_on_error:
+                # Split the date range and retry with smaller chunks
+                print(f"  Too many results, splitting range into weekly chunks...")
+                from datetime import timedelta
+
+                start_dt = datetime.fromisoformat(start_date)
+                end_dt = datetime.fromisoformat(end_date)
+                current = start_dt
+
+                while current < end_dt:
+                    next_week = min(current + timedelta(days=7), end_dt)
+                    # Recursively fetch with split_on_error=False to avoid infinite recursion
+                    yield from fetch_earthquakes(
+                        current.date().isoformat(),
+                        next_week.date().isoformat(),
+                        split_on_error=False
+                    )
+                    current = next_week
+                return
+            elif e.response.status_code == 400:
+                print(f"  Warning: API returned 400 error for {start_date} to {end_date}, skipping...")
+                return
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt
+                print(f"  Request failed, retrying in {wait_time}s...")
+                time.sleep(wait_time)
+            else:
+                raise
+        except requests.exceptions.RequestException as e:
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt
+                print(f"  Request failed, retrying in {wait_time}s...")
+                time.sleep(wait_time)
+            else:
+                raise
+
+    data = response.json()
+    features = data.get("features", [])
+    print(f"Retrieved {len(features)} earthquakes")
+
+    # Rate limiting: sleep briefly between requests
+    time.sleep(0.5)
+
+    # Transform GeoJSON features into flat records
+    for feature in features:
+        props = feature["properties"]
+        geom = feature["geometry"]
+
+        yield {
+            "earthquake_id": feature["id"],
+            "magnitude": props.get("mag"),
+            "place": props.get("place"),
+            "time": datetime.fromtimestamp(props["time"] / 1000) if props.get("time") else None,
+            "updated": datetime.fromtimestamp(props["updated"] / 1000) if props.get("updated") else None,
+            "url": props.get("url"),
+            "detail": props.get("detail"),
+            "felt": props.get("felt"),
+            "cdi": props.get("cdi"),
+            "mmi": props.get("mmi"),
+            "alert": props.get("alert"),
+            "status": props.get("status"),
+            "tsunami": props.get("tsunami"),
+            "sig": props.get("sig"),
+            "net": props.get("net"),
+            "code": props.get("code"),
+            "ids": props.get("ids"),
+            "sources": props.get("sources"),
+            "types": props.get("types"),
+            "nst": props.get("nst"),
+            "dmin": props.get("dmin"),
+            "rms": props.get("rms"),
+            "gap": props.get("gap"),
+            "magType": props.get("magType"),
+            "type": props.get("type"),
+            "title": props.get("title"),
+            "longitude": geom["coordinates"][0] if geom and geom.get("coordinates") else None,
+            "latitude": geom["coordinates"][1] if geom and geom.get("coordinates") else None,
+            "depth": geom["coordinates"][2] if geom and geom.get("coordinates") and len(geom["coordinates"]) > 2 else None,
+        }
+
+
+def main():
+    # Create dlt pipeline with Nessie REST catalog
+    pipeline = dlt.pipeline(
+        pipeline_name="usgs_earthquakes",
+        destination=iceberg_rest(
+            catalog_uri="http://localhost:19120/iceberg/main",
+            namespace="examples",
+            s3_endpoint="http://localhost:9000",
+            s3_access_key_id="minioadmin",
+            s3_secret_access_key="minioadmin",
+            s3_region="us-east-1",
+        ),
+        dataset_name="usgs_data",
+    )
+
+    # Load earthquakes from 2010 through current date
+    # Breaking into monthly batches to avoid overwhelming the API
+    # Note: USGS endtime is exclusive, so we use the first day of next month
+
+    start_date = date(2010, 1, 1)
+    end_date = date.today()
+
+    date_ranges = []
+    current = start_date
+    while current <= end_date:
+        next_month = current + relativedelta(months=1)
+        date_ranges.append((current.isoformat(), next_month.isoformat()))
+        current = next_month
+
+    print(f"Loading {len(date_ranges)} months of earthquake data from {start_date} to {end_date}...")
+    print()
+
+    for i, (start, end) in enumerate(date_ranges, 1):
+        @dlt.resource(
+            name="earthquakes",
+            write_disposition="append",
+            columns={
+                "time": {
+                    "data_type": "timestamp",
+                    "x-partition": True,
+                    "x-partition-transform": "month",
+                }
+            }
+        )
+        def earthquakes_batch():
+            return fetch_earthquakes(start, end)
+
+        load_info = pipeline.run(earthquakes_batch())
+        print(f"[{i}/{len(date_ranges)}] Loaded {start} to {end}")
+        print()
+
+    # Query the table to verify
+    from pyiceberg.catalog import load_catalog
+
+    catalog = load_catalog(
+        "query",
+        type="rest",
+        uri="http://localhost:19120/iceberg/main",
+        **{
+            "s3.endpoint": "http://localhost:9000",
+            "s3.access-key-id": "minioadmin",
+            "s3.secret-access-key": "minioadmin",
+            "s3.region": "us-east-1",
+        },
+    )
+
+    table = catalog.load_table("examples.earthquakes")
+    result = table.scan().to_arrow()
+
+    print(f"\n{'='*60}")
+    print(f"Total earthquakes loaded: {len(result)}")
+
+    import pyarrow.compute as pc
+    print(f"Date range: {pc.min(result['time']).as_py()} to {pc.max(result['time']).as_py()}")
+    print(f"Magnitude range: {pc.min(result['magnitude']).as_py()} to {pc.max(result['magnitude']).as_py()}")
+
+    # Show some sample records
+    import pandas as pd
+    df = result.to_pandas()
+    print(f"\nSample earthquakes:")
+    print(df[["time", "magnitude", "place", "depth"]].head(10).to_string(index=False))
+
+    # Show distribution by month
+    df["month"] = pd.to_datetime(df["time"]).dt.to_period("M")
+    monthly_counts = df.groupby("month").size().sort_index()
+    print(f"\nEarthquakes by month:")
+    for month, count in monthly_counts.items():
+        print(f"  {month}: {count:,}")
+
+    # Show magnitude distribution
+    print(f"\nMagnitude distribution:")
+    print(df["magnitude"].describe())
+
+    print(f"\n{'='*60}")
+    print("USGS earthquake data load complete!")
+
+
+if __name__ == "__main__":
+    main()
{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/destination_client.py

@@ -34,7 +34,11 @@ from pyiceberg.exceptions import (
 from .schema_converter import convert_dlt_to_iceberg_schema
 from .partition_builder import build_partition_spec
 from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
-from .schema_casting import
+from .schema_casting import (
+    cast_table_safe,
+    CastingError,
+    ensure_iceberg_compatible_arrow_data,
+)
 from .error_handling import (
     is_retryable_error,
     log_error_with_context,
@@ -89,6 +93,9 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     # Schema casting configuration
     strict_casting: bool = False

+    # Merge batch size (for upsert operations to avoid memory issues)
+    merge_batch_size: int = 100000
+

 class IcebergRestLoadJob(RunnableLoadJob):
     """
@@ -380,7 +387,8 @@ class IcebergRestClient(JobClientBase):
         # Create table if needed
         if not table_exists:
             # Use first file's Arrow table to generate schema
-
+            # Apply Iceberg compatibility first so schema uses compatible types
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             iceberg_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -401,7 +409,7 @@ class IcebergRestClient(JobClientBase):
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
-            first_arrow_table = file_data[0][2]
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             incoming_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -415,12 +423,15 @@ class IcebergRestClient(JobClientBase):
                 logger.info(f"Schema evolved for table {identifier}")
                 iceberg_table = catalog.load_table(identifier)

-        #
+        # Get expected schema (already has Iceberg-compatible types from creation)
         expected_schema = schema_to_pyarrow(iceberg_table.schema())
+
+        # Combine all Arrow tables and cast to match Iceberg schema
         combined_tables = []

         for _, file_path, arrow_table in file_data:
-            # Cast
+            # Cast to match Iceberg schema
+            # (compatibility conversions already applied when schema was created)
             casted_table = cast_table_safe(
                 arrow_table,
                 expected_schema,
@@ -463,15 +474,34 @@ class IcebergRestClient(JobClientBase):
                iceberg_table.append(combined_table)
            else:
                logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-
-
-
-
-
-
+
+                # Batch upserts to avoid memory issues on large datasets
+                batch_size = self.config.merge_batch_size
+                total_updated = 0
+                total_inserted = 0
+
+                for batch_start in range(0, len(combined_table), batch_size):
+                    batch_end = min(batch_start + batch_size, len(combined_table))
+                    batch = combined_table.slice(batch_start, batch_end - batch_start)
+
+                    logger.info(
+                        f"Upserting batch {batch_start//batch_size + 1}: "
+                        f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                    )
+
+                    upsert_result = iceberg_table.upsert(
+                        df=batch,
+                        join_cols=primary_keys,
+                        when_matched_update_all=True,
+                        when_not_matched_insert_all=True,
+                    )
+
+                    total_updated += upsert_result.rows_updated
+                    total_inserted += upsert_result.rows_inserted
+
                logger.info(
-                    f"Upsert completed: {
-                    f"{
+                    f"Upsert completed: {total_updated} updated, "
+                    f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
                )
        else:
            raise ValueError(f"Unknown write disposition: {write_disposition}")
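For readers skimming the hunk above: the new merge path slices the combined Arrow table into fixed-size chunks and upserts one chunk at a time instead of passing everything to a single `upsert` call. The following is a minimal standalone sketch of that slicing pattern only, with made-up sample data and an illustrative batch size; it is not the package's actual code path.

```python
import pyarrow as pa

# Illustrative table and batch size (not taken from dlt-iceberg).
table = pa.table({"id": list(range(10)), "value": [i * 1.5 for i in range(10)]})
batch_size = 4

for batch_start in range(0, len(table), batch_size):
    # Table.slice(offset, length) is zero-copy; it just views a row range.
    batch = table.slice(batch_start, min(batch_size, len(table) - batch_start))
    print(f"batch starting at row {batch_start}: {len(batch)} rows")
    # dlt-iceberg feeds each such batch to pyiceberg's Table.upsert(...)
```

Because slicing is zero-copy, the gain is not in holding less data overall but in letting each upsert call match and rewrite only one batch's worth of rows at a time.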
{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_casting.py

@@ -6,12 +6,75 @@ and allow users to control casting behavior.
 """

 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Callable
 import pyarrow as pa

 logger = logging.getLogger(__name__)


+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+    """
+    Convert Arrow schema to Iceberg-compatible schema.
+
+    Converts types that Iceberg doesn't support:
+    - time32 → time64 (microseconds)
+    - decimal256 → string (Iceberg only supports decimal128)
+    - dictionary → value_type (unwrap dictionary encoding)
+
+    Args:
+        schema: PyArrow schema
+
+    Returns:
+        Iceberg-compatible PyArrow schema
+    """
+    def convert_field(field: pa.Field) -> pa.Field:
+        field_type = field.type
+
+        # time32 → time64(us)
+        if pa.types.is_time32(field_type):
+            return pa.field(field.name, pa.time64("us"), nullable=field.nullable)
+
+        # decimal256 → string (pyarrow doesn't allow downcasting to decimal128)
+        if pa.types.is_decimal256(field_type):
+            logger.warning(
+                f"Converting decimal256 field '{field.name}' to string "
+                f"(Iceberg doesn't support decimal256)"
+            )
+            return pa.field(field.name, pa.string(), nullable=field.nullable)
+
+        # dictionary → value_type (unwrap dictionary encoding)
+        if pa.types.is_dictionary(field_type):
+            return pa.field(field.name, field_type.value_type, nullable=field.nullable)
+
+        # list/struct types - recursively convert nested fields
+        if pa.types.is_list(field_type):
+            value_field = convert_field(pa.field("item", field_type.value_type))
+            return pa.field(field.name, pa.list_(value_field.type), nullable=field.nullable)
+
+        if pa.types.is_struct(field_type):
+            new_fields = [convert_field(f) for f in field_type]
+            return pa.field(field.name, pa.struct(new_fields), nullable=field.nullable)
+
+        return field
+
+    new_fields = [convert_field(field) for field in schema]
+    return pa.schema(new_fields)
+
+
+def ensure_iceberg_compatible_arrow_data(table: pa.Table) -> pa.Table:
+    """
+    Convert Arrow table to Iceberg-compatible schema and cast data.
+
+    Args:
+        table: PyArrow table
+
+    Returns:
+        Table with Iceberg-compatible schema
+    """
+    new_schema = ensure_iceberg_compatible_arrow_schema(table.schema)
+    return table.cast(new_schema)
+
+
 class CastingError(Exception):
     """Raised when a cast would result in data loss in strict mode."""
     pass
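A small usage sketch of the new compatibility helper added above; the column names and sample values are invented for illustration, and only `ensure_iceberg_compatible_arrow_data` comes from the diff itself.

```python
import pyarrow as pa
from dlt_iceberg.schema_casting import ensure_iceberg_compatible_arrow_data

# Hypothetical table with a time32 column and a dictionary-encoded column,
# two of the Arrow types the helper rewrites before handing data to pyiceberg.
table = pa.table({
    "event_time": pa.array([1_000, 2_000], type=pa.time32("ms")),
    "category": pa.array(["a", "b"]).dictionary_encode(),
})

converted = ensure_iceberg_compatible_arrow_data(table)
print(converted.schema)
# Per the conversion rules in the hunk above, event_time should come back as
# time64[us] and category as plain string.
```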