dlt-iceberg 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/PKG-INFO +40 -5
  2. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/README.md +39 -4
  3. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/README.md +30 -0
  4. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/incremental_load.py +8 -0
  5. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/merge_load.py +9 -0
  6. dlt_iceberg-0.1.4/examples/usgs_earthquakes.py +234 -0
  7. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/pyproject.toml +1 -1
  8. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/destination_client.py +43 -13
  9. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_casting.py +64 -1
  10. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/uv.lock +1 -1
  11. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.github/workflows/publish.yml +0 -0
  12. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.github/workflows/test.yml +0 -0
  13. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.gitignore +0 -0
  14. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/.python-version +0 -0
  15. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/LICENSE +0 -0
  16. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/TESTING.md +0 -0
  17. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/docker-compose.yml +0 -0
  18. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/customers_initial.csv +0 -0
  19. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/customers_updates.csv +0 -0
  20. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/events_batch1.csv +0 -0
  21. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/data/events_batch2.csv +0 -0
  22. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/__init__.py +0 -0
  23. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/destination.py +0 -0
  24. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/error_handling.py +0 -0
  25. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/partition_builder.py +0 -0
  26. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_converter.py +0 -0
  27. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_evolution.py +0 -0
  28. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_class_based_atomic.py +0 -0
  29. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_destination_e2e.py +0 -0
  30. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_destination_rest_catalog.py +0 -0
  31. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_e2e_sqlite_catalog.py +0 -0
  32. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_error_handling.py +0 -0
  33. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_merge_disposition.py +0 -0
  34. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_partition_builder.py +0 -0
  35. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_partitioning_e2e.py +0 -0
  36. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_pyiceberg_append.py +0 -0
  37. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_schema_casting.py +0 -0
  38. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_schema_converter.py +0 -0
  39. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_schema_evolution.py +0 -0
  40. {dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/tests/test_smoke.py +0 -0

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dlt-iceberg
-Version: 0.1.2
+Version: 0.1.4
 Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
 Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
 Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
@@ -47,9 +47,13 @@ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.ap
 ## Installation
 
 ```bash
-git clone https://github.com/sidequery/dlt-iceberg.git
-cd dlt-iceberg
-uv sync
+pip install dlt-iceberg
+```
+
+Or with uv:
+
+```bash
+uv add dlt-iceberg
 ```
 
 ## Quick Start
@@ -95,7 +99,38 @@ def generate_users():
 pipeline.run(generate_users())
 ```
 
-## Configuration
+## Configuration Options
+
+All configuration options can be passed to `iceberg_rest()`:
+
+```python
+iceberg_rest(
+    catalog_uri="...",          # Required: REST catalog URI
+    namespace="...",            # Required: Iceberg namespace (database)
+    warehouse="...",            # Optional: Warehouse location
+
+    # Authentication
+    credential="...",           # OAuth2 client credentials
+    oauth2_server_uri="...",    # OAuth2 token endpoint
+    token="...",                # Bearer token
+
+    # AWS SigV4
+    sigv4_enabled=True,
+    signing_region="us-east-1",
+
+    # S3 configuration
+    s3_endpoint="...",
+    s3_access_key_id="...",
+    s3_secret_access_key="...",
+    s3_region="...",
+
+    # Performance tuning
+    max_retries=5,              # Retry attempts for transient failures
+    retry_backoff_base=2.0,     # Exponential backoff multiplier
+    merge_batch_size=100000,    # Rows per batch for merge operations
+    strict_casting=False,       # Fail on potential data loss
+)
+```
 
 ### Nessie (Docker)
 

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/README.md

@@ -15,9 +15,13 @@ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.ap
 ## Installation
 
 ```bash
-git clone https://github.com/sidequery/dlt-iceberg.git
-cd dlt-iceberg
-uv sync
+pip install dlt-iceberg
+```
+
+Or with uv:
+
+```bash
+uv add dlt-iceberg
 ```
 
 ## Quick Start
@@ -63,7 +67,38 @@ def generate_users():
 pipeline.run(generate_users())
 ```
 
-## Configuration
+## Configuration Options
+
+All configuration options can be passed to `iceberg_rest()`:
+
+```python
+iceberg_rest(
+    catalog_uri="...",          # Required: REST catalog URI
+    namespace="...",            # Required: Iceberg namespace (database)
+    warehouse="...",            # Optional: Warehouse location
+
+    # Authentication
+    credential="...",           # OAuth2 client credentials
+    oauth2_server_uri="...",    # OAuth2 token endpoint
+    token="...",                # Bearer token
+
+    # AWS SigV4
+    sigv4_enabled=True,
+    signing_region="us-east-1",
+
+    # S3 configuration
+    s3_endpoint="...",
+    s3_access_key_id="...",
+    s3_secret_access_key="...",
+    s3_region="...",
+
+    # Performance tuning
+    max_retries=5,              # Retry attempts for transient failures
+    retry_backoff_base=2.0,     # Exponential backoff multiplier
+    merge_batch_size=100000,    # Rows per batch for merge operations
+    strict_casting=False,       # Fail on potential data loss
+)
+```
 
 ### Nessie (Docker)
 
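For orientation, the options above plug into a dlt pipeline the same way the bundled `usgs_earthquakes.py` example does. The sketch below is illustrative only: the resource, pipeline name, and dataset name are placeholders, and the local Nessie/MinIO endpoints and credentials come from the project's docker-compose setup rather than from the README's Quick Start.

```python
import dlt
from dlt_iceberg import iceberg_rest

# Hypothetical resource used only for illustration.
@dlt.resource(name="events", write_disposition="append")
def events():
    yield {"id": 1, "payload": "example"}

pipeline = dlt.pipeline(
    pipeline_name="example_pipeline",  # placeholder name
    destination=iceberg_rest(
        catalog_uri="http://localhost:19120/iceberg/main",  # Nessie REST catalog (docker-compose)
        namespace="examples",
        s3_endpoint="http://localhost:9000",                # MinIO
        s3_access_key_id="minioadmin",
        s3_secret_access_key="minioadmin",
        s3_region="us-east-1",
        merge_batch_size=100_000,  # rows per upsert batch (new in this release)
        strict_casting=False,      # do not fail on potentially lossy casts
    ),
    dataset_name="example_data",
)

print(pipeline.run(events()))
```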

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/README.md

@@ -2,6 +2,8 @@
 
 This directory contains example scripts demonstrating how to use dlt-iceberg with REST catalogs.
 
+All examples use [PEP 723](https://peps.python.org/pep-0723/) inline script metadata, allowing them to be run directly with `uv run` without installing dependencies separately.
+
 ## Prerequisites
 
 Start the Docker services (Nessie REST catalog and MinIO):
@@ -48,6 +50,32 @@ Demonstrates merging two CSV files with overlapping customer IDs using merge dis
 uv run examples/merge_load.py
 ```
 
+### USGS Earthquake Data (`usgs_earthquakes.py`)
+
+Demonstrates loading real-world GeoJSON data from the USGS Earthquake API, covering 2010 through the current date.
+
+- Fetches earthquake data from the USGS API (2010-present, ~190 months)
+- Loads data in monthly batches, splitting high-volume months into weekly chunks
+- Handles the API's 20,000-result limit by automatically splitting large months
+- Uses partitioning by month on the timestamp column
+- Transforms GeoJSON features into flat records
+- Loads ~2.5 million earthquakes with complete metadata
+
+**Features demonstrated:**
+- Dynamic date range generation using `date.today()`
+- Automatic handling of API result limits with recursive splitting
+- Retry logic with exponential backoff
+- Rate limiting between API requests
+- Large-scale data ingestion (190+ API calls)
+
+**Run:**
+
+```bash
+uv run examples/usgs_earthquakes.py
+```
+
+**Note:** This script takes approximately 15-20 minutes to complete because it makes 190+ API calls with rate limiting.
+
 ## Data Files
 
 Sample CSV files are in the `data/` directory:
@@ -63,4 +91,6 @@ Sample CSV files are in the `data/` directory:
 - **Incremental loads**: Append new data to existing tables
 - **Merge/Upsert**: Update existing records and insert new ones based on primary key
 - **REST catalog**: All examples use Nessie REST catalog with MinIO storage
+- **Partitioning**: Partition tables by timestamp (month transform)
+- **API integration**: Fetch and transform data from external APIs
 - **Querying**: Direct PyIceberg queries to verify loaded data

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/incremental_load.py

@@ -1,3 +1,11 @@
+#!/usr/bin/env -S uv run
+# /// script
+# dependencies = [
+#     "dlt",
+#     "dlt-iceberg",
+#     "pyiceberg",
+# ]
+# ///
 """
 Incremental Load Example
 

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/examples/merge_load.py

@@ -1,3 +1,12 @@
+#!/usr/bin/env -S uv run
+# /// script
+# dependencies = [
+#     "dlt",
+#     "dlt-iceberg",
+#     "pyiceberg",
+#     "pandas",
+# ]
+# ///
 """
 Merge Load Example
 

dlt_iceberg-0.1.4/examples/usgs_earthquakes.py

@@ -0,0 +1,234 @@
+#!/usr/bin/env -S uv run
+# /// script
+# dependencies = [
+#     "dlt",
+#     "dlt-iceberg",
+#     "pyiceberg",
+#     "requests",
+#     "python-dateutil",
+# ]
+# ///
+"""
+USGS Earthquake Data Example
+
+Loads earthquake data from USGS GeoJSON API from 2010 through current date
+into an Iceberg table.
+"""
+
+import dlt
+import requests
+import time
+from datetime import datetime, date
+from dateutil.relativedelta import relativedelta
+from dlt_iceberg import iceberg_rest
+
+
+def fetch_earthquakes(start_date: str, end_date: str, split_on_error: bool = True):
+    """
+    Fetch earthquake data from USGS API for a date range.
+
+    If a 400 error occurs (likely due to >20k result limit), automatically
+    splits the range into smaller chunks and retries.
+
+    Args:
+        start_date: Start date in YYYY-MM-DD format
+        end_date: End date in YYYY-MM-DD format
+        split_on_error: If True, split the date range on 400 errors
+    """
+    url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
+    params = {
+        "format": "geojson",
+        "starttime": start_date,
+        "endtime": end_date,
+    }
+
+    print(f"Fetching earthquakes from {start_date} to {end_date}...")
+
+    # Retry logic for rate limiting
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(url, params=params, timeout=30)
+            response.raise_for_status()
+            break
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 400 and split_on_error:
+                # Split the date range and retry with smaller chunks
+                print(f"  Too many results, splitting range into weekly chunks...")
+                from datetime import timedelta
+
+                start_dt = datetime.fromisoformat(start_date)
+                end_dt = datetime.fromisoformat(end_date)
+                current = start_dt
+
+                while current < end_dt:
+                    next_week = min(current + timedelta(days=7), end_dt)
+                    # Recursively fetch with split_on_error=False to avoid infinite recursion
+                    yield from fetch_earthquakes(
+                        current.date().isoformat(),
+                        next_week.date().isoformat(),
+                        split_on_error=False
+                    )
+                    current = next_week
+                return
+            elif e.response.status_code == 400:
+                print(f"  Warning: API returned 400 error for {start_date} to {end_date}, skipping...")
+                return
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt
+                print(f"  Request failed, retrying in {wait_time}s...")
+                time.sleep(wait_time)
+            else:
+                raise
+        except requests.exceptions.RequestException as e:
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt
+                print(f"  Request failed, retrying in {wait_time}s...")
+                time.sleep(wait_time)
+            else:
+                raise
+
+    data = response.json()
+    features = data.get("features", [])
+    print(f"Retrieved {len(features)} earthquakes")
+
+    # Rate limiting: sleep briefly between requests
+    time.sleep(0.5)
+
+    # Transform GeoJSON features into flat records
+    for feature in features:
+        props = feature["properties"]
+        geom = feature["geometry"]
+
+        yield {
+            "earthquake_id": feature["id"],
+            "magnitude": props.get("mag"),
+            "place": props.get("place"),
+            "time": datetime.fromtimestamp(props["time"] / 1000) if props.get("time") else None,
+            "updated": datetime.fromtimestamp(props["updated"] / 1000) if props.get("updated") else None,
+            "url": props.get("url"),
+            "detail": props.get("detail"),
+            "felt": props.get("felt"),
+            "cdi": props.get("cdi"),
+            "mmi": props.get("mmi"),
+            "alert": props.get("alert"),
+            "status": props.get("status"),
+            "tsunami": props.get("tsunami"),
+            "sig": props.get("sig"),
+            "net": props.get("net"),
+            "code": props.get("code"),
+            "ids": props.get("ids"),
+            "sources": props.get("sources"),
+            "types": props.get("types"),
+            "nst": props.get("nst"),
+            "dmin": props.get("dmin"),
+            "rms": props.get("rms"),
+            "gap": props.get("gap"),
+            "magType": props.get("magType"),
+            "type": props.get("type"),
+            "title": props.get("title"),
+            "longitude": geom["coordinates"][0] if geom and geom.get("coordinates") else None,
+            "latitude": geom["coordinates"][1] if geom and geom.get("coordinates") else None,
+            "depth": geom["coordinates"][2] if geom and geom.get("coordinates") and len(geom["coordinates"]) > 2 else None,
+        }
+
+
+def main():
+    # Create dlt pipeline with Nessie REST catalog
+    pipeline = dlt.pipeline(
+        pipeline_name="usgs_earthquakes",
+        destination=iceberg_rest(
+            catalog_uri="http://localhost:19120/iceberg/main",
+            namespace="examples",
+            s3_endpoint="http://localhost:9000",
+            s3_access_key_id="minioadmin",
+            s3_secret_access_key="minioadmin",
+            s3_region="us-east-1",
+        ),
+        dataset_name="usgs_data",
+    )
+
+    # Load earthquakes from 2010 through current date
+    # Breaking into monthly batches to avoid overwhelming the API
+    # Note: USGS endtime is exclusive, so we use the first day of next month
+
+    start_date = date(2010, 1, 1)
+    end_date = date.today()
+
+    date_ranges = []
+    current = start_date
+    while current <= end_date:
+        next_month = current + relativedelta(months=1)
+        date_ranges.append((current.isoformat(), next_month.isoformat()))
+        current = next_month
+
+    print(f"Loading {len(date_ranges)} months of earthquake data from {start_date} to {end_date}...")
+    print()
+
+    for i, (start, end) in enumerate(date_ranges, 1):
+        @dlt.resource(
+            name="earthquakes",
+            write_disposition="append",
+            columns={
+                "time": {
+                    "data_type": "timestamp",
+                    "x-partition": True,
+                    "x-partition-transform": "month",
+                }
+            }
+        )
+        def earthquakes_batch():
+            return fetch_earthquakes(start, end)
+
+        load_info = pipeline.run(earthquakes_batch())
+        print(f"[{i}/{len(date_ranges)}] Loaded {start} to {end}")
+        print()
+
+    # Query the table to verify
+    from pyiceberg.catalog import load_catalog
+
+    catalog = load_catalog(
+        "query",
+        type="rest",
+        uri="http://localhost:19120/iceberg/main",
+        **{
+            "s3.endpoint": "http://localhost:9000",
+            "s3.access-key-id": "minioadmin",
+            "s3.secret-access-key": "minioadmin",
+            "s3.region": "us-east-1",
+        },
+    )
+
+    table = catalog.load_table("examples.earthquakes")
+    result = table.scan().to_arrow()
+
+    print(f"\n{'='*60}")
+    print(f"Total earthquakes loaded: {len(result)}")
+
+    import pyarrow.compute as pc
+    print(f"Date range: {pc.min(result['time']).as_py()} to {pc.max(result['time']).as_py()}")
+    print(f"Magnitude range: {pc.min(result['magnitude']).as_py()} to {pc.max(result['magnitude']).as_py()}")
+
+    # Show some sample records
+    import pandas as pd
+    df = result.to_pandas()
+    print(f"\nSample earthquakes:")
+    print(df[["time", "magnitude", "place", "depth"]].head(10).to_string(index=False))
+
+    # Show distribution by month
+    df["month"] = pd.to_datetime(df["time"]).dt.to_period("M")
+    monthly_counts = df.groupby("month").size().sort_index()
+    print(f"\nEarthquakes by month:")
+    for month, count in monthly_counts.items():
+        print(f"  {month}: {count:,}")
+
+    # Show magnitude distribution
+    print(f"\nMagnitude distribution:")
+    print(df["magnitude"].describe())
+
+    print(f"\n{'='*60}")
+    print("USGS earthquake data load complete!")
+
+
+if __name__ == "__main__":
+    main()

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dlt-iceberg"
-version = "0.1.2"
+version = "0.1.4"
 description = "dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs"
 readme = "README.md"
 requires-python = ">=3.11"

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/destination_client.py

@@ -34,7 +34,11 @@ from pyiceberg.exceptions import (
 from .schema_converter import convert_dlt_to_iceberg_schema
 from .partition_builder import build_partition_spec
 from .schema_evolution import evolve_schema_if_needed, SchemaEvolutionError
-from .schema_casting import cast_table_safe, CastingError
+from .schema_casting import (
+    cast_table_safe,
+    CastingError,
+    ensure_iceberg_compatible_arrow_data,
+)
 from .error_handling import (
     is_retryable_error,
     log_error_with_context,
@@ -89,6 +93,9 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
     # Schema casting configuration
     strict_casting: bool = False
 
+    # Merge batch size (for upsert operations to avoid memory issues)
+    merge_batch_size: int = 100000
+
 
 class IcebergRestLoadJob(RunnableLoadJob):
     """
@@ -380,7 +387,8 @@ class IcebergRestClient(JobClientBase):
         # Create table if needed
         if not table_exists:
             # Use first file's Arrow table to generate schema
-            first_arrow_table = file_data[0][2]
+            # Apply Iceberg compatibility first so schema uses compatible types
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             iceberg_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -401,7 +409,7 @@ class IcebergRestClient(JobClientBase):
             logger.info(f"Created table {identifier} at {iceberg_table.location()}")
         else:
             # Table exists - check if schema evolution is needed
-            first_arrow_table = file_data[0][2]
+            first_arrow_table = ensure_iceberg_compatible_arrow_data(file_data[0][2])
             incoming_schema = convert_dlt_to_iceberg_schema(
                 table_schema, first_arrow_table
             )
@@ -415,12 +423,15 @@ class IcebergRestClient(JobClientBase):
                 logger.info(f"Schema evolved for table {identifier}")
                 iceberg_table = catalog.load_table(identifier)
 
-        # Combine all Arrow tables and cast to match Iceberg schema
+        # Get expected schema (already has Iceberg-compatible types from creation)
         expected_schema = schema_to_pyarrow(iceberg_table.schema())
+
+        # Combine all Arrow tables and cast to match Iceberg schema
        combined_tables = []
 
         for _, file_path, arrow_table in file_data:
-            # Cast each table to match Iceberg schema
+            # Cast to match Iceberg schema
+            # (compatibility conversions already applied when schema was created)
             casted_table = cast_table_safe(
                 arrow_table,
                 expected_schema,
@@ -463,15 +474,34 @@ class IcebergRestClient(JobClientBase):
                 iceberg_table.append(combined_table)
             else:
                 logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-                upsert_result = iceberg_table.upsert(
-                    df=combined_table,
-                    join_cols=primary_keys,
-                    when_matched_update_all=True,
-                    when_not_matched_insert_all=True,
-                )
+
+                # Batch upserts to avoid memory issues on large datasets
+                batch_size = self.config.merge_batch_size
+                total_updated = 0
+                total_inserted = 0
+
+                for batch_start in range(0, len(combined_table), batch_size):
+                    batch_end = min(batch_start + batch_size, len(combined_table))
+                    batch = combined_table.slice(batch_start, batch_end - batch_start)
+
+                    logger.info(
+                        f"Upserting batch {batch_start//batch_size + 1}: "
+                        f"rows {batch_start} to {batch_end} ({len(batch)} rows)"
+                    )
+
+                    upsert_result = iceberg_table.upsert(
+                        df=batch,
+                        join_cols=primary_keys,
+                        when_matched_update_all=True,
+                        when_not_matched_insert_all=True,
+                    )
+
+                    total_updated += upsert_result.rows_updated
+                    total_inserted += upsert_result.rows_inserted
+
                 logger.info(
-                    f"Upsert completed: {upsert_result.rows_updated} updated, "
-                    f"{upsert_result.rows_inserted} inserted"
+                    f"Upsert completed: {total_updated} updated, "
+                    f"{total_inserted} inserted across {(total_rows + batch_size - 1) // batch_size} batches"
                 )
         else:
             raise ValueError(f"Unknown write disposition: {write_disposition}")
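The merge path above now slices the combined Arrow table into `merge_batch_size` chunks and upserts them one at a time. Below is a minimal standalone sketch of that slicing pattern; the table and batch size are made up for illustration, and in the real client they come from the load job and the destination configuration.

```python
import pyarrow as pa

# Stand-ins for the client's combined_table and config.merge_batch_size.
combined_table = pa.table({"id": list(range(250_000)), "value": [0.0] * 250_000})
batch_size = 100_000

for batch_start in range(0, len(combined_table), batch_size):
    batch_end = min(batch_start + batch_size, len(combined_table))
    # Table.slice(offset, length) is zero-copy, so batching adds no copy overhead.
    batch = combined_table.slice(batch_start, batch_end - batch_start)
    print(f"batch {batch_start // batch_size + 1}: rows {batch_start}-{batch_end} ({len(batch)} rows)")
    # In the destination client, this is where iceberg_table.upsert(df=batch, join_cols=primary_keys, ...) runs.
```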

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/src/dlt_iceberg/schema_casting.py

@@ -6,12 +6,75 @@ and allow users to control casting behavior.
 """
 
 import logging
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict, Callable
 import pyarrow as pa
 
 logger = logging.getLogger(__name__)
 
 
+def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
+    """
+    Convert Arrow schema to Iceberg-compatible schema.
+
+    Converts types that Iceberg doesn't support:
+    - time32 → time64 (microseconds)
+    - decimal256 → string (Iceberg only supports decimal128)
+    - dictionary → value_type (unwrap dictionary encoding)
+
+    Args:
+        schema: PyArrow schema
+
+    Returns:
+        Iceberg-compatible PyArrow schema
+    """
+    def convert_field(field: pa.Field) -> pa.Field:
+        field_type = field.type
+
+        # time32 → time64(us)
+        if pa.types.is_time32(field_type):
+            return pa.field(field.name, pa.time64("us"), nullable=field.nullable)
+
+        # decimal256 → string (pyarrow doesn't allow downcasting to decimal128)
+        if pa.types.is_decimal256(field_type):
+            logger.warning(
+                f"Converting decimal256 field '{field.name}' to string "
+                f"(Iceberg doesn't support decimal256)"
+            )
+            return pa.field(field.name, pa.string(), nullable=field.nullable)
+
+        # dictionary → value_type (unwrap dictionary encoding)
+        if pa.types.is_dictionary(field_type):
+            return pa.field(field.name, field_type.value_type, nullable=field.nullable)
+
+        # list/struct types - recursively convert nested fields
+        if pa.types.is_list(field_type):
+            value_field = convert_field(pa.field("item", field_type.value_type))
+            return pa.field(field.name, pa.list_(value_field.type), nullable=field.nullable)
+
+        if pa.types.is_struct(field_type):
+            new_fields = [convert_field(f) for f in field_type]
+            return pa.field(field.name, pa.struct(new_fields), nullable=field.nullable)
+
+        return field
+
+    new_fields = [convert_field(field) for field in schema]
+    return pa.schema(new_fields)
+
+
+def ensure_iceberg_compatible_arrow_data(table: pa.Table) -> pa.Table:
+    """
+    Convert Arrow table to Iceberg-compatible schema and cast data.
+
+    Args:
+        table: PyArrow table
+
+    Returns:
+        Table with Iceberg-compatible schema
+    """
+    new_schema = ensure_iceberg_compatible_arrow_schema(table.schema)
+    return table.cast(new_schema)
+
+
 class CastingError(Exception):
     """Raised when a cast would result in data loss in strict mode."""
     pass
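A quick way to see the new helpers in action is to build a table with types Iceberg rejects and run it through the converter. The snippet below is a sketch under the assumption that the helper is importable from `dlt_iceberg.schema_casting` as shown in the hunk above; the column names and values are arbitrary.

```python
import datetime
import pyarrow as pa
from dlt_iceberg.schema_casting import ensure_iceberg_compatible_arrow_data

table = pa.table({
    # time32 is not supported by Iceberg; the helper widens it to time64[us]
    "event_time": pa.array([datetime.time(9, 30), datetime.time(17, 0)], type=pa.time32("s")),
    # dictionary encoding is unwrapped to the underlying value type (string here)
    "category": pa.array(["quake", "aftershock"]).dictionary_encode(),
})

converted = ensure_iceberg_compatible_arrow_data(table)
print(converted.schema)
# expected per the docstring above: event_time: time64[us], category: string
```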

{dlt_iceberg-0.1.2 → dlt_iceberg-0.1.4}/uv.lock

@@ -182,7 +182,7 @@ wheels = [
 
 [[package]]
 name = "dlt-iceberg"
-version = "0.1.1"
+version = "0.1.3"
 source = { editable = "." }
 dependencies = [
     { name = "boto3" },