dlt-iceberg 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dlt-iceberg might be problematic.

Files changed (42)
  1. dlt_iceberg-0.1.5/.claude/settings.local.json +9 -0
  2. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/PKG-INFO +40 -5
  3. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/README.md +39 -4
  4. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/README.md +30 -0
  5. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/incremental_load.py +8 -0
  6. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/merge_load.py +9 -0
  7. dlt_iceberg-0.1.5/examples/usgs_earthquakes.py +234 -0
  8. dlt_iceberg-0.1.5/examples/usgs_load.log +88 -0
  9. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/pyproject.toml +1 -1
  10. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/destination_client.py +1 -1
  11. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/uv.lock +1 -1
  12. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/.github/workflows/publish.yml +0 -0
  13. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/.github/workflows/test.yml +0 -0
  14. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/.gitignore +0 -0
  15. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/.python-version +0 -0
  16. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/LICENSE +0 -0
  17. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/TESTING.md +0 -0
  18. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/docker-compose.yml +0 -0
  19. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/data/customers_initial.csv +0 -0
  20. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/data/customers_updates.csv +0 -0
  21. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/data/events_batch1.csv +0 -0
  22. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/data/events_batch2.csv +0 -0
  23. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/__init__.py +0 -0
  24. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/destination.py +0 -0
  25. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/error_handling.py +0 -0
  26. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/partition_builder.py +0 -0
  27. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/schema_casting.py +0 -0
  28. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/schema_converter.py +0 -0
  29. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/schema_evolution.py +0 -0
  30. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_class_based_atomic.py +0 -0
  31. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_destination_e2e.py +0 -0
  32. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_destination_rest_catalog.py +0 -0
  33. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_e2e_sqlite_catalog.py +0 -0
  34. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_error_handling.py +0 -0
  35. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_merge_disposition.py +0 -0
  36. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_partition_builder.py +0 -0
  37. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_partitioning_e2e.py +0 -0
  38. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_pyiceberg_append.py +0 -0
  39. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_schema_casting.py +0 -0
  40. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_schema_converter.py +0 -0
  41. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_schema_evolution.py +0 -0
  42. {dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/tests/test_smoke.py +0 -0
dlt_iceberg-0.1.5/.claude/settings.local.json

@@ -0,0 +1,9 @@
+ {
+   "permissions": {
+     "allow": [
+       "Bash(docker compose:*)"
+     ],
+     "deny": [],
+     "ask": []
+   }
+ }

{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dlt-iceberg
- Version: 0.1.3
+ Version: 0.1.5
  Summary: dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs
  Project-URL: Homepage, https://github.com/sidequery/dlt-iceberg
  Project-URL: Repository, https://github.com/sidequery/dlt-iceberg
@@ -47,9 +47,13 @@ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.ap
  ## Installation

  ```bash
- git clone https://github.com/sidequery/dlt-iceberg.git
- cd dlt-iceberg
- uv sync
+ pip install dlt-iceberg
+ ```
+
+ Or with uv:
+
+ ```bash
+ uv add dlt-iceberg
  ```

  ## Quick Start
@@ -95,7 +99,38 @@ def generate_users():
  pipeline.run(generate_users())
  ```

- ## Configuration
+ ## Configuration Options
+
+ All configuration options can be passed to `iceberg_rest()`:
+
+ ```python
+ iceberg_rest(
+     catalog_uri="...",        # Required: REST catalog URI
+     namespace="...",          # Required: Iceberg namespace (database)
+     warehouse="...",          # Optional: Warehouse location
+
+     # Authentication
+     credential="...",         # OAuth2 client credentials
+     oauth2_server_uri="...",  # OAuth2 token endpoint
+     token="...",              # Bearer token
+
+     # AWS SigV4
+     sigv4_enabled=True,
+     signing_region="us-east-1",
+
+     # S3 configuration
+     s3_endpoint="...",
+     s3_access_key_id="...",
+     s3_secret_access_key="...",
+     s3_region="...",
+
+     # Performance tuning
+     max_retries=5,            # Retry attempts for transient failures
+     retry_backoff_base=2.0,   # Exponential backoff multiplier
+     merge_batch_size=500000,  # Rows per batch for merge operations
+     strict_casting=False,     # Fail on potential data loss
+ )
+ ```

  ### Nessie (Docker)

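For orientation while reading the diff: the configuration surface above is what `iceberg_rest()` accepts, and it is wired into a pipeline following the Quick Start pattern and the `usgs_earthquakes.py` example added later in this release. A minimal sketch, using the local Nessie/MinIO endpoints from the project's docker-compose setup; the pipeline and resource names are illustrative, not taken from the README:

```python
import dlt
from dlt_iceberg import iceberg_rest

# Destination configured against the local Nessie REST catalog and MinIO,
# mirroring the endpoint/credential values used by the bundled examples.
pipeline = dlt.pipeline(
    pipeline_name="quickstart",
    destination=iceberg_rest(
        catalog_uri="http://localhost:19120/iceberg/main",
        namespace="examples",
        s3_endpoint="http://localhost:9000",
        s3_access_key_id="minioadmin",
        s3_secret_access_key="minioadmin",
        s3_region="us-east-1",
    ),
    dataset_name="quickstart_data",
)

# A simple append-only resource; each run adds new rows to the Iceberg table.
@dlt.resource(name="users", write_disposition="append")
def users():
    yield {"id": 1, "name": "alice"}
    yield {"id": 2, "name": "bob"}

pipeline.run(users())
```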
{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/README.md

@@ -15,9 +15,13 @@ A [dlt](https://dlthub.com/) destination for [Apache Iceberg](https://iceberg.ap
  ## Installation

  ```bash
- git clone https://github.com/sidequery/dlt-iceberg.git
- cd dlt-iceberg
- uv sync
+ pip install dlt-iceberg
+ ```
+
+ Or with uv:
+
+ ```bash
+ uv add dlt-iceberg
  ```

  ## Quick Start
@@ -63,7 +67,38 @@ def generate_users():
  pipeline.run(generate_users())
  ```

- ## Configuration
+ ## Configuration Options
+
+ All configuration options can be passed to `iceberg_rest()`:
+
+ ```python
+ iceberg_rest(
+     catalog_uri="...",        # Required: REST catalog URI
+     namespace="...",          # Required: Iceberg namespace (database)
+     warehouse="...",          # Optional: Warehouse location
+
+     # Authentication
+     credential="...",         # OAuth2 client credentials
+     oauth2_server_uri="...",  # OAuth2 token endpoint
+     token="...",              # Bearer token
+
+     # AWS SigV4
+     sigv4_enabled=True,
+     signing_region="us-east-1",
+
+     # S3 configuration
+     s3_endpoint="...",
+     s3_access_key_id="...",
+     s3_secret_access_key="...",
+     s3_region="...",
+
+     # Performance tuning
+     max_retries=5,            # Retry attempts for transient failures
+     retry_backoff_base=2.0,   # Exponential backoff multiplier
+     merge_batch_size=500000,  # Rows per batch for merge operations
+     strict_casting=False,     # Fail on potential data loss
+ )
+ ```

  ### Nessie (Docker)

{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/README.md

@@ -2,6 +2,8 @@

  This directory contains example scripts demonstrating how to use dlt-iceberg with REST catalogs.

+ All examples use [PEP 723](https://peps.python.org/pep-0723/) inline script metadata, allowing them to be run directly with `uv run` without installing dependencies separately.
+
  ## Prerequisites

  Start the Docker services (Nessie REST catalog and MinIO):
@@ -48,6 +50,32 @@ Demonstrates merging two CSV files with overlapping customer IDs using merge dis
  uv run examples/merge_load.py
  ```

+ ### USGS Earthquake Data (`usgs_earthquakes.py`)
+
+ Demonstrates loading real-world GeoJSON data from the USGS Earthquake API from 2010 through current date.
+
+ - Fetches earthquake data from USGS API (2010-present, ~190 months)
+ - Loads data in monthly batches with automatic weekly splitting for high-volume months
+ - Handles API's 20,000 result limit by automatically splitting large months into weekly chunks
+ - Uses partitioning by month on timestamp column
+ - Transforms GeoJSON features into flat records
+ - Loads ~2.5 million earthquakes with complete metadata
+
+ **Features demonstrated:**
+ - Dynamic date range generation using `date.today()`
+ - Automatic handling of API result limits with recursive splitting
+ - Retry logic with exponential backoff
+ - Rate limiting between API requests
+ - Large-scale data ingestion (190+ API calls)
+
+ **Run:**
+
+ ```bash
+ uv run examples/usgs_earthquakes.py
+ ```
+
+ **Note:** This script takes approximately 15-20 minutes to complete as it makes 190+ API calls with rate limiting.
+
  ## Data Files

  Sample CSV files are in the `data/` directory:
@@ -63,4 +91,6 @@ Sample CSV files are in the `data/` directory:
  - **Incremental loads**: Append new data to existing tables
  - **Merge/Upsert**: Update existing records and insert new ones based on primary key
  - **REST catalog**: All examples use Nessie REST catalog with MinIO storage
+ - **Partitioning**: Partition tables by timestamp (month transform)
+ - **API integration**: Fetch and transform data from external APIs
  - **Querying**: Direct PyIceberg queries to verify loaded data
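The new **Partitioning** bullet above refers to the column hints used by `usgs_earthquakes.py` further down in this diff. As a minimal sketch (hint keys copied from that example; the resource name and data are illustrative), a resource declares a month-partitioned timestamp column like this:

```python
import dlt

# "x-partition" / "x-partition-transform" are the dlt-iceberg column hints
# used by the USGS example to partition the Iceberg table by month("time").
@dlt.resource(
    name="events",
    write_disposition="append",
    columns={
        "time": {
            "data_type": "timestamp",
            "x-partition": True,
            "x-partition-transform": "month",
        }
    },
)
def events():
    yield {"time": "2024-01-01T00:00:00Z", "value": 1}
```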
{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/incremental_load.py

@@ -1,3 +1,11 @@
+ #!/usr/bin/env -S uv run
+ # /// script
+ # dependencies = [
+ #     "dlt",
+ #     "dlt-iceberg",
+ #     "pyiceberg",
+ # ]
+ # ///
  """
  Incremental Load Example

{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/examples/merge_load.py

@@ -1,3 +1,12 @@
+ #!/usr/bin/env -S uv run
+ # /// script
+ # dependencies = [
+ #     "dlt",
+ #     "dlt-iceberg",
+ #     "pyiceberg",
+ #     "pandas",
+ # ]
+ # ///
  """
  Merge Load Example

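`merge_load.py` only gains the PEP 723 header in this release; its body is unchanged. For context, the merge/upsert disposition it demonstrates (per the examples README: update existing records and insert new ones based on primary key) is expressed in dlt roughly as the sketch below. Resource and key names are illustrative, not taken from the script:

```python
import dlt

# Upsert semantics: rows whose customer_id already exists are updated,
# new customer_ids are inserted; primary_key drives the match.
@dlt.resource(
    name="customers",
    write_disposition="merge",
    primary_key="customer_id",
)
def customers():
    yield {"customer_id": 1, "email": "a@example.com"}
    yield {"customer_id": 2, "email": "b@example.com"}
```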
dlt_iceberg-0.1.5/examples/usgs_earthquakes.py

@@ -0,0 +1,234 @@
+ #!/usr/bin/env -S uv run
+ # /// script
+ # dependencies = [
+ #     "dlt",
+ #     "dlt-iceberg",
+ #     "pyiceberg",
+ #     "requests",
+ #     "python-dateutil",
+ # ]
+ # ///
+ """
+ USGS Earthquake Data Example
+
+ Loads earthquake data from USGS GeoJSON API from 2010 through current date
+ into an Iceberg table.
+ """
+
+ import dlt
+ import requests
+ import time
+ from datetime import datetime, date
+ from dateutil.relativedelta import relativedelta
+ from dlt_iceberg import iceberg_rest
+
+
+ def fetch_earthquakes(start_date: str, end_date: str, split_on_error: bool = True):
+     """
+     Fetch earthquake data from USGS API for a date range.
+
+     If a 400 error occurs (likely due to >20k result limit), automatically
+     splits the range into smaller chunks and retries.
+
+     Args:
+         start_date: Start date in YYYY-MM-DD format
+         end_date: End date in YYYY-MM-DD format
+         split_on_error: If True, split the date range on 400 errors
+     """
+     url = "https://earthquake.usgs.gov/fdsnws/event/1/query"
+     params = {
+         "format": "geojson",
+         "starttime": start_date,
+         "endtime": end_date,
+     }
+
+     print(f"Fetching earthquakes from {start_date} to {end_date}...")
+
+     # Retry logic for rate limiting
+     max_retries = 3
+     for attempt in range(max_retries):
+         try:
+             response = requests.get(url, params=params, timeout=30)
+             response.raise_for_status()
+             break
+         except requests.exceptions.HTTPError as e:
+             if e.response.status_code == 400 and split_on_error:
+                 # Split the date range and retry with smaller chunks
+                 print(f" Too many results, splitting range into weekly chunks...")
+                 from datetime import timedelta
+
+                 start_dt = datetime.fromisoformat(start_date)
+                 end_dt = datetime.fromisoformat(end_date)
+                 current = start_dt
+
+                 while current < end_dt:
+                     next_week = min(current + timedelta(days=7), end_dt)
+                     # Recursively fetch with split_on_error=False to avoid infinite recursion
+                     yield from fetch_earthquakes(
+                         current.date().isoformat(),
+                         next_week.date().isoformat(),
+                         split_on_error=False
+                     )
+                     current = next_week
+                 return
+             elif e.response.status_code == 400:
+                 print(f" Warning: API returned 400 error for {start_date} to {end_date}, skipping...")
+                 return
+             if attempt < max_retries - 1:
+                 wait_time = 2 ** attempt
+                 print(f" Request failed, retrying in {wait_time}s...")
+                 time.sleep(wait_time)
+             else:
+                 raise
+         except requests.exceptions.RequestException as e:
+             if attempt < max_retries - 1:
+                 wait_time = 2 ** attempt
+                 print(f" Request failed, retrying in {wait_time}s...")
+                 time.sleep(wait_time)
+             else:
+                 raise
+
+     data = response.json()
+     features = data.get("features", [])
+     print(f"Retrieved {len(features)} earthquakes")
+
+     # Rate limiting: sleep briefly between requests
+     time.sleep(0.5)
+
+     # Transform GeoJSON features into flat records
+     for feature in features:
+         props = feature["properties"]
+         geom = feature["geometry"]
+
+         yield {
+             "earthquake_id": feature["id"],
+             "magnitude": props.get("mag"),
+             "place": props.get("place"),
+             "time": datetime.fromtimestamp(props["time"] / 1000) if props.get("time") else None,
+             "updated": datetime.fromtimestamp(props["updated"] / 1000) if props.get("updated") else None,
+             "url": props.get("url"),
+             "detail": props.get("detail"),
+             "felt": props.get("felt"),
+             "cdi": props.get("cdi"),
+             "mmi": props.get("mmi"),
+             "alert": props.get("alert"),
+             "status": props.get("status"),
+             "tsunami": props.get("tsunami"),
+             "sig": props.get("sig"),
+             "net": props.get("net"),
+             "code": props.get("code"),
+             "ids": props.get("ids"),
+             "sources": props.get("sources"),
+             "types": props.get("types"),
+             "nst": props.get("nst"),
+             "dmin": props.get("dmin"),
+             "rms": props.get("rms"),
+             "gap": props.get("gap"),
+             "magType": props.get("magType"),
+             "type": props.get("type"),
+             "title": props.get("title"),
+             "longitude": geom["coordinates"][0] if geom and geom.get("coordinates") else None,
+             "latitude": geom["coordinates"][1] if geom and geom.get("coordinates") else None,
+             "depth": geom["coordinates"][2] if geom and geom.get("coordinates") and len(geom["coordinates"]) > 2 else None,
+         }
+
+
+ def main():
+     # Create dlt pipeline with Nessie REST catalog
+     pipeline = dlt.pipeline(
+         pipeline_name="usgs_earthquakes",
+         destination=iceberg_rest(
+             catalog_uri="http://localhost:19120/iceberg/main",
+             namespace="examples",
+             s3_endpoint="http://localhost:9000",
+             s3_access_key_id="minioadmin",
+             s3_secret_access_key="minioadmin",
+             s3_region="us-east-1",
+         ),
+         dataset_name="usgs_data",
+     )
+
+     # Load earthquakes from 2010 through current date
+     # Breaking into monthly batches to avoid overwhelming the API
+     # Note: USGS endtime is exclusive, so we use the first day of next month
+
+     start_date = date(2010, 1, 1)
+     end_date = date.today()
+
+     date_ranges = []
+     current = start_date
+     while current <= end_date:
+         next_month = current + relativedelta(months=1)
+         date_ranges.append((current.isoformat(), next_month.isoformat()))
+         current = next_month
+
+     print(f"Loading {len(date_ranges)} months of earthquake data from {start_date} to {end_date}...")
+     print()
+
+     for i, (start, end) in enumerate(date_ranges, 1):
+         @dlt.resource(
+             name="earthquakes",
+             write_disposition="append",
+             columns={
+                 "time": {
+                     "data_type": "timestamp",
+                     "x-partition": True,
+                     "x-partition-transform": "month",
+                 }
+             }
+         )
+         def earthquakes_batch():
+             return fetch_earthquakes(start, end)
+
+         load_info = pipeline.run(earthquakes_batch())
+         print(f"[{i}/{len(date_ranges)}] Loaded {start} to {end}")
+         print()
+
+     # Query the table to verify
+     from pyiceberg.catalog import load_catalog
+
+     catalog = load_catalog(
+         "query",
+         type="rest",
+         uri="http://localhost:19120/iceberg/main",
+         **{
+             "s3.endpoint": "http://localhost:9000",
+             "s3.access-key-id": "minioadmin",
+             "s3.secret-access-key": "minioadmin",
+             "s3.region": "us-east-1",
+         },
+     )
+
+     table = catalog.load_table("examples.earthquakes")
+     result = table.scan().to_arrow()
+
+     print(f"\n{'='*60}")
+     print(f"Total earthquakes loaded: {len(result)}")
+
+     import pyarrow.compute as pc
+     print(f"Date range: {pc.min(result['time']).as_py()} to {pc.max(result['time']).as_py()}")
+     print(f"Magnitude range: {pc.min(result['magnitude']).as_py()} to {pc.max(result['magnitude']).as_py()}")
+
+     # Show some sample records
+     import pandas as pd
+     df = result.to_pandas()
+     print(f"\nSample earthquakes:")
+     print(df[["time", "magnitude", "place", "depth"]].head(10).to_string(index=False))
+
+     # Show distribution by month
+     df["month"] = pd.to_datetime(df["time"]).dt.to_period("M")
+     monthly_counts = df.groupby("month").size().sort_index()
+     print(f"\nEarthquakes by month:")
+     for month, count in monthly_counts.items():
+         print(f" {month}: {count:,}")
+
+     # Show magnitude distribution
+     print(f"\nMagnitude distribution:")
+     print(df["magnitude"].describe())
+
+     print(f"\n{'='*60}")
+     print("USGS earthquake data load complete!")
+
+
+ if __name__ == "__main__":
+     main()

dlt_iceberg-0.1.5/examples/usgs_load.log

@@ -0,0 +1,88 @@
+ Reading inline script metadata from `usgs_earthquakes.py`
+ Installed 60 packages in 340ms
+ Loading 190 months of earthquake data from 2010-01-01 to 2025-10-12...
+
+ Fetching earthquakes from 2010-01-01 to 2010-02-01...
+ Retrieved 9923 earthquakes
+ Traceback (most recent call last):
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/extract/pipe_iterator.py", line 274, in _get_source_item
+     pipe_item = next(gen)
+   File "/Users/nico/Code/sidequery-dlt/examples/usgs_earthquakes.py", line 107, in fetch_earthquakes
+     "time": datetime.fromtimestamp(props["time"] / 1000) if props.get("time") else None,
+             ^^^^^^^^
+ UnboundLocalError: cannot access local variable 'datetime' where it is not associated with a value
+
+ The above exception was the direct cause of the following exception:
+
+ Traceback (most recent call last):
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 476, in extract
+     self._extract_source(
+     ~~~~~~~~~~~~~~~~~~~~^
+         extract_step,
+         ^^^^^^^^^^^^^
+     ...<3 lines>...
+         refresh=refresh or self.refresh,
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+     )
+     ^
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 1308, in _extract_source
+     load_id = extract.extract(
+         source, max_parallel_items, workers, load_package_state_update=load_package_state_update
+     )
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/extract/extract.py", line 473, in extract
+     self._extract_single_source(
+     ~~~~~~~~~~~~~~~~~~~~~~~~~~~^
+         load_id,
+         ^^^^^^^^
+     ...<2 lines>...
+         workers=workers,
+         ^^^^^^^^^^^^^^^^
+     )
+     ^
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/extract/extract.py", line 397, in _extract_single_source
+     for pipe_item in pipes:
+                      ^^^^^
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/extract/pipe_iterator.py", line 158, in __next__
+     pipe_item = self._get_source_item()
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/extract/pipe_iterator.py", line 304, in _get_source_item
+     raise ResourceExtractionError(pipe.name, gen, str(ex), "generator") from ex
+ dlt.extract.exceptions.ResourceExtractionError: In processing pipe `earthquakes`: extraction of resource `earthquakes` in `generator` `fetch_earthquakes` caused an exception: cannot access local variable 'datetime' where it is not associated with a value
+
+ The above exception was the direct cause of the following exception:
+
+ Traceback (most recent call last):
+   File "/Users/nico/Code/sidequery-dlt/examples/usgs_earthquakes.py", line 234, in <module>
+     main()
+     ~~~~^^
+   File "/Users/nico/Code/sidequery-dlt/examples/usgs_earthquakes.py", line 183, in main
+     load_info = pipeline.run(earthquakes_batch())
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 223, in _wrap
+     step_info = f(self, *args, **kwargs)
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 272, in _wrap
+     return f(self, *args, **kwargs)
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 724, in run
+     self.extract(
+     ~~~~~~~~~~~~^
+         data,
+         ^^^^^
+     ...<8 lines>...
+         loader_file_format=loader_file_format,
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+     )
+     ^
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 223, in _wrap
+     step_info = f(self, *args, **kwargs)
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 177, in _wrap
+     rv = f(self, *args, **kwargs)
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 163, in _wrap
+     return f(self, *args, **kwargs)
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 272, in _wrap
+     return f(self, *args, **kwargs)
+   File "/Users/nico/.cache/uv/archive-v0/6bPdJD5TjlWUrQz91CSnF/lib/python3.13/site-packages/dlt/pipeline/pipeline.py", line 497, in extract
+     raise PipelineStepFailed(
+     ...<5 lines>...
+     ) from exc
+ dlt.pipeline.exceptions.PipelineStepFailed: Pipeline execution failed at `step=extract` when processing package with `load_id=1760337324.066329` with exception:
+
+ <class 'dlt.extract.exceptions.ResourceExtractionError'>
+ In processing pipe `earthquakes`: extraction of resource `earthquakes` in `generator` `fetch_earthquakes` caused an exception: cannot access local variable 'datetime' where it is not associated with a value

{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "dlt-iceberg"
- version = "0.1.3"
+ version = "0.1.5"
  description = "dlt destination for Apache Iceberg with atomic multi-file commits via REST catalogs"
  readme = "README.md"
  requires-python = ">=3.11"

{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/src/dlt_iceberg/destination_client.py

@@ -94,7 +94,7 @@ class IcebergRestConfiguration(DestinationClientConfiguration):
      strict_casting: bool = False

      # Merge batch size (for upsert operations to avoid memory issues)
-     merge_batch_size: int = 100000
+     merge_batch_size: int = 500000


  class IcebergRestLoadJob(RunnableLoadJob):
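The only code change in this file is the default merge batch size, raised from 100,000 to 500,000 rows per upsert batch. Since `merge_batch_size` is exposed on `iceberg_rest()` (see the configuration snippet earlier in this diff), the previous behavior can presumably be restored per destination instance; a sketch with illustrative values:

```python
from dlt_iceberg import iceberg_rest

# Smaller merge batches trade upsert throughput for lower peak memory use.
destination = iceberg_rest(
    catalog_uri="http://localhost:19120/iceberg/main",
    namespace="examples",
    merge_batch_size=100_000,  # restore the pre-0.1.5 default
)
```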
{dlt_iceberg-0.1.3 → dlt_iceberg-0.1.5}/uv.lock

@@ -182,7 +182,7 @@ wheels = [

  [[package]]
  name = "dlt-iceberg"
- version = "0.1.2"
+ version = "0.1.4"
  source = { editable = "." }
  dependencies = [
      { name = "boto3" },