duckrun 0.3.17.dev7__tar.gz → 0.3.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.3.17.dev7/duckrun.egg-info → duckrun-0.3.19}/PKG-INFO +64 -9
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/README.md +62 -7
- duckrun-0.3.19/dbt/adapters/duckrun/__version__.py +1 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/delta_dml.py +47 -4
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/impl.py +4 -0
- duckrun-0.3.19/duckrun/_runtime.py +51 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun/session.py +103 -13
- {duckrun-0.3.17.dev7 → duckrun-0.3.19/duckrun.egg-info}/PKG-INFO +64 -9
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun.egg-info/SOURCES.txt +1 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun.egg-info/requires.txt +1 -1
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/pyproject.toml +13 -14
- duckrun-0.3.17.dev7/dbt/adapters/duckrun/__version__.py +0 -1
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/LICENSE +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/MANIFEST.in +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/credentials.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/delta_plugin.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/engine.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/environment.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/remote.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/adapters/duckrun/secret.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/dbt_project.yml +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/catalog.sql +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/snapshot.sql +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun/auth.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun/delta_table.py +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.3.17.dev7 → duckrun-0.3.19}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.3.17.dev7
|
|
3
|
+
Version: 0.3.19
|
|
4
4
|
Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -12,7 +12,7 @@ Description-Content-Type: text/markdown
|
|
|
12
12
|
License-File: LICENSE
|
|
13
13
|
Requires-Dist: dbt-duckdb>=1.8
|
|
14
14
|
Requires-Dist: dbt-core<2.0,>=1.8
|
|
15
|
-
Requires-Dist: duckdb
|
|
15
|
+
Requires-Dist: duckdb<1.6.0,>=1.5.4
|
|
16
16
|
Requires-Dist: deltalake<1.5.1,>=1.5.0
|
|
17
17
|
Requires-Dist: requests
|
|
18
18
|
Provides-Extra: local
|
|
@@ -35,9 +35,13 @@ Dynamic: license-file
|
|
|
35
35
|
> not affiliated with, endorsed by, or supported by any employer or vendor. No warranty —
|
|
36
36
|
> use it at your own risk.
|
|
37
37
|
|
|
38
|
-
**duckrun**
|
|
39
|
-
**
|
|
40
|
-
|
|
38
|
+
**duckrun** runs SQL in [DuckDB](https://duckdb.org/) and writes
|
|
39
|
+
[**Delta Lake**](https://delta-io.github.io/delta-rs/) via delta_rs. It gives you:
|
|
40
|
+
|
|
41
|
+
- a [**dbt**](https://www.getdbt.com/) adapter that materializes models as Delta tables;
|
|
42
|
+
- a **`connect()`** helper to write Delta straight from SQL in a notebook;
|
|
43
|
+
- **full snapshot isolation** from read to write — concurrent writers fail loud, never interleave.
|
|
44
|
+
|
|
41
45
|
duckrun itself is just glue — it owns none of the heavy lifting. The real work is done
|
|
42
46
|
by **DuckDB** (executes the SQL), **delta-rs** (writes the Delta table), **Arrow** (the
|
|
43
47
|
zero-copy (kind of) bridge that hands query results from DuckDB to delta-rs), and **dbt** (orchestrates
|
|
@@ -67,6 +71,21 @@ pip install duckrun
|
|
|
67
71
|
|
|
68
72
|
That single install pulls in `dbt-duckdb` (and therefore `duckdb`) plus `deltalake`.
|
|
69
73
|
|
|
74
|
+
### In a Microsoft Fabric Python notebook
|
|
75
|
+
|
|
76
|
+
duckrun needs `duckdb` ≥ 1.5.4 — the release where `delta_scan` gained its `version => N`
|
|
77
|
+
parameter, which duckrun uses for snapshot-pinned reads. Fabric notebooks ship a **stable**
|
|
78
|
+
`duckdb` release, which trails the newest one, so the `duckdb` already loaded in the kernel may
|
|
79
|
+
predate 1.5.4. Upgrade, then restart the Python kernel so the new version loads.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
!pip install duckrun --upgrade
|
|
83
|
+
notebookutils.session.restartPython()
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
If you skip the restart, duckrun fails loud at `connect()` (and on `dbt run`) and tells you to
|
|
87
|
+
restart — it won't quietly run on the older `duckdb`/`deltalake` still bound in the kernel.
|
|
88
|
+
|
|
70
89
|
## Configure your profile
|
|
71
90
|
|
|
72
91
|
```yaml
|
|
@@ -79,12 +98,22 @@ my_project:
|
|
|
79
98
|
# No `threads:` needed — duckrun always runs single-threaded.
|
|
80
99
|
# DuckDB runs in-memory by default — the Delta tables are the only state.
|
|
81
100
|
# Default Delta location for models that don't set config(location=...).
|
|
82
|
-
|
|
101
|
+
# OneLake — address by GUID, not friendly names (see "OneLake: use GUID paths" below):
|
|
102
|
+
root_path: "abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables"
|
|
103
|
+
# Or any other store: './warehouse' (local), 's3://...', 'gs://...'.
|
|
83
104
|
# storage_options: {} # passed through to deltalake for remote stores
|
|
84
105
|
```
|
|
85
106
|
|
|
86
107
|
Persisted models are written to `<root_path>/<schema>/<model>` (e.g.
|
|
87
|
-
|
|
108
|
+
`.../Tables/dbo/orders`), or to an explicit `config(location=...)`.
|
|
109
|
+
|
|
110
|
+
### OneLake: use GUID paths for now
|
|
111
|
+
|
|
112
|
+
Address OneLake tables by **workspace GUID + lakehouse GUID**, not friendly names —
|
|
113
|
+
`abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables/...`. This
|
|
114
|
+
sidesteps an upstream `duckdb-delta` read bug ("No files in log segment") that is **already fixed
|
|
115
|
+
upstream but still rolling out to production OneLake**. Friendly-name paths will work again once
|
|
116
|
+
the fix finishes deploying.
|
|
88
117
|
|
|
89
118
|
### Fabric Lakehouse without a schema
|
|
90
119
|
|
|
@@ -95,7 +124,7 @@ let the schema fill that slot:
|
|
|
95
124
|
|
|
96
125
|
```yaml
|
|
97
126
|
schema: Tables
|
|
98
|
-
root_path: "abfss://<
|
|
127
|
+
root_path: "abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>"
|
|
99
128
|
```
|
|
100
129
|
|
|
101
130
|
Since models are written to `<root_path>/<schema>/<model>`, this lands them at
|
|
@@ -309,7 +338,7 @@ unchanged since the call, else raises `CommitFailedError`.
|
|
|
309
338
|
|
|
310
339
|
```python
|
|
311
340
|
import duckrun
|
|
312
|
-
conn = duckrun.connect("abfss
|
|
341
|
+
conn = duckrun.connect("abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables/dbo")
|
|
313
342
|
conn.sql("select * from orders").write.mode("overwrite").saveAsTable("orders_copy")
|
|
314
343
|
conn.table("orders_copy").show()
|
|
315
344
|
|
|
@@ -355,6 +384,32 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
|
|
|
355
384
|
runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
|
|
356
385
|
in those docs are rendered live by CI, so they always reflect the latest `main`.
|
|
357
386
|
|
|
387
|
+
## Limitations
|
|
388
|
+
|
|
389
|
+
These are core design trade-offs, not bugs — they're inherent to gluing DuckDB to delta_rs and
|
|
390
|
+
won't be "fixed" away:
|
|
391
|
+
|
|
392
|
+
- **A single dbt run is single-threaded — but concurrency works fine.** This is purely a dbt-adapter
|
|
393
|
+
implementation detail: *within one dbt process* models run with `threads: 1`, because the
|
|
394
|
+
in-process delta_rs write path isn't thread-safe (parallel writes to a table in the *same* process
|
|
395
|
+
collide). It is **not** a limit on concurrent writers. Multiple independent writers — separate dbt
|
|
396
|
+
runs, notebooks, jobs, whatever — writing the same tables at the same time is fully supported and
|
|
397
|
+
safe: every write uses optimistic concurrency (snapshot-pinned MERGE, `safeappend` compare-and-swap,
|
|
398
|
+
fail-loud on a conflicting commit). So you can absolutely run many writers in parallel; you just
|
|
399
|
+
can't multi-thread the models *inside a single* dbt invocation.
|
|
400
|
+
- **Two engines share one machine's memory.** DuckDB executes the SQL and delta_rs materializes the
|
|
401
|
+
Delta table — two separate memory systems in the same process, each with its own pool. Under heavy
|
|
402
|
+
memory pressure (large merges especially) the budget has to be split between them, and getting that
|
|
403
|
+
split right is fragile: delta_rs's merge spill-to-disk is itself flaky, and coordinating two
|
|
404
|
+
systems that don't know about each other's allocations is the hard, unavoidable part of this design.
|
|
405
|
+
- **`DROP TABLE` is a soft tombstone, not a physical delete.** delta_rs has no `DROP`, and removing the
|
|
406
|
+
Delta files directly would be a filesystem hack that fails on object stores — so `conn.sql("drop
|
|
407
|
+
table x")` overwrites the table with a one-column tombstone marker and unregisters it. The table
|
|
408
|
+
vanishes from `conn.catalog` and discovery, and a later `create table x as …` revives the path with
|
|
409
|
+
real data, but the **files are not reclaimed** (a human purges them). One consequence: reading the
|
|
410
|
+
path *directly* (`conn.read.delta("…/x")`) bypasses discovery and returns the one-row tombstone
|
|
411
|
+
marker rather than erroring — address dropped tables by name, not by path.
|
|
412
|
+
|
|
358
413
|
## License
|
|
359
414
|
|
|
360
415
|
MIT
|
|
@@ -6,9 +6,13 @@
|
|
|
6
6
|
> not affiliated with, endorsed by, or supported by any employer or vendor. No warranty —
|
|
7
7
|
> use it at your own risk.
|
|
8
8
|
|
|
9
|
-
**duckrun**
|
|
10
|
-
**
|
|
11
|
-
|
|
9
|
+
**duckrun** runs SQL in [DuckDB](https://duckdb.org/) and writes
|
|
10
|
+
[**Delta Lake**](https://delta-io.github.io/delta-rs/) via delta_rs. It gives you:
|
|
11
|
+
|
|
12
|
+
- a [**dbt**](https://www.getdbt.com/) adapter that materializes models as Delta tables;
|
|
13
|
+
- a **`connect()`** helper to write Delta straight from SQL in a notebook;
|
|
14
|
+
- **full snapshot isolation** from read to write — concurrent writers fail loud, never interleave.
|
|
15
|
+
|
|
12
16
|
duckrun itself is just glue — it owns none of the heavy lifting. The real work is done
|
|
13
17
|
by **DuckDB** (executes the SQL), **delta-rs** (writes the Delta table), **Arrow** (the
|
|
14
18
|
zero-copy (kind of) bridge that hands query results from DuckDB to delta-rs), and **dbt** (orchestrates
|
|
@@ -38,6 +42,21 @@ pip install duckrun
|
|
|
38
42
|
|
|
39
43
|
That single install pulls in `dbt-duckdb` (and therefore `duckdb`) plus `deltalake`.
|
|
40
44
|
|
|
45
|
+
### In a Microsoft Fabric Python notebook
|
|
46
|
+
|
|
47
|
+
duckrun needs `duckdb` ≥ 1.5.4 — the release where `delta_scan` gained its `version => N`
|
|
48
|
+
parameter, which duckrun uses for snapshot-pinned reads. Fabric notebooks ship a **stable**
|
|
49
|
+
`duckdb` release, which trails the newest one, so the `duckdb` already loaded in the kernel may
|
|
50
|
+
predate 1.5.4. Upgrade, then restart the Python kernel so the new version loads.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
!pip install duckrun --upgrade
|
|
54
|
+
notebookutils.session.restartPython()
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
If you skip the restart, duckrun fails loud at `connect()` (and on `dbt run`) and tells you to
|
|
58
|
+
restart — it won't quietly run on the older `duckdb`/`deltalake` still bound in the kernel.
|
|
59
|
+
|
|
41
60
|
## Configure your profile
|
|
42
61
|
|
|
43
62
|
```yaml
|
|
@@ -50,12 +69,22 @@ my_project:
|
|
|
50
69
|
# No `threads:` needed — duckrun always runs single-threaded.
|
|
51
70
|
# DuckDB runs in-memory by default — the Delta tables are the only state.
|
|
52
71
|
# Default Delta location for models that don't set config(location=...).
|
|
53
|
-
|
|
72
|
+
# OneLake — address by GUID, not friendly names (see "OneLake: use GUID paths" below):
|
|
73
|
+
root_path: "abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables"
|
|
74
|
+
# Or any other store: './warehouse' (local), 's3://...', 'gs://...'.
|
|
54
75
|
# storage_options: {} # passed through to deltalake for remote stores
|
|
55
76
|
```
|
|
56
77
|
|
|
57
78
|
Persisted models are written to `<root_path>/<schema>/<model>` (e.g.
|
|
58
|
-
|
|
79
|
+
`.../Tables/dbo/orders`), or to an explicit `config(location=...)`.
|
|
80
|
+
|
|
81
|
+
### OneLake: use GUID paths for now
|
|
82
|
+
|
|
83
|
+
Address OneLake tables by **workspace GUID + lakehouse GUID**, not friendly names —
|
|
84
|
+
`abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables/...`. This
|
|
85
|
+
sidesteps an upstream `duckdb-delta` read bug ("No files in log segment") that is **already fixed
|
|
86
|
+
upstream but still rolling out to production OneLake**. Friendly-name paths will work again once
|
|
87
|
+
the fix finishes deploying.
|
|
59
88
|
|
|
60
89
|
### Fabric Lakehouse without a schema
|
|
61
90
|
|
|
@@ -66,7 +95,7 @@ let the schema fill that slot:
|
|
|
66
95
|
|
|
67
96
|
```yaml
|
|
68
97
|
schema: Tables
|
|
69
|
-
root_path: "abfss://<
|
|
98
|
+
root_path: "abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>"
|
|
70
99
|
```
|
|
71
100
|
|
|
72
101
|
Since models are written to `<root_path>/<schema>/<model>`, this lands them at
|
|
@@ -280,7 +309,7 @@ unchanged since the call, else raises `CommitFailedError`.
|
|
|
280
309
|
|
|
281
310
|
```python
|
|
282
311
|
import duckrun
|
|
283
|
-
conn = duckrun.connect("abfss
|
|
312
|
+
conn = duckrun.connect("abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables/dbo")
|
|
284
313
|
conn.sql("select * from orders").write.mode("overwrite").saveAsTable("orders_copy")
|
|
285
314
|
conn.table("orders_copy").show()
|
|
286
315
|
|
|
@@ -326,6 +355,32 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
|
|
|
326
355
|
runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
|
|
327
356
|
in those docs are rendered live by CI, so they always reflect the latest `main`.
|
|
328
357
|
|
|
358
|
+
## Limitations
|
|
359
|
+
|
|
360
|
+
These are core design trade-offs, not bugs — they're inherent to gluing DuckDB to delta_rs and
|
|
361
|
+
won't be "fixed" away:
|
|
362
|
+
|
|
363
|
+
- **A single dbt run is single-threaded — but concurrency works fine.** This is purely a dbt-adapter
|
|
364
|
+
implementation detail: *within one dbt process* models run with `threads: 1`, because the
|
|
365
|
+
in-process delta_rs write path isn't thread-safe (parallel writes to a table in the *same* process
|
|
366
|
+
collide). It is **not** a limit on concurrent writers. Multiple independent writers — separate dbt
|
|
367
|
+
runs, notebooks, jobs, whatever — writing the same tables at the same time is fully supported and
|
|
368
|
+
safe: every write uses optimistic concurrency (snapshot-pinned MERGE, `safeappend` compare-and-swap,
|
|
369
|
+
fail-loud on a conflicting commit). So you can absolutely run many writers in parallel; you just
|
|
370
|
+
can't multi-thread the models *inside a single* dbt invocation.
|
|
371
|
+
- **Two engines share one machine's memory.** DuckDB executes the SQL and delta_rs materializes the
|
|
372
|
+
Delta table — two separate memory systems in the same process, each with its own pool. Under heavy
|
|
373
|
+
memory pressure (large merges especially) the budget has to be split between them, and getting that
|
|
374
|
+
split right is fragile: delta_rs's merge spill-to-disk is itself flaky, and coordinating two
|
|
375
|
+
systems that don't know about each other's allocations is the hard, unavoidable part of this design.
|
|
376
|
+
- **`DROP TABLE` is a soft tombstone, not a physical delete.** delta_rs has no `DROP`, and removing the
|
|
377
|
+
Delta files directly would be a filesystem hack that fails on object stores — so `conn.sql("drop
|
|
378
|
+
table x")` overwrites the table with a one-column tombstone marker and unregisters it. The table
|
|
379
|
+
vanishes from `conn.catalog` and discovery, and a later `create table x as …` revives the path with
|
|
380
|
+
real data, but the **files are not reclaimed** (a human purges them). One consequence: reading the
|
|
381
|
+
path *directly* (`conn.read.delta("…/x")`) bypasses discovery and returns the one-row tombstone
|
|
382
|
+
marker rather than erroring — address dropped tables by name, not by path.
|
|
383
|
+
|
|
329
384
|
## License
|
|
330
385
|
|
|
331
386
|
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version = "0.3.19"
|
|
@@ -129,6 +129,11 @@ _CREATE_TEMP_RE = re.compile(r"\s*create\s+(?:or\s+replace\s+)?(?:temp|temporary
|
|
|
129
129
|
# verb would match inside an identifier (e.g. `update` within `last_update`).
|
|
130
130
|
_LEADING_WITH = re.compile(r"\s*with\b", re.I)
|
|
131
131
|
_DRIVING_DML = re.compile(r"\b(?:insert\s+into|update|delete\s+from)\b", re.I)
|
|
132
|
+
# Recognises a type name that begins with one of DuckDB's numeric type keywords, in any letter
# case; a parameterised DECIMAL(p,s) is recognised through its DECIMAL prefix. The lossy-narrowing
# guard relies on this to examine numeric→numeric casts only, so the intentional timestamp/string
# alignment is never second-guessed.
_NUMERIC_TYPE_RE = re.compile(
    r"^(?:TINYINT|SMALLINT|INTEGER|BIGINT|HUGEINT|UTINYINT|USMALLINT|UINTEGER|UBIGINT|UHUGEINT|"
    r"FLOAT|REAL|DOUBLE|DECIMAL)\b",
    flags=re.IGNORECASE,
)
|
|
132
137
|
|
|
133
138
|
|
|
134
139
|
def _strip_leading(query: str) -> str:
|
|
@@ -382,10 +387,10 @@ class _DeltaDML:
|
|
|
382
387
|
if self._with_clause: # `WITH … INSERT INTO t SELECT …`: re-attach the CTE to the body
|
|
383
388
|
body = f"{self._with_clause} {body}"
|
|
384
389
|
cols = m.group("cols")
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
390
|
+
# Always project onto the target schema — a column list maps by name, no list maps
|
|
391
|
+
# positionally. Routing both through _append_projected gives one place for the intentional
|
|
392
|
+
# type alignment AND the lossy-numeric-narrowing guard (so `insert … select 3.9` is caught too).
|
|
393
|
+
self._append_projected(loc, self._provided(cols) if cols else None, f"({body})")
|
|
389
394
|
|
|
390
395
|
def _insert_values(self, m, rel, schema, loc) -> None:
|
|
391
396
|
# `insert into <rel> [(<cols>)] values (...)`: the literals supply every target column when
|
|
@@ -420,6 +425,7 @@ class _DeltaDML:
|
|
|
420
425
|
|
|
421
426
|
quoted = ", ".join('"' + c + '"' for c in provided)
|
|
422
427
|
inner = f"{derived} v({quoted})"
|
|
428
|
+
self._reject_lossy_numeric_narrowing(inner, provided, dict(zip(target_cols, target_types)))
|
|
423
429
|
exprs = [
|
|
424
430
|
f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
|
|
425
431
|
else f'cast(null as {typ}) as "{col}"'
|
|
@@ -428,6 +434,43 @@ class _DeltaDML:
|
|
|
428
434
|
data = self.cursor.sql(f"select {', '.join(exprs)} from {inner}")
|
|
429
435
|
engine.write_delta(loc, data, "append", storage_options=self.so)
|
|
430
436
|
|
|
437
|
+
def _reject_lossy_numeric_narrowing(self, inner: str, provided, ttype) -> None:
    """Raise when casting a supplied numeric column onto its target type would change a value.

    The projection that follows this check casts every supplied column to the target column's
    type. That alignment is intentional, but a numeric→numeric cast can silently alter data
    (the motivating example is 3.9 inserted into an INTEGER column). For each supplied column
    whose source and target types are both numeric and differ, the value is cast to the target
    type and back; any row where the result is distinct from the original is counted as lossy.
    Columns whose source or target type is not numeric are never inspected.

    Args:
        inner: SQL relation text exposing the supplied columns under the alias ``v``.
        provided: names of the columns the statement supplies.
        ttype: mapping of target column name to target type name.

    Raises:
        ValueError: for the first checked column with at least one lossy value, naming the
            column, the target type, the number of affected values and one example.

    Evaluates ``inner`` a second time (after a zero-row type probe) when any column needs
    checking.
    """
    # Type probe: `limit 0` produces no rows, the relation is only asked for its column types.
    probe_cols = ", ".join(f'v."{c}"' for c in provided)
    probe = self.cursor.sql("select " + probe_cols + f" from {inner} limit 0")
    source_types = {name: str(typ) for name, typ in zip(provided, probe.types)}

    # (column, SQL predicate that is true for a value the cast would not preserve)
    suspects = []
    for col in provided:
        src_t, dst_t = source_types[col], ttype[col]
        if src_t != dst_t and _NUMERIC_TYPE_RE.match(src_t) and _NUMERIC_TYPE_RE.match(dst_t):
            # try_cast keeps the probe itself from throwing: a value the target type cannot
            # hold becomes NULL, which is distinct from the original and is therefore flagged
            # exactly like a fractional loss.
            suspects.append(
                (col, f'try_cast(try_cast(v."{col}" as {dst_t}) as {src_t}) is distinct from v."{col}"'))
    if not suspects:
        return

    # Two aggregates per suspect column: how many values are lossy, and one example of them.
    select_items = []
    for idx, (col, pred) in enumerate(suspects):
        select_items.append(f'count(*) filter (where {pred}) as "n{idx}"')
        select_items.append(f'any_value(v."{col}") filter (where {pred}) as "ex{idx}"')
    sel = ", ".join(select_items)
    row = self.cursor.sql(f"select {sel} from {inner}").fetchone()

    # The result row alternates count, example, count, example, … in suspect order.
    for (col, _pred), n, ex in zip(suspects, row[0::2], row[1::2]):
        if n:
            raise ValueError(
                f"INSERT would silently narrow {n} value(s) for column '{col}' into "
                f"{ttype[col]} (e.g. {ex!r}). Cast explicitly in the SELECT/VALUES if intended."
            )
|
|
473
|
+
|
|
431
474
|
def _alter_add(self, m, rel, schema, loc) -> None:
|
|
432
475
|
col = m.group("col").strip().strip('"')
|
|
433
476
|
# Keep only the column type (drop any DEFAULT/NULL clause); add it as an all-null column by
|
|
@@ -32,6 +32,10 @@ class DuckrunConnectionManager(DuckDBConnectionManager):
|
|
|
32
32
|
|
|
33
33
|
@classmethod
|
|
34
34
|
def open(cls, connection):
|
|
35
|
+
# Fail loud if the kernel still has Fabric's stale duckdb/deltalake loaded (installed an
|
|
36
|
+
# upgrade but skipped notebookutils.session.restartPython()). Lazy import: same wheel.
|
|
37
|
+
from duckrun._runtime import check_runtime_versions
|
|
38
|
+
check_runtime_versions()
|
|
35
39
|
# duckrun runs single-threaded, so it uses ONE DuckDB connection for the whole run
|
|
36
40
|
# (DuckrunEnvironment) instead of dbt-duckdb's per-handle cursors — see environment.py.
|
|
37
41
|
# Pre-seed the base class's singleton _ENV with it for the local case; remote/MotherDuck
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Runtime version guardrail.
|
|
2
|
+
|
|
3
|
+
duckrun needs ``duckdb`` >= 1.5.4 — the release where ``delta_scan`` gained its ``version => N``
|
|
4
|
+
parameter (used for snapshot-pinned reads) — and ``deltalake`` >= 1.5.0 (for the merge
|
|
5
|
+
``max_spill_size`` cap). A Microsoft Fabric Python notebook ships a *stable* ``duckdb`` release,
|
|
6
|
+
which trails the newest one, so the ``duckdb`` already imported in the kernel may predate 1.5.4.
|
|
7
|
+
``pip install duckrun --upgrade`` writes the new wheels to disk, but the already-loaded modules stay
|
|
8
|
+
bound until the kernel restarts — so a user who skips the restart would keep running on the older
|
|
9
|
+
modules, quietly losing snapshot-pinned reads and the spill cap.
|
|
10
|
+
|
|
11
|
+
This check turns that into a loud, actionable error. It inspects the *loaded* versions (not the
|
|
12
|
+
pin), so it fires exactly on the forgot-to-restart case.
|
|
13
|
+
"""
|
|
14
|
+
from packaging.version import Version
|
|
15
|
+
|
|
16
|
+
# Minimum versions duckrun requires of the modules actually loaded at runtime. These must stay
# aligned with the dependency pins declared in pyproject.toml.
_MIN_DUCKDB = "1.5.4"  # delta_scan('...', version => N), used for snapshot-pinned incremental reads
_MIN_DELTALAKE = "1.5.0"  # max_spill_size on MERGE, caps merge RAM to avoid OOM on large upserts

# Upgrade-and-restart instructions appended to the version error; the floors above are
# substituted into the generic pip command once, at import time.
_REMEDY = str.format(
    "In a Fabric Python notebook, upgrade then restart the kernel so the new versions load:\n"
    " !pip install duckrun --upgrade\n"
    " notebookutils.session.restartPython()\n"
    "then re-run. (Elsewhere: pip install -U 'duckdb>={duckdb}' 'deltalake>={deltalake}' and "
    "restart the interpreter.)",
    duckdb=_MIN_DUCKDB,
    deltalake=_MIN_DELTALAKE,
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def check_runtime_versions():
    """Raise ``RuntimeError`` when the imported duckdb or deltalake is below duckrun's floor.

    The comparison uses each module's ``__version__`` as bound in this interpreter, so it
    reports the situation where an upgrade was installed but the kernel was not restarted and
    the older modules are still loaded. Returns ``None`` when both versions are acceptable.

    Raises:
        RuntimeError: listing every package that is too old, followed by the
            upgrade-and-restart instructions.
    """
    import duckdb
    import deltalake

    requirements = (
        ("duckdb", duckdb, _MIN_DUCKDB),
        ("deltalake", deltalake, _MIN_DELTALAKE),
    )
    stale = []
    for label, module, floor in requirements:
        loaded = module.__version__
        if Version(loaded) < Version(floor):
            stale.append(f"{label} {loaded} (need >= {floor})")

    if not stale:
        return
    raise RuntimeError(
        "duckrun needs a newer " + " and ".join(stale) + " than the kernel has loaded.\n"
        + _REMEDY
    )
|
|
@@ -16,6 +16,7 @@ import duckdb
|
|
|
16
16
|
|
|
17
17
|
from dbt.adapters.duckrun import delta_dml, engine, remote, secret
|
|
18
18
|
from . import auth
|
|
19
|
+
from ._runtime import check_runtime_versions
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
# Statements that would WRITE to a table — rejected by the read-only conn.sql() with a pointer to
|
|
@@ -25,6 +26,8 @@ from . import auth
|
|
|
25
26
|
# TEMP/TEMPORARY TABLE and CREATE VIEW are DuckDB-local scratch by design and pass through.
|
|
26
27
|
_WRITE_KEYWORD_RE = re.compile(r"^(insert|update|delete|merge)\b", re.IGNORECASE)
|
|
27
28
|
_CREATE_TABLE_RE = re.compile(r"^create\s+(or\s+replace\s+)?table\b", re.IGNORECASE)
|
|
29
|
+
# Captures, as group "rel", the relation named right after a leading INSERT INTO / DELETE FROM /
# UPDATE. The name is a run of word characters and dots with an optional double quote on either
# side, so a fully quoted qualified name such as "dbo"."orders" is captured only up to the first
# closing quote.
_DML_TARGET_RE = re.compile(
    r"^(?:insert\s+into|delete\s+from|update)\s+(?P<rel>\"?[\w.]+\"?)",
    flags=re.IGNORECASE,
)
|
|
28
31
|
_CREATE_TEMP_RE = re.compile(r"^create\s+(or\s+replace\s+)?(temp|temporary)\b", re.IGNORECASE)
|
|
29
32
|
|
|
30
33
|
# DML forms that genuinely can't be expressed through delta_rs (delta_dml.handle never applies them):
|
|
@@ -95,6 +98,34 @@ def _is_delta_write(query: str) -> bool:
|
|
|
95
98
|
return bool(_CREATE_TABLE_RE.match(s)) and not _CREATE_TEMP_RE.match(s)
|
|
96
99
|
|
|
97
100
|
|
|
101
|
+
def _delta_write_message(query: str) -> str:
    """Build the error text for a raw-SQL write that conn.sql() cannot route to delta_rs.

    When the statement is an INSERT/UPDATE/DELETE, the message names the target relation and
    gives guidance that fits the verb: UPDATE and DELETE point at a missing or not-yet-refreshed
    table, INSERT says to create the table first. Any other statement (for instance a
    CREATE … AS that did not resolve) gets the generic pointer to the Spark-style write API.

    Args:
        query: the SQL text passed to conn.sql().

    Returns:
        The message to raise; this function does not raise it.
    """
    statement = _strip_leading(query)
    target = _DML_TARGET_RE.match(statement)

    if target is None:
        # Not a recognised INSERT/UPDATE/DELETE: fall back to the generic redirect.
        return (
            "conn.sql() can't write a Delta table from raw SQL here. "
            "Use the Spark write API: df.write.saveAsTable(...) to create/append, or "
            "conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
        )

    rel = target.group("rel").strip('"')
    verb = statement.split(maxsplit=1)[0].lower()  # first keyword of the statement

    if verb in {"update", "delete"}:
        return (
            f"conn.sql(): no Delta table '{rel}' to {verb}. conn.sql() DML only targets a "
            f"discovered Delta table — check the name, or call conn.refresh() if it was just "
            f"written out-of-band."
        )

    # Remaining case: an insert into a table that does not exist yet.
    return (
        f"conn.sql(): no Delta table '{rel}' to insert into. Create it first with "
        f"df.write.saveAsTable('{rel}'), then insert."
    )
|
|
127
|
+
|
|
128
|
+
|
|
98
129
|
def _qid(name: str) -> str:
|
|
99
130
|
"""Quote a SQL identifier (schema/table/view name)."""
|
|
100
131
|
return '"' + str(name).replace('"', '""') + '"'
|
|
@@ -105,6 +136,40 @@ def _qlit(text: str) -> str:
|
|
|
105
136
|
return str(text).replace("'", "''")
|
|
106
137
|
|
|
107
138
|
|
|
139
|
+
def _strip_query_context(msg: str) -> str:
    """Return ``msg`` without the statement echo DuckDB appends to an error.

    DuckDB adds the offending SQL to its error text as a trailing block that starts on a new
    line with ``LINE ``. When duckrun generated that SQL itself (the ``delta_scan`` view), the
    echo only makes the failure look like it concerns the caller's input. Everything from the
    first such marker onward is cut and trailing whitespace is trimmed from what remains; a
    message containing no marker is returned unchanged.
    """
    head, marker, _echo = msg.partition("\nLINE ")
    return head.rstrip() if marker else msg
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# A GUID in 8-4-4-4-12 hexadecimal form (upper- or lower-case digits), anchored with ^ and $ so
# the whole string has to be the GUID rather than merely contain one.
_GUID = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _onelake_guid_hint(root_path: str) -> Optional[str]:
    """Return a workaround note for the OneLake ``delta_scan`` issue, or ``None``.

    The note is produced only for an ``abfss://`` root whose workspace or lakehouse segment is
    not a GUID, i.e. a friendly-name path, which is where ``delta_scan`` can fail to enumerate
    a valid table's ``_delta_log`` (duckdb-delta#307). Non-abfss paths and paths already in GUID
    form get ``None`` so they are not nagged.

    Args:
        root_path: the session's root path.

    Returns:
        The advisory text, or ``None`` when it does not apply.
    """
    if not remote.is_abfss(root_path):
        return None

    workspace, _unused_host, remainder = remote._parse_abfss(root_path)

    # The lakehouse is the first path segment; a trailing ".Lakehouse" item-type suffix
    # (matched case-insensitively) is dropped before testing for a GUID.
    lakehouse = (remainder or "").partition("/")[0]
    if lakehouse.lower().endswith(".lakehouse"):
        suffix_len = len(".Lakehouse")
        lakehouse = lakehouse[:-suffix_len]

    already_guid_form = _GUID.match(workspace) and _GUID.match(lakehouse)
    if already_guid_form:
        return None

    return (
        "OneLake's delta_scan can fail to read a valid table's _delta_log when the abfss path uses "
        "friendly names — a known upstream issue (duckdb-delta#307). Until it's fixed, use the "
        "workspace and lakehouse GUIDs, e.g. "
        "abfss://<workspace-guid>@onelake.dfs.fabric.microsoft.com/<lakehouse-guid>/Tables"
    )
|
|
171
|
+
|
|
172
|
+
|
|
108
173
|
def _split_root_schema(path: str, schema: Optional[str]):
|
|
109
174
|
"""Normalize ``path`` into ``(root_path, schema)``.
|
|
110
175
|
|
|
@@ -227,10 +292,22 @@ class DuckSession:
|
|
|
227
292
|
|
|
228
293
|
def _register_view(self, schema: str, table: str):
|
|
229
294
|
path = f"{self.root_path.rstrip('/')}/{schema}/{table}"
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
295
|
+
try:
|
|
296
|
+
self.con.execute(
|
|
297
|
+
f"CREATE OR REPLACE VIEW {_qid(schema)}.{_qid(table)} AS "
|
|
298
|
+
f"SELECT * FROM delta_scan('{_qlit(path)}')"
|
|
299
|
+
)
|
|
300
|
+
except Exception as exc:
|
|
301
|
+
# delta_scan failed reading the table. Keep the real engine error (it's the signal —
|
|
302
|
+
# e.g. the OneLake "No files in log segment" delta-kernel bug), but drop DuckDB's echo
|
|
303
|
+
# of the CREATE VIEW statement *we* generated, and say which table/path it was. Suppress
|
|
304
|
+
# the chained original (`from None`) so the noisy SQL echo doesn't reappear in tracebacks.
|
|
305
|
+
hint = _onelake_guid_hint(self.root_path)
|
|
306
|
+
raise RuntimeError(
|
|
307
|
+
f"duckrun: could not read Delta table {schema}.{table} at '{path}':\n"
|
|
308
|
+
f"{_strip_query_context(str(exc))}"
|
|
309
|
+
+ (f"\n\n{hint}" if hint else "")
|
|
310
|
+
) from None
|
|
234
311
|
|
|
235
312
|
def _set_search_path(self, schema: str):
|
|
236
313
|
try:
|
|
@@ -275,11 +352,7 @@ class DuckSession:
|
|
|
275
352
|
self.refresh(quiet=True)
|
|
276
353
|
return DataFrame(self.con.sql("SELECT 'ok' AS status"), self)
|
|
277
354
|
if _is_delta_write(query):
|
|
278
|
-
raise ValueError(
|
|
279
|
-
"conn.sql() can't write a Delta table from raw SQL here. "
|
|
280
|
-
"Use the Spark write API: df.write.saveAsTable(...) to create/append, or "
|
|
281
|
-
"conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
|
|
282
|
-
)
|
|
355
|
+
raise ValueError(_delta_write_message(query))
|
|
283
356
|
return DataFrame(self.con.sql(query), self)
|
|
284
357
|
|
|
285
358
|
def table(self, name: str) -> "DataFrame":
|
|
@@ -436,17 +509,17 @@ class DataFrameWriter:
|
|
|
436
509
|
self._partition_by = list(cols)
|
|
437
510
|
return self
|
|
438
511
|
|
|
439
|
-
def
|
|
512
|
+
def _write(self, path: str, descr: str) -> None:
|
|
513
|
+
"""Apply the configured mode to the Delta table at ``path`` (storage-neutral). ``descr``
|
|
514
|
+
names the target in the mode='error' message. Shared by saveAsTable and save."""
|
|
440
515
|
session = self._df.session
|
|
441
|
-
schema, table = session.resolve(name)
|
|
442
|
-
path = session.table_path(schema, table)
|
|
443
516
|
so = session.storage_options
|
|
444
517
|
|
|
445
518
|
mode = self._mode
|
|
446
519
|
if mode in ("error", "errorifexists"):
|
|
447
520
|
if engine.table_exists(path, so):
|
|
448
521
|
raise ValueError(
|
|
449
|
-
f"
|
|
522
|
+
f"{descr} already exists (mode='error'). "
|
|
450
523
|
f"Use mode('overwrite'), mode('append'), mode('safeappend'), or mode('ignore')."
|
|
451
524
|
)
|
|
452
525
|
mode = "overwrite"
|
|
@@ -487,6 +560,22 @@ class DataFrameWriter:
|
|
|
487
560
|
storage_options=so,
|
|
488
561
|
compaction_threshold=session.compaction_threshold,
|
|
489
562
|
)
|
|
563
|
+
|
|
564
|
+
def save(self, path: str) -> str:
|
|
565
|
+
"""Spark ``df.write.save(path)`` — write to a Delta table by PATH, not catalog name.
|
|
566
|
+
|
|
567
|
+
Storage-neutral (local / s3:// / gs:// / az:// / abfss://). Unlike :meth:`saveAsTable`,
|
|
568
|
+
the result is addressed only by ``path`` — there is no schema.table name to register a
|
|
569
|
+
view for — so it is read back with ``conn.read.delta(path)`` / ``delta_scan('<path>')``,
|
|
570
|
+
not as an unqualified table. Returns ``path``."""
|
|
571
|
+
self._write(path, f"delta table at '{path}'")
|
|
572
|
+
return path
|
|
573
|
+
|
|
574
|
+
def saveAsTable(self, name: str) -> str:
|
|
575
|
+
session = self._df.session
|
|
576
|
+
schema, table = session.resolve(name)
|
|
577
|
+
path = session.table_path(schema, table)
|
|
578
|
+
self._write(path, f"table '{schema}.{table}'")
|
|
490
579
|
# Surface the (new or grown) table immediately — no manual refresh() needed.
|
|
491
580
|
session.con.execute(f"CREATE SCHEMA IF NOT EXISTS {_qid(schema)}")
|
|
492
581
|
session._register_view(schema, table)
|
|
@@ -573,4 +662,5 @@ def connect(path: str, storage_options: Optional[Dict[str, str]] = None,
|
|
|
573
662
|
>>> conn.sql("SHOW TABLES").show()
|
|
574
663
|
>>> conn.sql("select * from orders").write.mode("overwrite").saveAsTable("orders_copy")
|
|
575
664
|
"""
|
|
665
|
+
check_runtime_versions() # fail loud if Fabric's stale duckdb/deltalake are still loaded
|
|
576
666
|
return DuckSession(path, storage_options, schema, compaction_threshold)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.19
|
|
4
4
|
Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -12,7 +12,7 @@ Description-Content-Type: text/markdown
|
|
|
12
12
|
License-File: LICENSE
|
|
13
13
|
Requires-Dist: dbt-duckdb>=1.8
|
|
14
14
|
Requires-Dist: dbt-core<2.0,>=1.8
|
|
15
|
-
Requires-Dist: duckdb
|
|
15
|
+
Requires-Dist: duckdb<1.6.0,>=1.5.4
|
|
16
16
|
Requires-Dist: deltalake<1.5.1,>=1.5.0
|
|
17
17
|
Requires-Dist: requests
|
|
18
18
|
Provides-Extra: local
|
|
@@ -35,9 +35,13 @@ Dynamic: license-file
|
|
|
35
35
|
> not affiliated with, endorsed by, or supported by any employer or vendor. No warranty —
|
|
36
36
|
> use it at your own risk.
|
|
37
37
|
|
|
38
|
-
**duckrun**
|
|
39
|
-
**
|
|
40
|
-
|
|
38
|
+
**duckrun** runs SQL in [DuckDB](https://duckdb.org/) and writes
|
|
39
|
+
[**Delta Lake**](https://delta-io.github.io/delta-rs/) via delta_rs. It gives you:
|
|
40
|
+
|
|
41
|
+
- a [**dbt**](https://www.getdbt.com/) adapter that materializes models as Delta tables;
|
|
42
|
+
- a **`connect()`** helper to write Delta straight from SQL in a notebook;
|
|
43
|
+
- **full snapshot isolation** from read to write — concurrent writers fail loud, never interleave.
|
|
44
|
+
|
|
41
45
|
duckrun itself is just glue — it owns none of the heavy lifting. The real work is done
|
|
42
46
|
by **DuckDB** (executes the SQL), **delta-rs** (writes the Delta table), **Arrow** (the
|
|
43
47
|
zero-copy (kind of) bridge that hands query results from DuckDB to delta-rs), and **dbt** (orchestrates
|
|
@@ -67,6 +71,21 @@ pip install duckrun
|
|
|
67
71
|
|
|
68
72
|
That single install pulls in `dbt-duckdb` (and therefore `duckdb`) plus `deltalake`.
|
|
69
73
|
|
|
74
|
+
### In a Microsoft Fabric Python notebook
|
|
75
|
+
|
|
76
|
+
duckrun needs `duckdb` ≥ 1.5.4 — the release where `delta_scan` gained its `version => N`
|
|
77
|
+
parameter, which duckrun uses for snapshot-pinned reads. Fabric notebooks ship a **stable**
|
|
78
|
+
`duckdb` release, which trails the newest one, so the `duckdb` already loaded in the kernel may
|
|
79
|
+
predate 1.5.4. Upgrade, then restart the Python kernel so the new version loads.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
!pip install duckrun --upgrade
|
|
83
|
+
notebookutils.session.restartPython()
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
If you skip the restart, duckrun fails loud at `connect()` (and on `dbt run`) and tells you to
|
|
87
|
+
restart — it won't quietly run on the older `duckdb`/`deltalake` still bound in the kernel.
|
|
88
|
+
|
|
70
89
|
## Configure your profile
|
|
71
90
|
|
|
72
91
|
```yaml
|
|
@@ -79,12 +98,22 @@ my_project:
|
|
|
79
98
|
# No `threads:` needed — duckrun always runs single-threaded.
|
|
80
99
|
# DuckDB runs in-memory by default — the Delta tables are the only state.
|
|
81
100
|
# Default Delta location for models that don't set config(location=...).
|
|
82
|
-
|
|
101
|
+
# OneLake — address by GUID, not friendly names (see "OneLake: use GUID paths for now" below):
|
|
102
|
+
root_path: "abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables"
|
|
103
|
+
# Or any other store: './warehouse' (local), 's3://...', 'gs://...'.
|
|
83
104
|
# storage_options: {} # passed through to deltalake for remote stores
|
|
84
105
|
```
|
|
85
106
|
|
|
86
107
|
Persisted models are written to `<root_path>/<schema>/<model>` (e.g.
|
|
87
|
-
|
|
108
|
+
`.../Tables/dbo/orders`), or to an explicit `config(location=...)`.
|
|
109
|
+
|
|
110
|
+
### OneLake: use GUID paths for now
|
|
111
|
+
|
|
112
|
+
Address OneLake tables by **workspace GUID + lakehouse GUID**, not friendly names —
|
|
113
|
+
`abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables/...`. This
|
|
114
|
+
sidesteps a `duckdb-delta` read bug ("No files in log segment") that is **already fixed
|
|
115
|
+
upstream but still rolling out to production OneLake**. Friendly-name paths will work again once
|
|
116
|
+
the fix finishes deploying.
|
|
88
117
|
|
|
89
118
|
### Fabric Lakehouse without a schema
|
|
90
119
|
|
|
@@ -95,7 +124,7 @@ let the schema fill that slot:
|
|
|
95
124
|
|
|
96
125
|
```yaml
|
|
97
126
|
schema: Tables
|
|
98
|
-
root_path: "abfss://<
|
|
127
|
+
root_path: "abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>"
|
|
99
128
|
```
|
|
100
129
|
|
|
101
130
|
Since models are written to `<root_path>/<schema>/<model>`, this lands them at
|
|
@@ -309,7 +338,7 @@ unchanged since the call, else raises `CommitFailedError`.
|
|
|
309
338
|
|
|
310
339
|
```python
|
|
311
340
|
import duckrun
|
|
312
|
-
conn = duckrun.connect("abfss
|
|
341
|
+
conn = duckrun.connect("abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Tables/dbo")
|
|
313
342
|
conn.sql("select * from orders").write.mode("overwrite").saveAsTable("orders_copy")
|
|
314
343
|
conn.table("orders_copy").show()
|
|
315
344
|
|
|
@@ -355,6 +384,32 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
|
|
|
355
384
|
runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
|
|
356
385
|
in those docs are rendered live by CI, so they always reflect the latest `main`.
|
|
357
386
|
|
|
387
|
+
## Limitations
|
|
388
|
+
|
|
389
|
+
These are core design trade-offs, not bugs — they're inherent to gluing DuckDB to delta_rs and
|
|
390
|
+
won't be "fixed" away:
|
|
391
|
+
|
|
392
|
+
- **A single dbt run is single-threaded — but concurrency works fine.** This is purely a dbt-adapter
|
|
393
|
+
implementation detail: *within one dbt process* models run with `threads: 1`, because the
|
|
394
|
+
in-process delta_rs write path isn't thread-safe (parallel writes to a table in the *same* process
|
|
395
|
+
collide). It is **not** a limit on concurrent writers. Multiple independent writers — separate dbt
|
|
396
|
+
runs, notebooks, jobs, whatever — writing the same tables at the same time is fully supported and
|
|
397
|
+
safe: every write uses optimistic concurrency (snapshot-pinned MERGE, `safeappend` compare-and-swap,
|
|
398
|
+
fail-loud on a conflicting commit). So you can absolutely run many writers in parallel; you just
|
|
399
|
+
can't multi-thread the models *inside a single* dbt invocation.
|
|
400
|
+
- **Two engines share one machine's memory.** DuckDB executes the SQL and delta_rs materializes the
|
|
401
|
+
Delta table — two separate memory systems in the same process, each with its own pool. Under heavy
|
|
402
|
+
memory pressure (large merges especially) the budget has to be split between them, and getting that
|
|
403
|
+
split right is fragile: delta_rs's merge spill-to-disk is itself flaky, and coordinating two
|
|
404
|
+
systems that don't know about each other's allocations is the hard, unavoidable part of this design.
|
|
405
|
+
- **`DROP TABLE` is a soft tombstone, not a physical delete.** delta_rs has no `DROP`, and removing the
|
|
406
|
+
Delta files directly would be a filesystem hack that fails on object stores — so `conn.sql("drop
|
|
407
|
+
table x")` overwrites the table with a one-column tombstone marker and unregisters it. The table
|
|
408
|
+
vanishes from `conn.catalog` and discovery, and a later `create table x as …` revives the path with
|
|
409
|
+
real data, but the **files are not reclaimed** (they must be purged manually). One consequence: reading the
|
|
410
|
+
path *directly* (`conn.read.delta("…/x")`) bypasses discovery and returns the one-row tombstone
|
|
411
|
+
marker rather than erroring — address dropped tables by name, not by path.
|
|
412
|
+
|
|
358
413
|
## License
|
|
359
414
|
|
|
360
415
|
MIT
|
|
@@ -21,6 +21,7 @@ dbt/include/duckrun/macros/materializations/incremental.sql
|
|
|
21
21
|
dbt/include/duckrun/macros/materializations/snapshot.sql
|
|
22
22
|
dbt/include/duckrun/macros/materializations/table.sql
|
|
23
23
|
duckrun/__init__.py
|
|
24
|
+
duckrun/_runtime.py
|
|
24
25
|
duckrun/auth.py
|
|
25
26
|
duckrun/delta_table.py
|
|
26
27
|
duckrun/session.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.3.
|
|
7
|
+
version = "0.3.19"
|
|
8
8
|
description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -21,23 +21,22 @@ dependencies = [
|
|
|
21
21
|
# fails at `from dbt.cli.main import dbtRunner`. Declared directly because we only depend on
|
|
22
22
|
# dbt-core transitively, so the ceiling has to live here to bite.
|
|
23
23
|
"dbt-core>=1.8,<2.0",
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
# duckdb
|
|
27
|
-
#
|
|
28
|
-
#
|
|
29
|
-
# 1.
|
|
30
|
-
#
|
|
31
|
-
#
|
|
32
|
-
#
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
# abfss paths, and is fixed upstream.
|
|
24
|
+
# duckdb floor is 1.5.4 with a <1.6.0 cap (a floor, NOT an exact pin): 1.5.4 is the first
|
|
25
|
+
# stable build whose bundled duckdb-delta extension supports `delta_scan('...', version => N)`
|
|
26
|
+
# (duckdb-delta #312) — the version-pinned read this project relies on to make the incremental
|
|
27
|
+
# read and the write commit resolve at ONE Delta snapshot (Spark single-snapshot MERGE parity;
|
|
28
|
+
# see the staging-read pin in _delta_core.sql and merge_delta's read_version). Stable 1.5.x
|
|
29
|
+
# patches above the floor are fine. The <1.6.0 cap matters because this project resolves with
|
|
30
|
+
# pip --pre (to pick up dbt pre-releases) and --pre is global: an open floor would let pip pull
|
|
31
|
+
# an unstable duckdb *prerelease* (verified: "--pre duckdb>=1.5.4" resolves to 1.6.0.dev12).
|
|
32
|
+
# Per PEP 440, "<1.6.0" excludes 1.6.0 AND its prereleases (1.6.0.devN), so no dev build slips
|
|
33
|
+
# in. The earlier 1.5.2+ "No files in log segment" read regression is avoided by addressing
|
|
34
|
+
# OneLake tables via GUID (workspace_id/lakehouse_id) abfss paths, and is fixed upstream.
|
|
36
35
|
# deltalake floor stays 1.5.0 (not just a ceiling): 1.5.0 is the first release with MERGE
|
|
37
36
|
# disk-spill config (max_spill_size), which engine.merge_delta relies on to cap the merge's RAM
|
|
38
37
|
# and avoid OOM on large upserts; the matching <1.5.1 ceiling avoids the deltalake delta-log
|
|
39
38
|
# write-side regression, pinning exactly 1.5.0.
|
|
40
|
-
"duckdb
|
|
39
|
+
"duckdb>=1.5.4,<1.6.0",
|
|
41
40
|
"deltalake>=1.5.0,<1.5.1",
|
|
42
41
|
# The top-level connection API (duckrun.connect) discovers OneLake tables via the DFS REST
|
|
43
42
|
# API directly; requests is otherwise only a transitive dbt dependency.
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
version = "0.3.17.dev7"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/_delta_core.sql
RENAMED
|
File without changes
|
{duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/delta.sql
RENAMED
|
File without changes
|
{duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/incremental.sql
RENAMED
|
File without changes
|
{duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/snapshot.sql
RENAMED
|
File without changes
|
{duckrun-0.3.17.dev7 → duckrun-0.3.19}/dbt/include/duckrun/macros/materializations/table.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|