duckrun 0.3.17.dev3__tar.gz → 0.3.17.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.3.17.dev3/duckrun.egg-info → duckrun-0.3.17.dev4}/PKG-INFO +11 -7
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/README.md +10 -6
- duckrun-0.3.17.dev4/dbt/adapters/duckrun/__version__.py +1 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/delta_dml.py +187 -27
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/session.py +61 -15
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4/duckrun.egg-info}/PKG-INFO +11 -7
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/pyproject.toml +1 -1
- duckrun-0.3.17.dev3/dbt/adapters/duckrun/__version__.py +0 -1
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/LICENSE +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/MANIFEST.in +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/credentials.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/delta_plugin.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/engine.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/environment.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/impl.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/remote.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/secret.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/dbt_project.yml +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/catalog.sql +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/snapshot.sql +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/auth.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/delta_table.py +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.3.17.dev3
|
|
3
|
+
Version: 0.3.17.dev4
|
|
4
4
|
Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
|
|
|
291
291
|
## Connection API (notebook)
|
|
292
292
|
|
|
293
293
|
Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
|
|
294
|
-
interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)`
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
294
|
+
interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
|
|
295
|
+
travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
|
|
296
|
+
`update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
|
|
297
|
+
`CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
|
|
298
|
+
delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
|
|
299
|
+
write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
|
|
300
|
+
`overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
|
|
301
|
+
/ `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
|
|
302
|
+
plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
|
|
299
303
|
|
|
300
304
|
`merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
|
|
301
305
|
the target version is captured and the commit is validated against it, so a concurrent writer fails
|
|
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
|
|
|
347
351
|
|
|
348
352
|
**Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
|
|
349
353
|
`tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
|
|
350
|
-
[`
|
|
354
|
+
[`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
|
|
351
355
|
runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
|
|
352
356
|
in those docs are rendered live by CI, so they always reflect the latest `main`.
|
|
353
357
|
|
|
@@ -262,11 +262,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
|
|
|
262
262
|
## Connection API (notebook)
|
|
263
263
|
|
|
264
264
|
Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
|
|
265
|
-
interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)`
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
265
|
+
interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
|
|
266
|
+
travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
|
|
267
|
+
`update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
|
|
268
|
+
`CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
|
|
269
|
+
delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
|
|
270
|
+
write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
|
|
271
|
+
`overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
|
|
272
|
+
/ `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
|
|
273
|
+
plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
|
|
270
274
|
|
|
271
275
|
`merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
|
|
272
276
|
the target version is captured and the commit is validated against it, so a concurrent writer fails
|
|
@@ -318,7 +322,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
|
|
|
318
322
|
|
|
319
323
|
**Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
|
|
320
324
|
`tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
|
|
321
|
-
[`
|
|
325
|
+
[`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
|
|
322
326
|
runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
|
|
323
327
|
in those docs are rendered live by CI, so they always reflect the latest `main`.
|
|
324
328
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version = "0.3.17.dev4"
|
|
@@ -21,11 +21,33 @@ object stores. The directory persists until a human purges it; a later ``create
|
|
|
21
21
|
overwrites the tombstone with real data and the table is live again.
|
|
22
22
|
|
|
23
23
|
The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
|
|
24
|
-
on a native DuckDB table, not a Delta table: ``create table (<col defs>)``
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
24
|
+
on a native DuckDB table, not a Delta table: bare ``create table (<col defs>)`` becomes a Delta
|
|
25
|
+
table only when a ``default_schema`` is set (the connection API), and the dbt/cursor path passes
|
|
26
|
+
None — so the seed's table stays native. ``insert ... values`` *does* match a form here, but the
|
|
27
|
+
mutate guard only applies it when a Delta table already exists at the target — the seed's native
|
|
28
|
+
table has none, so it falls through untouched. duckrun's own materializations emit ``create ...
|
|
29
|
+
view`` (not ``table``), so they pass through too.
|
|
30
|
+
|
|
31
|
+
Supported / unsupported (what reaches delta_rs):
|
|
32
|
+
|
|
33
|
+
create [or replace] table x [if not exists] as <query> Delta overwrite (query: select/with/(…))
|
|
34
|
+
create table x (<col defs>) empty Delta table (connection API only)
|
|
35
|
+
create temp/temporary table … native DuckDB (pass through) ── invariant:
|
|
36
|
+
create view … native DuckDB (pass through) ── only TEMP
|
|
37
|
+
and VIEW are
|
|
38
|
+
native; every
|
|
39
|
+
other CREATE
|
|
40
|
+
TABLE is Delta
|
|
41
|
+
insert into x [(cols)] select … Delta append (projected onto target schema)
|
|
42
|
+
insert into x [(cols)] values … Delta append (projected onto target schema)
|
|
43
|
+
[with …] insert into x select … Delta append (CTE re-attached to the body)
|
|
44
|
+
delete from x [where …] delta_rs delete
|
|
45
|
+
update x set … [where …] delta_rs update
|
|
46
|
+
alter table x add column … Delta overwrite (widen schema)
|
|
47
|
+
drop table x tombstone (no data deleted)
|
|
48
|
+
merge … / update … from / delete … using / multi-stmt NOT handled here — the connection API
|
|
49
|
+
(session.sql) rejects them with a clear
|
|
50
|
+
error; the dbt path never emits them.
|
|
29
51
|
"""
|
|
30
52
|
import re
|
|
31
53
|
from typing import List, Optional, Tuple
|
|
@@ -58,12 +80,24 @@ def is_dropped(con, location: str, storage_options=None) -> bool:
|
|
|
58
80
|
return False
|
|
59
81
|
|
|
60
82
|
# --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
|
|
83
|
+
# `create [or replace] table [if not exists] <rel> as <query>`. The body is ANY query text (a bare
|
|
84
|
+
# `select …`, a `with … select …` CTE, or a parenthesised `(select …)`); it's handed to DuckDB
|
|
85
|
+
# verbatim so anything DuckDB accepts after `as` works.
|
|
61
86
|
_CREATE_AS = re.compile(
|
|
62
|
-
r"\s*create\s+
|
|
87
|
+
r"\s*create\s+(?P<orrep>or\s+replace\s+)?table\s+(?P<ine>if\s+not\s+exists\s+)?"
|
|
88
|
+
r"(?P<rel>.+?)\s+as\s+(?P<body>.+)",
|
|
89
|
+
re.I | re.S,
|
|
90
|
+
)
|
|
91
|
+
# `create [or replace] table [if not exists] <rel> (<col defs>)` — no `as`. Connection-API only
|
|
92
|
+
# (see _create_coldefs): materializes an EMPTY Delta table so `CREATE TABLE` is always Delta-backed.
|
|
93
|
+
_CREATE_COLDEFS = re.compile(
|
|
94
|
+
r"\s*create\s+(?:or\s+replace\s+)?table\s+(?:if\s+not\s+exists\s+)?"
|
|
95
|
+
r"(?P<rel>.+?)\s*\((?P<defs>.+)\)\s*;?\s*",
|
|
63
96
|
re.I | re.S,
|
|
64
97
|
)
|
|
65
98
|
_INSERT_SELECT = re.compile(
|
|
66
|
-
r"\s*insert\s+into\s+(?P<rel>.+?)\s+(?P<body>select\b.*)",
|
|
99
|
+
r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s+(?P<body>select\b.*)",
|
|
100
|
+
re.I | re.S,
|
|
67
101
|
)
|
|
68
102
|
_INSERT_VALUES = re.compile(
|
|
69
103
|
r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
|
|
@@ -83,6 +117,67 @@ _ALTER_ADD = re.compile(
|
|
|
83
117
|
_DROP = re.compile(
|
|
84
118
|
r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
|
|
85
119
|
)
|
|
120
|
+
# `create temp/temporary table …` is DuckDB-local scratch by design and must NEVER be captured —
|
|
121
|
+
# checked first in try_handle so it always passes through to native DuckDB (the invariant: only
|
|
122
|
+
# CREATE TEMP TABLE is native; every other CREATE TABLE is Delta-backed).
|
|
123
|
+
_CREATE_TEMP_RE = re.compile(r"\s*create\s+(?:or\s+replace\s+)?(?:temp|temporary)\b", re.I)
|
|
124
|
+
# CTE/whitespace handling: a leading `with …` block followed by a top-level INSERT/UPDATE/DELETE.
|
|
125
|
+
# leading `\b` is load-bearing: _find_top_level tries this at every depth-0 index, so without it the
|
|
126
|
+
# verb would match inside an identifier (e.g. `update` within `last_update`).
|
|
127
|
+
_LEADING_WITH = re.compile(r"\s*with\b", re.I)
|
|
128
|
+
_DRIVING_DML = re.compile(r"\b(?:insert\s+into|update|delete\s+from)\b", re.I)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _strip_leading(query: str) -> str:
|
|
132
|
+
"""Drop leading whitespace and ``--`` / ``/* */`` comments so the first keyword is visible."""
|
|
133
|
+
s = query
|
|
134
|
+
while True:
|
|
135
|
+
t = s.lstrip()
|
|
136
|
+
if t.startswith("--"):
|
|
137
|
+
nl = t.find("\n")
|
|
138
|
+
s = "" if nl == -1 else t[nl + 1:]
|
|
139
|
+
elif t.startswith("/*"):
|
|
140
|
+
end = t.find("*/")
|
|
141
|
+
s = "" if end == -1 else t[end + 2:]
|
|
142
|
+
else:
|
|
143
|
+
return t
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _find_top_level(s: str, pattern) -> int:
|
|
147
|
+
"""Index of the first ``pattern`` match at paren-depth 0 and outside quotes, else -1.
|
|
148
|
+
|
|
149
|
+
Lets us tell a top-level clause (the ``FROM`` of ``UPDATE … FROM``, the verb after a leading
|
|
150
|
+
``WITH``) from the same keyword nested in a subquery, without a full SQL parser."""
|
|
151
|
+
depth, quote, i, n = 0, None, 0, len(s)
|
|
152
|
+
while i < n:
|
|
153
|
+
ch = s[i]
|
|
154
|
+
if quote:
|
|
155
|
+
if ch == quote:
|
|
156
|
+
quote = None
|
|
157
|
+
elif ch in ("'", '"'):
|
|
158
|
+
quote = ch
|
|
159
|
+
elif ch in "([":
|
|
160
|
+
depth += 1
|
|
161
|
+
elif ch in ")]":
|
|
162
|
+
depth -= 1
|
|
163
|
+
elif depth == 0 and pattern.match(s, i):
|
|
164
|
+
return i
|
|
165
|
+
i += 1
|
|
166
|
+
return -1
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _split_leading_with(sql: str) -> Tuple[str, str]:
    """Split ``WITH … <INSERT/UPDATE/DELETE> …`` into ``(with_clause, remainder)``.

    ``remainder`` starts at the driving DML verb, which is what the statement matchers anchor on;
    ``with_clause`` is everything before it with trailing whitespace removed, so the CTE can be
    re-attached when the body is evaluated. When ``sql`` does not start with ``WITH``, or no
    top-level DML verb follows it (a ``WITH`` driving a plain ``SELECT`` read), the statement is
    returned untouched as ``("", sql)``.
    """
    untouched = ("", sql)
    if _LEADING_WITH.match(sql) is None:
        return untouched
    verb_at = _find_top_level(sql, _DRIVING_DML)
    if verb_at > 0:
        cte, statement = sql[:verb_at], sql[verb_at:]
        return cte.rstrip(), statement
    # -1 (no verb found) or 0 (nothing precedes the verb): nothing to peel off.
    return untouched
|
|
86
181
|
|
|
87
182
|
|
|
88
183
|
def _fullmatch(pattern, sql):
|
|
@@ -127,6 +222,7 @@ class _DeltaDML:
|
|
|
127
222
|
self.root_path = root_path.rstrip("/")
|
|
128
223
|
self.so = storage_options
|
|
129
224
|
self.default_schema = default_schema
|
|
225
|
+
self._with_clause = "" # a leading `WITH …` preceding an INSERT, prepended to the body
|
|
130
226
|
|
|
131
227
|
def _loc(self, schema: str, identifier: str) -> str:
|
|
132
228
|
return f"{self.root_path}/{schema}/{identifier}"
|
|
@@ -152,9 +248,15 @@ class _DeltaDML:
|
|
|
152
248
|
)
|
|
153
249
|
|
|
154
250
|
def try_handle(self, sql: str) -> bool:
|
|
251
|
+
# CREATE TEMP/TEMPORARY TABLE is native DuckDB scratch by design — never capture it.
|
|
252
|
+
if _CREATE_TEMP_RE.match(sql):
|
|
253
|
+
return False
|
|
155
254
|
m = _fullmatch(_CREATE_AS, sql)
|
|
156
255
|
if m and "__duckrun" not in m.group("rel"):
|
|
157
256
|
return self._create_as(m)
|
|
257
|
+
m = _fullmatch(_CREATE_COLDEFS, sql)
|
|
258
|
+
if m and "__duckrun" not in m.group("rel"):
|
|
259
|
+
return self._create_coldefs(m)
|
|
158
260
|
m = _fullmatch(_INSERT_SELECT, sql)
|
|
159
261
|
if m:
|
|
160
262
|
return self._mutate(m, self._insert_select)
|
|
@@ -175,12 +277,26 @@ class _DeltaDML:
|
|
|
175
277
|
return self._drop(m)
|
|
176
278
|
return False
|
|
177
279
|
|
|
178
|
-
# -- create table <rel> as <
|
|
280
|
+
# -- create table <rel> as <query>: always materialize as a duckrun Delta table ------------
|
|
179
281
|
def _create_as(self, m) -> bool:
|
|
180
282
|
rel = m.group("rel").strip()
|
|
181
283
|
schema, identifier, loc = self._resolve(rel)
|
|
182
284
|
if not loc:
|
|
183
285
|
return False
|
|
286
|
+
# dbt/cursor path (no default_schema): keep the ORIGINAL narrow interception — only a plain
|
|
287
|
+
# `create table … as select …` routes to Delta. The wider forms (`or replace`, a CTE or a
|
|
288
|
+
# parenthesised body) are a connection-API affordance; on the dbt path they must stay native
|
|
289
|
+
# so dbt keeps owning the relation. dbt-internal CTAS like store_failures' `create table … as
|
|
290
|
+
# (select …)` is a real TABLE dbt later drops/recreates — turning it into a delta_scan VIEW
|
|
291
|
+
# breaks that ("Existing object … is of type View, trying to drop type Table").
|
|
292
|
+
if self.default_schema is None and (
|
|
293
|
+
m.group("orrep") or not re.match(r"select\b", m.group("body").lstrip(), re.I)
|
|
294
|
+
):
|
|
295
|
+
return False
|
|
296
|
+
# `if not exists` over a live (non-tombstone) table is a no-op — just (re)surface the view.
|
|
297
|
+
if m.group("ine") and self._exists(loc) and not is_dropped(self.cursor, loc, self.so):
|
|
298
|
+
self._refresh_view(rel, schema, loc)
|
|
299
|
+
return True
|
|
184
300
|
data = self.cursor.sql(m.group("body"))
|
|
185
301
|
# overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
|
|
186
302
|
# table is recreated with the real schema, clearing any tombstone marker.
|
|
@@ -188,12 +304,38 @@ class _DeltaDML:
|
|
|
188
304
|
self._refresh_view(rel, schema, loc)
|
|
189
305
|
return True
|
|
190
306
|
|
|
307
|
+
# -- create table <rel> (<col defs>): an EMPTY Delta table (connection API only) -----------
|
|
308
|
+
def _create_coldefs(self, m) -> bool:
|
|
309
|
+
# Only the connection API (which carries a current database) makes a bare `CREATE TABLE
|
|
310
|
+
# (col defs)` a Delta table — so `CREATE TABLE` is always Delta-backed there. The dbt/cursor
|
|
311
|
+
# path passes default_schema=None: the seed loader emits this exact form and RELIES on it
|
|
312
|
+
# landing as a native DuckDB table, so we pass through untouched.
|
|
313
|
+
if self.default_schema is None:
|
|
314
|
+
return False
|
|
315
|
+
rel = m.group("rel").strip()
|
|
316
|
+
schema, identifier, loc = self._resolve(rel)
|
|
317
|
+
if not loc:
|
|
318
|
+
return False
|
|
319
|
+
# Let DuckDB parse the column defs (types, constraints, nested parens) by building the table
|
|
320
|
+
# as a TEMP, then take a 0-row typed relation from it and write that as an empty Delta table.
|
|
321
|
+
tmp = f"__duckrun_empty_{abs(hash((schema, identifier))) & 0xFFFFFFFF}"
|
|
322
|
+
self.cursor.execute(f'create or replace temp table "{tmp}" ({m.group("defs")})')
|
|
323
|
+
try:
|
|
324
|
+
empty = self.cursor.sql(f'select * from "{tmp}" limit 0')
|
|
325
|
+
engine.write_delta(loc, empty, "overwrite", overwrite_schema=True, storage_options=self.so)
|
|
326
|
+
finally:
|
|
327
|
+
self.cursor.execute(f'drop table if exists "{tmp}"')
|
|
328
|
+
self._refresh_view(rel, schema, loc)
|
|
329
|
+
return True
|
|
330
|
+
|
|
191
331
|
# -- forms that only apply when a Delta table already exists at the target ------------------
|
|
192
332
|
def _mutate(self, m, op) -> bool:
|
|
193
333
|
rel = m.group("rel").strip()
|
|
194
334
|
schema, identifier, loc = self._resolve(rel)
|
|
195
335
|
if not loc or not self._exists(loc):
|
|
196
336
|
return False # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
|
|
337
|
+
if self._with_clause and op != self._insert_select:
|
|
338
|
+
return False # `WITH … UPDATE/DELETE` can't be expressed through a delta_rs predicate
|
|
197
339
|
op(m, rel, schema, loc)
|
|
198
340
|
self._refresh_view(rel, schema, loc)
|
|
199
341
|
return True
|
|
@@ -213,34 +355,48 @@ class _DeltaDML:
|
|
|
213
355
|
)
|
|
214
356
|
|
|
215
357
|
def _insert_select(self, m, rel, schema, loc) -> None:
|
|
216
|
-
|
|
217
|
-
|
|
358
|
+
body = m.group("body")
|
|
359
|
+
if self._with_clause: # `WITH … INSERT INTO t SELECT …`: re-attach the CTE to the body
|
|
360
|
+
body = f"{self._with_clause} {body}"
|
|
361
|
+
cols = m.group("cols")
|
|
362
|
+
if cols: # `insert into t (a, b) select …` → project the query onto the named columns
|
|
363
|
+
self._append_projected(loc, self._provided(cols), f"({body})")
|
|
364
|
+
else: # column count/order already matches the target → append as-is
|
|
365
|
+
engine.write_delta(loc, self.cursor.sql(body), "append", storage_options=self.so)
|
|
218
366
|
|
|
219
367
|
def _insert_values(self, m, rel, schema, loc) -> None:
|
|
220
|
-
# `insert into <rel> [(<cols>)] values (...)`:
|
|
221
|
-
#
|
|
222
|
-
|
|
368
|
+
# `insert into <rel> [(<cols>)] values (...)`: the literals supply every target column when
|
|
369
|
+
# no list is given, in order; otherwise the named columns.
|
|
370
|
+
cols = m.group("cols")
|
|
371
|
+
provided = self._provided(cols) if cols else None
|
|
372
|
+
self._append_projected(loc, provided, f"(values {m.group('body')})")
|
|
373
|
+
|
|
374
|
+
@staticmethod
|
|
375
|
+
def _provided(cols: str) -> List[str]:
|
|
376
|
+
return [c.strip().strip('"') for c in cols.split(",")]
|
|
377
|
+
|
|
378
|
+
def _append_projected(self, loc, provided, derived: str) -> None:
|
|
379
|
+
"""Append a ``derived`` table (a ``(values …)`` tuple list or a ``(select …)`` subquery) to
|
|
380
|
+
the Delta table at ``loc``, projecting its columns onto the FULL target schema: supplied
|
|
381
|
+
columns come from ``derived`` (positional when ``provided`` is None), any unsupplied target
|
|
382
|
+
column is a typed NULL, and every projected column is cast to the target column's type so
|
|
383
|
+
the appended Arrow schema matches the table exactly (what a plain SQL INSERT does, and it
|
|
384
|
+
stops a literal wider than the column from forcing delta_rs to add a new writer feature on
|
|
385
|
+
append)."""
|
|
223
386
|
loc_sql = loc.replace("'", "''")
|
|
224
387
|
template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
|
|
225
388
|
target_cols = list(template.columns)
|
|
226
389
|
target_types = [str(t) for t in template.types]
|
|
227
390
|
by_lower = {c.lower(): c for c in target_cols}
|
|
228
391
|
|
|
229
|
-
|
|
230
|
-
if cols: # explicit column list → canonicalize to the target's casing
|
|
231
|
-
provided = [by_lower.get(c.strip().strip('"').lower(), c.strip().strip('"'))
|
|
232
|
-
for c in cols.split(",")]
|
|
233
|
-
else: # positional → the literals supply every target column, in order
|
|
392
|
+
if provided is None: # positional → every target column, in order
|
|
234
393
|
provided = target_cols
|
|
235
|
-
|
|
394
|
+
else: # explicit column list → canonicalize to the target's casing
|
|
395
|
+
provided = [by_lower.get(c.lower(), c) for c in provided]
|
|
396
|
+
provided_set = set(provided)
|
|
236
397
|
|
|
237
398
|
quoted = ", ".join('"' + c + '"' for c in provided)
|
|
238
|
-
inner = f"
|
|
239
|
-
# Cast every projected column to the TARGET column's type — both supplied values and the
|
|
240
|
-
# typed NULLs — so the appended Arrow schema matches the table exactly. This is also what a
|
|
241
|
-
# plain SQL INSERT does (a literal is coerced to the column type), and it stops a literal
|
|
242
|
-
# whose inferred type is wider than the column (e.g. a ::timestamp into a DATE column) from
|
|
243
|
-
# forcing delta_rs to add a new writer feature on append (TimestampWithoutTimezone).
|
|
399
|
+
inner = f"{derived} v({quoted})"
|
|
244
400
|
exprs = [
|
|
245
401
|
f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
|
|
246
402
|
else f'cast(null as {typ}) as "{col}"'
|
|
@@ -290,8 +446,12 @@ def handle(cursor, root_path, storage_options, sql: str, default_schema=None) ->
|
|
|
290
446
|
"""
|
|
291
447
|
if not root_path:
|
|
292
448
|
return False
|
|
449
|
+
sql = _strip_leading(sql) # so leading comments/whitespace don't hide the verb
|
|
450
|
+
with_clause, body = _split_leading_with(sql) # peel a leading `WITH …` off an INSERT/etc.
|
|
293
451
|
# Cheap pre-filter: only the candidate DML verbs.
|
|
294
|
-
head =
|
|
452
|
+
head = body[:7].lower()
|
|
295
453
|
if not head.startswith(("delete", "update", "insert", "create", "alter", "drop")):
|
|
296
454
|
return False
|
|
297
|
-
|
|
455
|
+
dml = _DeltaDML(cursor, root_path, storage_options, default_schema)
|
|
456
|
+
dml._with_clause = with_clause
|
|
457
|
+
return dml.try_handle(body)
|
|
@@ -27,20 +27,63 @@ _WRITE_KEYWORD_RE = re.compile(r"^(insert|update|delete|merge)\b", re.IGNORECASE
|
|
|
27
27
|
_CREATE_TABLE_RE = re.compile(r"^create\s+(or\s+replace\s+)?table\b", re.IGNORECASE)
|
|
28
28
|
_CREATE_TEMP_RE = re.compile(r"^create\s+(or\s+replace\s+)?(temp|temporary)\b", re.IGNORECASE)
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
30
|
+
# DML forms that genuinely can't be expressed through delta_rs (delta_dml.handle never applies them):
|
|
31
|
+
# rejected up front with a form-specific pointer rather than letting DuckDB raise a cryptic error on
|
|
32
|
+
# the read-only delta_scan view (or, for UPDATE … FROM, silently mangling the SET clause).
|
|
33
|
+
# leading `\b`: _find_top_level probes every depth-0 index (see delta_dml._find_top_level).
|
|
34
|
+
_TOP_FROM = re.compile(r"\bfrom\b", re.IGNORECASE)
|
|
35
|
+
_TOP_USING = re.compile(r"\busing\b", re.IGNORECASE)
|
|
36
|
+
_strip_leading = delta_dml._strip_leading # shared comment/whitespace stripper
|
|
37
|
+
|
|
38
|
+
_MERGE_MSG = (
|
|
39
|
+
"conn.sql() can't run a SQL MERGE via delta_rs. Use the Spark write API: "
|
|
40
|
+
"df.write.saveAsTable(...) to create/append, or "
|
|
41
|
+
"conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
|
|
42
|
+
)
|
|
43
|
+
_UPDATE_FROM_MSG = (
|
|
44
|
+
"conn.sql() can't run UPDATE … FROM via delta_rs. Rewrite the SET values as correlated "
|
|
45
|
+
"subqueries, or use conn.delta_table(name).update(...)/.merge(...)."
|
|
46
|
+
)
|
|
47
|
+
_DELETE_USING_MSG = (
|
|
48
|
+
"conn.sql() can't run DELETE … USING via delta_rs. Rewrite the predicate as a correlated "
|
|
49
|
+
"subquery (DELETE … WHERE … IN (SELECT …)), or use conn.delta_table(name).delete(...)/.merge(...)."
|
|
50
|
+
)
|
|
51
|
+
_MULTI_MSG = (
|
|
52
|
+
"conn.sql() runs one statement at a time — split the batch into separate conn.sql() calls."
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _unsupported_dml(query: str) -> Optional[str]:
    """Return the rejection message for a DML form that can't be routed to delta_rs, else None.

    Checked in order, after leading whitespace/comments are stripped: a statement starting
    with ``merge``; an ``update`` with a top-level ``FROM``; a ``delete`` with a top-level
    ``USING``; and a DML/DDL statement containing more than one statement.
    """
    s = _strip_leading(query)
    low = s.lower()
    if low.startswith("merge"):
        return _MERGE_MSG
    # (leading verb, top-level clause that makes the form inexpressible, message)
    clause_rules = (
        ("update", _TOP_FROM, _UPDATE_FROM_MSG),
        ("delete", _TOP_USING, _DELETE_USING_MSG),
    )
    for verb, clause, message in clause_rules:
        if low.startswith(verb) and delta_dml._find_top_level(s, clause) != -1:
            return message
    starts_with_dml = (
        re.match(r"(insert|update|delete|merge|create|alter|drop)\b", low) is not None
    )
    if starts_with_dml and _is_multi_statement(s):
        return _MULTI_MSG
    return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _is_multi_statement(s: str) -> bool:
|
|
72
|
+
"""True if ``s`` holds more than one statement (a top-level ``;`` with anything after it)."""
|
|
73
|
+
depth, quote = 0, None
|
|
74
|
+
for i, ch in enumerate(s):
|
|
75
|
+
if quote:
|
|
76
|
+
if ch == quote:
|
|
77
|
+
quote = None
|
|
78
|
+
elif ch in ("'", '"'):
|
|
79
|
+
quote = ch
|
|
80
|
+
elif ch in "([":
|
|
81
|
+
depth += 1
|
|
82
|
+
elif ch in ")]":
|
|
83
|
+
depth -= 1
|
|
84
|
+
elif ch == ";" and depth == 0 and s[i + 1:].strip():
|
|
85
|
+
return True
|
|
86
|
+
return False
|
|
44
87
|
|
|
45
88
|
|
|
46
89
|
def _is_delta_write(query: str) -> bool:
|
|
@@ -224,13 +267,16 @@ class DuckSession:
|
|
|
224
267
|
``conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()``.
|
|
225
268
|
``CREATE TEMP/VIEW`` and other DuckDB-local scratch DDL pass through to DuckDB.
|
|
226
269
|
"""
|
|
270
|
+
unsupported = _unsupported_dml(query)
|
|
271
|
+
if unsupported:
|
|
272
|
+
raise ValueError(unsupported)
|
|
227
273
|
if delta_dml.handle(self.con, self.root_path, self.storage_options, query,
|
|
228
274
|
default_schema=self._current_database):
|
|
229
275
|
self.refresh(quiet=True)
|
|
230
276
|
return DataFrame(self.con.sql("SELECT 'ok' AS status"), self)
|
|
231
277
|
if _is_delta_write(query):
|
|
232
278
|
raise ValueError(
|
|
233
|
-
"conn.sql() can't
|
|
279
|
+
"conn.sql() can't write a Delta table from raw SQL here. "
|
|
234
280
|
"Use the Spark write API: df.write.saveAsTable(...) to create/append, or "
|
|
235
281
|
"conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
|
|
236
282
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.3.17.dev3
|
|
3
|
+
Version: 0.3.17.dev4
|
|
4
4
|
Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
|
|
|
291
291
|
## Connection API (notebook)
|
|
292
292
|
|
|
293
293
|
Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
|
|
294
|
-
interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)`
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
294
|
+
interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
|
|
295
|
+
travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
|
|
296
|
+
`update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
|
|
297
|
+
`CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
|
|
298
|
+
delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
|
|
299
|
+
write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
|
|
300
|
+
`overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
|
|
301
|
+
/ `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
|
|
302
|
+
plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
|
|
299
303
|
|
|
300
304
|
`merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
|
|
301
305
|
the target version is captured and the commit is validated against it, so a concurrent writer fails
|
|
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
|
|
|
347
351
|
|
|
348
352
|
**Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
|
|
349
353
|
`tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
|
|
350
|
-
[`
|
|
354
|
+
[`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
|
|
351
355
|
runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
|
|
352
356
|
in those docs are rendered live by CI, so they always reflect the latest `main`.
|
|
353
357
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.3.17.dev3"
|
|
7
|
+
version = "0.3.17.dev4"
|
|
8
8
|
description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
version = "0.3.17.dev3"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/delta.sql
RENAMED
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/snapshot.sql
RENAMED
|
File without changes
|
{duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/table.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|