duckrun 0.3.17.dev3__tar.gz → 0.3.17.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {duckrun-0.3.17.dev3/duckrun.egg-info → duckrun-0.3.17.dev5}/PKG-INFO +11 -7
  2. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/README.md +10 -6
  3. duckrun-0.3.17.dev5/dbt/adapters/duckrun/__version__.py +1 -0
  4. duckrun-0.3.17.dev5/dbt/adapters/duckrun/delta_dml.py +480 -0
  5. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/engine.py +17 -0
  6. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun/session.py +61 -15
  7. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5/duckrun.egg-info}/PKG-INFO +11 -7
  8. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/pyproject.toml +1 -1
  9. duckrun-0.3.17.dev3/dbt/adapters/duckrun/__version__.py +0 -1
  10. duckrun-0.3.17.dev3/dbt/adapters/duckrun/delta_dml.py +0 -297
  11. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/LICENSE +0 -0
  12. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/MANIFEST.in +0 -0
  13. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/__init__.py +0 -0
  14. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/credentials.py +0 -0
  15. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/delta_plugin.py +0 -0
  16. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/environment.py +0 -0
  17. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/impl.py +0 -0
  18. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/remote.py +0 -0
  19. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/adapters/duckrun/secret.py +0 -0
  20. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/__init__.py +0 -0
  21. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/dbt_project.yml +0 -0
  22. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/macros/catalog.sql +0 -0
  23. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
  24. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
  25. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
  26. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/macros/materializations/snapshot.sql +0 -0
  27. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
  28. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun/__init__.py +0 -0
  29. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun/auth.py +0 -0
  30. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun/delta_table.py +0 -0
  31. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun.egg-info/SOURCES.txt +0 -0
  32. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun.egg-info/dependency_links.txt +0 -0
  33. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun.egg-info/requires.txt +0 -0
  34. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/duckrun.egg-info/top_level.txt +0 -0
  35. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev3
3
+ Version: 0.3.17.dev5
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
291
291
  ## Connection API (notebook)
292
292
 
293
293
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
294
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
295
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
296
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
297
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
298
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
294
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
295
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
296
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
297
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
298
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, multi-statement) are rejected with a pointer to the
299
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
300
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
301
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
302
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
299
303
 
300
304
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
301
305
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
347
351
 
348
352
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
349
353
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
350
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
354
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
351
355
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
352
356
  in those docs are rendered live by CI, so they always reflect the latest `main`.
353
357
 
@@ -262,11 +262,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
262
262
  ## Connection API (notebook)
263
263
 
264
264
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
265
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
266
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
267
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
268
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
269
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
265
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
266
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
267
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
268
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
269
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, multi-statement) are rejected with a pointer to the
270
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
271
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
272
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
273
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
270
274
 
271
275
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
272
276
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -318,7 +322,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
318
322
 
319
323
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
320
324
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
321
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
325
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
322
326
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
323
327
  in those docs are rendered live by CI, so they always reflect the latest `main`.
324
328
 
@@ -0,0 +1 @@
1
+ version = "0.3.17.dev5"
@@ -0,0 +1,480 @@
1
+ """Route raw SQL DML against duckrun-managed (Delta-backed) relations to delta_rs.
2
+
3
+ duckrun intercepts writes at the dbt *materialization* layer (a model/seed/snapshot goes through
4
+ the materialization macros -> store_relation -> delta_rs). But a duckrun relation is surfaced as a
5
+ read-only ``delta_scan`` view, so *raw* DML sent straight to the connection — ``delete from``,
6
+ ``update``, ``insert into ... select``, ``alter table ... add column``, ``create table ... as
7
+ select`` — lands on a view and fails ("Can only delete from base table"), or would create a native
8
+ DuckDB table that bypasses Delta entirely.
9
+
10
+ This module intercepts those statements (at the cursor, see environment.DuckrunCursorWrapper) and
11
+ applies them to the Delta table **via delta_rs only**, then refreshes the ``delta_scan`` view — so
12
+ nothing relies on a native, mutable DuckDB table, and every op works on local AND abfss/OneLake
13
+ stores (delta_rs carries ``storage_options``). ``create table ... as`` writes a new Delta table;
14
+ the mutate forms (delete/update/insert/alter) apply only when a Delta table already exists at the
15
+ target (otherwise the statement passes through — e.g. the test's native ``fact``/``seed``).
16
+
17
+ ``drop table`` unregisters the ``delta_scan`` view AND overwrites the table (via delta_rs) to a
18
+ one-column ``TOMBSTONE_COLUMN`` marker, which discovery recognizes and hides. It does NOT delete
19
+ data: delta_rs has no drop, and removing the Delta files would be a filesystem hack that fails on
20
+ object stores. The directory persists until a human purges it; a later ``create table ... as``
21
+ overwrites the tombstone with real data and the table is live again.
22
+
23
+ The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
24
+ on a native DuckDB table, not a Delta table: bare ``create table (<col defs>)`` becomes a Delta
25
+ table only when a ``default_schema`` is set (the connection API), and the dbt/cursor path passes
26
+ None — so the seed's table stays native. ``insert ... values`` *does* match a form here, but the
27
+ mutate guard only applies it when a Delta table already exists at the target — the seed's native
28
+ table has none, so it falls through untouched. duckrun's own materializations emit ``create ...
29
+ view`` (not ``table``), so they pass through too.
30
+
31
+ Supported / unsupported (what reaches delta_rs):
32
+
33
+ create [or replace] table x [if not exists] as <query> Delta CTAS (query: select/with/(…)); a
34
+ plain create errors if x is live, `or
35
+ replace` overwrites, `if not exists` no-ops
36
+ create [or replace] table x [if not exists] (<col defs>) empty Delta table (connection API only);
37
+ logs a CREATE TABLE op, same exists rules
38
+ create temp/temporary table … native DuckDB (pass through) ── invariant:
39
+ create view … native DuckDB (pass through) ── only TEMP
40
+ and VIEW are
41
+ native; every
42
+ other CREATE
43
+ TABLE is Delta
44
+ insert into x [(cols)] select … Delta append (projected onto target schema)
45
+ insert into x [(cols)] values … Delta append (projected onto target schema)
46
+ [with …] insert into x select … Delta append (CTE re-attached to the body)
47
+ delete from x [where …] delta_rs delete
48
+ update x set … [where …] delta_rs update
49
+ alter table x add column … Delta overwrite (widen schema)
50
+ drop table x tombstone (no data deleted)
51
+ merge … / update … from / delete … using / multi-stmt NOT handled here — the connection API
52
+ (session.sql) rejects them with a clear
53
+ error; the dbt path never emits them.
54
+ """
55
+ import re
56
+ from typing import List, Optional, Tuple
57
+
58
+ from . import engine
59
+
60
+ # `drop table` tombstone: a dropped relation is overwritten (via delta_rs) to a table whose ONLY
61
+ # column is this marker, so (a) discovery recognizes it as dropped and hides it, and (b) anyone who
62
+ # opens the files sees an obviously-not-a-real-table schema rather than a plausible empty table. No
63
+ # data is deleted — the directory stays until a human purges it; a later `create table ... as`
64
+ # overwrites the marker schema with real data and the table is live again.
65
+ TOMBSTONE_COLUMN = "__duckrun_deleted__"
66
+
67
+
68
def _columns_are_tombstone(colnames) -> bool:
    """Return True when ``colnames`` is exactly the single drop-tombstone marker column.

    Names are stringified and compared case-insensitively against ``TOMBSTONE_COLUMN``; any
    other column count or name yields False.
    """
    lowered = [str(name).lower() for name in colnames]
    return lowered == [TOMBSTONE_COLUMN]
70
+
71
+
72
def is_dropped(con, location: str, storage_options=None) -> bool:
    """Report whether the Delta table at ``location`` is a duckrun drop-tombstone.

    A tombstone is a table whose only column is the marker column. Discovery (dbt and the
    connection API) uses this to hide dropped tables. The check is best-effort: if the table
    cannot be opened or scanned for any reason, it is reported as 'not a tombstone' so normal
    handling deals with it. ``storage_options`` is accepted for signature parity and is not
    used by the probe itself.
    """
    # Single quotes are doubled so the location is a valid SQL string literal.
    escaped = str(location).replace("'", "''")
    probe = f"select * from delta_scan('{escaped}') limit 0"
    try:
        result = con.execute(probe)
        names = [column[0] for column in result.description]
        return _columns_are_tombstone(names)
    except Exception:
        return False
84
+
85
+ # --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
86
+ # `create [or replace] table [if not exists] <rel> as <query>`. The body is ANY query text (a bare
87
+ # `select …`, a `with … select …` CTE, or a parenthesised `(select …)`); it's handed to DuckDB
88
+ # verbatim so anything DuckDB accepts after `as` works.
89
+ _CREATE_AS = re.compile(
90
+ r"\s*create\s+(?P<orrep>or\s+replace\s+)?table\s+(?P<ine>if\s+not\s+exists\s+)?"
91
+ r"(?P<rel>.+?)\s+as\s+(?P<body>.+)",
92
+ re.I | re.S,
93
+ )
94
+ # `create [or replace] table [if not exists] <rel> (<col defs>)` — no `as`. Connection-API only
95
+ # (see _create_coldefs): materializes an EMPTY Delta table so `CREATE TABLE` is always Delta-backed.
96
+ _CREATE_COLDEFS = re.compile(
97
+ r"\s*create\s+(?P<orrep>or\s+replace\s+)?table\s+(?P<ine>if\s+not\s+exists\s+)?"
98
+ r"(?P<rel>.+?)\s*\((?P<defs>.+)\)\s*;?\s*",
99
+ re.I | re.S,
100
+ )
101
+ _INSERT_SELECT = re.compile(
102
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s+(?P<body>select\b.*)",
103
+ re.I | re.S,
104
+ )
105
+ _INSERT_VALUES = re.compile(
106
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
107
+ re.I | re.S,
108
+ )
109
+ _DELETE = re.compile(
110
+ r"\s*delete\s+from\s+(?P<rel>.+?)(?:\s+where\s+(?P<where>.+))?\s*;?\s*", re.I | re.S
111
+ )
112
+ _UPDATE = re.compile(
113
+ r"\s*update\s+(?P<rel>.+?)\s+set\s+(?P<set>.+?)(?:\s+where\s+(?P<where>.+?))?\s*;?\s*",
114
+ re.I | re.S,
115
+ )
116
+ _ALTER_ADD = re.compile(
117
+ r"\s*alter\s+table\s+(?P<rel>.+?)\s+add\s+column\s+(?P<col>\S+)\s+(?P<def>.+?)\s*;?\s*",
118
+ re.I | re.S,
119
+ )
120
+ _DROP = re.compile(
121
+ r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
122
+ )
123
+ # `create temp/temporary table …` is DuckDB-local scratch by design and must NEVER be captured —
124
+ # checked first in try_handle so it always passes through to native DuckDB (the invariant: only
125
+ # CREATE TEMP TABLE is native; every other CREATE TABLE is Delta-backed).
126
+ _CREATE_TEMP_RE = re.compile(r"\s*create\s+(?:or\s+replace\s+)?(?:temp|temporary)\b", re.I)
127
+ # CTE/whitespace handling: a leading `with …` block followed by a top-level INSERT/UPDATE/DELETE.
128
+ # The leading `\b` in _DRIVING_DML is load-bearing: _find_top_level tries the pattern at every
129
+ # depth-0 index, so without it the verb would match inside an identifier (e.g. `update` within `last_update`).
130
+ _LEADING_WITH = re.compile(r"\s*with\b", re.I)
131
+ _DRIVING_DML = re.compile(r"\b(?:insert\s+into|update|delete\s+from)\b", re.I)
132
+
133
+
134
def _strip_leading(query: str) -> str:
    """Remove leading whitespace and ``--`` / ``/* */`` comments to expose the first keyword.

    An unterminated comment (a ``--`` with no newline, or a ``/*`` with no ``*/``) consumes the
    rest of the text, so the result is an empty string.
    """
    rest = query.lstrip()
    while True:
        if rest.startswith("--"):
            # Line comment: keep only what follows the first newline (nothing if there is none).
            _, newline, tail = rest.partition("\n")
            rest = tail if newline else ""
        elif rest.startswith("/*"):
            # Block comment: keep only what follows the first closing marker.
            _, closer, tail = rest.partition("*/")
            rest = tail if closer else ""
        else:
            return rest
        rest = rest.lstrip()
147
+
148
+
149
+ def _find_top_level(s: str, pattern) -> int:
150
+ """Index of the first ``pattern`` match at paren-depth 0 and outside quotes, else -1.
151
+
152
+ Lets us tell a top-level clause (the ``FROM`` of ``UPDATE … FROM``, the verb after a leading
153
+ ``WITH``) from the same keyword nested in a subquery, without a full SQL parser."""
154
+ depth, quote, i, n = 0, None, 0, len(s)
155
+ while i < n:
156
+ ch = s[i]
157
+ if quote:
158
+ if ch == quote:
159
+ quote = None
160
+ elif ch in ("'", '"'):
161
+ quote = ch
162
+ elif ch in "([":
163
+ depth += 1
164
+ elif ch in ")]":
165
+ depth -= 1
166
+ elif depth == 0 and pattern.match(s, i):
167
+ return i
168
+ i += 1
169
+ return -1
170
+
171
+
172
def _split_leading_with(sql: str) -> Tuple[str, str]:
    """Split ``WITH … <INSERT/UPDATE/DELETE> …`` into ``(with_clause, remainder)``.

    Returns ``('', sql)`` when ``sql`` does not start with ``WITH`` or when no top-level
    driving DML verb follows it. A leading ``WITH`` that drives a plain ``SELECT`` (a read)
    is therefore left untouched. Peeling the CTE off lets the verb-anchored matchers see
    ``WITH c AS (…) INSERT INTO t SELECT … FROM c`` while the CTE text is kept for later.
    """
    if _LEADING_WITH.match(sql) is None:
        return "", sql
    cut = _find_top_level(sql, _DRIVING_DML)
    if cut > 0:
        return sql[:cut].rstrip(), sql[cut:]
    return "", sql
184
+
185
+
186
def _fullmatch(pattern, sql):
    """Full-match ``pattern`` against ``sql`` with surrounding whitespace removed."""
    trimmed = sql.strip()
    return pattern.fullmatch(trimmed)
188
+
189
+
190
def _split_relation(rel: str) -> Tuple[Optional[str], Optional[str]]:
    """Split a relation name into ``(schema, identifier)`` with surrounding quotes stripped.

    Accepts ``"db"."schema"."tbl"``, ``schema.tbl`` or ``tbl``. Only the last two parts are
    used; an unqualified name yields ``(None, identifier)``. Returns ``(None, None)`` when the
    final part is empty (e.g. an empty string or a trailing dot).

    Dots inside a double-quoted identifier are part of that identifier and do not separate
    parts, so ``"my.schema"."tbl"`` resolves to ``("my.schema", "tbl")``.
    """
    parts: List[str] = []
    current: List[str] = []
    in_quotes = False
    for ch in rel.strip():
        if ch == '"':
            # Track quoting so a dot inside "..." is not treated as a separator.
            in_quotes = not in_quotes
            current.append(ch)
        elif ch == "." and not in_quotes:
            parts.append("".join(current))
            current = []
        else:
            current.append(ch)
    parts.append("".join(current))

    parts = [p.strip().strip('"') for p in parts]
    if not parts[-1]:
        return None, None
    identifier = parts[-1]
    schema = parts[-2] if len(parts) >= 2 else None
    return schema, identifier
198
+
199
+
200
def _split_top_level_commas(s: str) -> List[str]:
    """Split ``s`` on commas that sit outside parentheses, brackets and quotes.

    A call such as ``left(email, 3)`` or a literal such as ``'x,y'`` therefore stays in one
    piece. Each piece is stripped and empty pieces are dropped.
    """
    pieces = []
    nesting = 0
    open_quote = None
    segment_start = 0
    for pos, char in enumerate(s):
        if open_quote is not None:
            # Inside a quoted run: only the matching quote character ends it.
            if char == open_quote:
                open_quote = None
            continue
        if char == "'" or char == '"':
            open_quote = char
        elif char == "(" or char == "[":
            nesting += 1
        elif char == ")" or char == "]":
            nesting -= 1
        elif char == "," and nesting == 0:
            pieces.append(s[segment_start:pos])
            segment_start = pos + 1
    pieces.append(s[segment_start:])
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]
218
+
219
+
220
class _DeltaDML:
    """One attempt to route a single SQL statement to delta_rs.

    ``try_handle()`` returns True when the statement was applied to the Delta table (the caller
    must then NOT also run it on DuckDB) and False when it should pass through to native DuckDB.

    Attributes:
        cursor: DuckDB cursor/connection used to evaluate query bodies and (re)create views.
        root_path: root under which tables live at ``<root>/<schema>/<identifier>``.
        so: storage options forwarded to every delta_rs call.
        default_schema: schema for unqualified names; None on the dbt/cursor path.
    """

    def __init__(self, cursor, root_path: str, storage_options, default_schema=None):
        self.cursor = cursor
        self.root_path = root_path.rstrip("/")
        self.so = storage_options
        self.default_schema = default_schema
        self._with_clause = ""  # a leading `WITH …` preceding an INSERT, prepended to the body

    def _loc(self, schema: str, identifier: str) -> str:
        """Storage location of ``schema.identifier`` under the root path."""
        return f"{self.root_path}/{schema}/{identifier}"

    def _resolve(self, rel: str):
        """(schema, identifier, location) for ``rel``, falling back to default_schema for an
        unqualified name (the connection API relies on a current database). (None, None, None) when
        no schema can be determined."""
        schema, identifier = _split_relation(rel)
        schema = schema or self.default_schema
        if not schema or not identifier:
            return None, None, None
        return schema, identifier, self._loc(schema, identifier)

    def _exists(self, loc: str) -> bool:
        """True when a Delta table exists at ``loc`` (delegates to ``engine.table_exists``)."""
        return engine.table_exists(loc, self.so)

    def _refresh_view(self, rel: str, schema: str, loc: str) -> None:
        """(Re)create the ``delta_scan`` view for ``rel`` so reads see the latest Delta state."""
        loc_sql = loc.replace("'", "''")
        self.cursor.execute(f'create schema if not exists "{schema}"')
        self.cursor.execute(
            f"create or replace view {rel} as select * from delta_scan('{loc_sql}')"
        )

    def try_handle(self, sql: str) -> bool:
        """Match ``sql`` against the supported forms, in order, and apply the first that fits.

        Returns False when nothing matches or when the matched handler declines (pass through).
        """
        # CREATE TEMP/TEMPORARY TABLE is native DuckDB scratch by design — never capture it.
        if _CREATE_TEMP_RE.match(sql):
            return False
        m = _fullmatch(_CREATE_AS, sql)
        if m and "__duckrun" not in m.group("rel"):
            return self._create_as(m)
        m = _fullmatch(_CREATE_COLDEFS, sql)
        if m and "__duckrun" not in m.group("rel"):
            return self._create_coldefs(m)
        m = _fullmatch(_INSERT_SELECT, sql)
        if m:
            return self._mutate(m, self._insert_select)
        m = _fullmatch(_INSERT_VALUES, sql)
        if m:
            return self._mutate(m, self._insert_values)
        m = _fullmatch(_DELETE, sql)
        if m:
            return self._mutate(m, self._delete)
        m = _fullmatch(_UPDATE, sql)
        if m:
            return self._mutate(m, self._update)
        m = _fullmatch(_ALTER_ADD, sql)
        if m:
            return self._mutate(m, self._alter_add)
        m = _fullmatch(_DROP, sql)
        if m:
            return self._drop(m)
        return False

    # -- create table <rel> as <query>: always materialize as a duckrun Delta table ------------
    def _create_as(self, m) -> bool:
        """Handle ``create [or replace] table [if not exists] <rel> as <query>``.

        Raises:
            ValueError: on the connection API, for a plain ``create table`` over a live table.
        """
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc:
            return False
        # dbt/cursor path (no default_schema): keep the ORIGINAL narrow interception — only a plain
        # `create table … as select …` routes to Delta. The wider forms (`or replace`, a CTE or a
        # parenthesised body) are a connection-API affordance; on the dbt path they must stay native
        # so dbt keeps owning the relation. dbt-internal CTAS like store_failures' `create table … as
        # (select …)` is a real TABLE dbt later drops/recreates — turning it into a delta_scan VIEW
        # breaks that ("Existing object … is of type View, trying to drop type Table").
        if self.default_schema is None and (
            m.group("orrep") or not re.match(r"select\b", m.group("body").lstrip(), re.I)
        ):
            return False
        live = self._exists(loc) and not is_dropped(self.cursor, loc, self.so)
        # `if not exists` over a live (non-tombstone) table is a no-op — just (re)surface the view.
        if m.group("ine") and live:
            self._refresh_view(rel, schema, loc)
            return True
        # Connection API: a plain `create table` must NOT silently clobber a live table — that's what
        # `or replace` is for. (The dbt/cursor path keeps overwriting: dbt owns idempotent re-runs.)
        if self.default_schema is not None and live and not m.group("orrep"):
            raise ValueError(
                f"table {schema}.{identifier} already exists — "
                f"use CREATE OR REPLACE TABLE to replace it"
            )
        data = self.cursor.sql(m.group("body"))
        # overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
        # table is recreated with the real schema, clearing any tombstone marker.
        engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
        self._refresh_view(rel, schema, loc)
        return True

    # -- create table <rel> (<col defs>): an EMPTY Delta table (connection API only) -----------
    def _create_coldefs(self, m) -> bool:
        """Handle ``create [or replace] table [if not exists] <rel> (<col defs>)``.

        Raises:
            ValueError: for a plain ``create table`` over a live table.
        """
        # Only the connection API (which carries a current database) makes a bare `CREATE TABLE
        # (col defs)` a Delta table — so `CREATE TABLE` is always Delta-backed there. The dbt/cursor
        # path passes default_schema=None: the seed loader emits this exact form and RELIES on it
        # landing as a native DuckDB table, so we pass through untouched.
        if self.default_schema is None:
            return False
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc:
            return False
        live = self._exists(loc) and not is_dropped(self.cursor, loc, self.so)
        if m.group("ine") and live:  # IF NOT EXISTS over a live table → no-op
            self._refresh_view(rel, schema, loc)
            return True
        if live and not m.group("orrep"):  # plain CREATE over a live table → error
            raise ValueError(
                f"table {schema}.{identifier} already exists — "
                f"use CREATE OR REPLACE TABLE to replace it"
            )
        # Let DuckDB parse the column defs (types, constraints, nested parens) by building the table
        # as a TEMP, then take its Arrow schema and create an EMPTY Delta table from it. DeltaTable.create
        # logs a CREATE TABLE operation (not a WRITE/Overwrite). A live table or a drop-tombstone already
        # has files at ``loc``, so it must be replaced (overwrite); otherwise create-if-absent (error).
        tmp = f"__duckrun_empty_{abs(hash((schema, identifier))) & 0xFFFFFFFF}"
        self.cursor.execute(f'create or replace temp table "{tmp}" ({m.group("defs")})')
        try:
            arrow_schema = self.cursor.sql(f'select * from "{tmp}" limit 0').arrow().schema
        finally:
            self.cursor.execute(f'drop table if exists "{tmp}"')
        mode = "overwrite" if self._exists(loc) else "error"
        engine.create_empty_delta(loc, arrow_schema, mode=mode, storage_options=self.so)
        self._refresh_view(rel, schema, loc)
        return True

    # -- forms that only apply when a Delta table already exists at the target ------------------
    def _mutate(self, m, op) -> bool:
        """Run ``op`` for a mutate form, but only when the target is an existing Delta table."""
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc or not self._exists(loc):
            return False  # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
        if self._with_clause and op != self._insert_select:
            return False  # `WITH … UPDATE/DELETE` can't be expressed through a delta_rs predicate
        op(m, rel, schema, loc)
        self._refresh_view(rel, schema, loc)
        return True

    def _delete(self, m, rel, schema, loc) -> None:
        """``delete from <rel> [where …]`` via delta_rs; no predicate deletes every row."""
        where = m.group("where")
        engine._delta_table(loc, self.so).delete(predicate=where.strip() if where else None)

    def _update(self, m, rel, schema, loc) -> None:
        """``update <rel> set … [where …]`` via delta_rs, one entry per top-level assignment."""
        updates = {}
        for assign in _split_top_level_commas(m.group("set")):
            # Split on the FIRST `=` only, so an expression containing `=` stays intact.
            col, _, expr = assign.partition("=")
            updates[col.strip().strip('"')] = expr.strip()
        where = m.group("where")
        engine._delta_table(loc, self.so).update(
            updates=updates, predicate=where.strip() if where else None
        )

    @staticmethod
    def _strip_terminator(body: str) -> str:
        """Drop a trailing ``;`` (and surrounding whitespace) from a captured statement body.

        The INSERT matchers capture everything up to the end of the statement, terminator
        included. The body is later embedded inside a parenthesised derived table, where a
        ``;`` before the closing paren is a syntax error, so it must be removed first.
        """
        return body.rstrip().rstrip(";").rstrip()

    def _insert_select(self, m, rel, schema, loc) -> None:
        """``insert into <rel> [(cols)] select …`` as a Delta append."""
        body = self._strip_terminator(m.group("body"))
        if self._with_clause:  # `WITH … INSERT INTO t SELECT …`: re-attach the CTE to the body
            body = f"{self._with_clause} {body}"
        cols = m.group("cols")
        if cols:  # `insert into t (a, b) select …` → project the query onto the named columns
            self._append_projected(loc, self._provided(cols), f"({body})")
        else:  # column count/order already matches the target → append as-is
            engine.write_delta(loc, self.cursor.sql(body), "append", storage_options=self.so)

    def _insert_values(self, m, rel, schema, loc) -> None:
        """``insert into <rel> [(cols)] values (...)`` as a Delta append."""
        # The literals supply every target column when no list is given, in order; otherwise the
        # named columns.
        cols = m.group("cols")
        provided = self._provided(cols) if cols else None
        body = self._strip_terminator(m.group("body"))
        self._append_projected(loc, provided, f"(values {body})")

    @staticmethod
    def _provided(cols: str) -> List[str]:
        """Parse an INSERT column list into names with whitespace and double quotes stripped."""
        return [c.strip().strip('"') for c in cols.split(",")]

    def _append_projected(self, loc, provided, derived: str) -> None:
        """Append a ``derived`` table (a ``(values …)`` tuple list or a ``(select …)`` subquery) to
        the Delta table at ``loc``, projecting its columns onto the FULL target schema: supplied
        columns come from ``derived`` (positional when ``provided`` is None), any unsupplied target
        column is a typed NULL, and every projected column is cast to the target column's type so
        the appended Arrow schema matches the table exactly (what a plain SQL INSERT does, and it
        stops a literal wider than the column from forcing delta_rs to add a new writer feature on
        append)."""
        loc_sql = loc.replace("'", "''")
        template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
        target_cols = list(template.columns)
        target_types = [str(t) for t in template.types]
        by_lower = {c.lower(): c for c in target_cols}

        if provided is None:  # positional → every target column, in order
            provided = target_cols
        else:  # explicit column list → canonicalize to the target's casing
            provided = [by_lower.get(c.lower(), c) for c in provided]
        provided_set = set(provided)

        quoted = ", ".join('"' + c + '"' for c in provided)
        inner = f"{derived} v({quoted})"
        exprs = [
            f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
            else f'cast(null as {typ}) as "{col}"'
            for col, typ in zip(target_cols, target_types)
        ]
        data = self.cursor.sql(f"select {', '.join(exprs)} from {inner}")
        engine.write_delta(loc, data, "append", storage_options=self.so)

    def _alter_add(self, m, rel, schema, loc) -> None:
        """``alter table <rel> add column <col> <type>`` by rewriting with a widened schema."""
        col = m.group("col").strip().strip('"')
        # Keep only the column type (drop any DEFAULT/NULL clause); add it as an all-null column by
        # rewriting the table with overwrite_schema so delta_rs accepts the widened schema.
        coltype = re.split(r"\s+default\b|\s+null\b", m.group("def"), flags=re.I)[0].strip() or "VARCHAR"
        loc_sql = loc.replace("'", "''")
        data = self.cursor.sql(
            f'select *, cast(null as {coltype}) as "{col}" from delta_scan(\'{loc_sql}\')'
        )
        engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)

    def _drop(self, m) -> bool:
        """``drop table [if exists] <rel>``: tombstone the Delta table and drop its view."""
        # `drop table` on a duckrun relation: unregister the delta_scan view AND, via delta_rs,
        # overwrite the table to a one-column tombstone (TOMBSTONE_COLUMN) so a later glob discovery
        # hides it. NO data is deleted — delta_rs has no drop, and removing the Delta files would be
        # a filesystem hack that fails on object stores. The directory persists until a human purges
        # it; a later `create table ... as` overwrites the tombstone with real data. If the relation
        # isn't a duckrun-managed Delta table, fall through and let DuckDB drop the native table.
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc or not self._exists(loc):
            return False
        tombstone = self.cursor.sql(f"select true as {TOMBSTONE_COLUMN}")
        engine.write_delta(loc, tombstone, "overwrite", overwrite_schema=True, storage_options=self.so)
        self.cursor.execute(f"drop view if exists {rel}")
        return True
457
+
458
+
459
def handle(cursor, root_path, storage_options, sql: str, default_schema=None) -> bool:
    """Apply ``sql`` to Delta when it is a DML form targeting a duckrun-managed relation.

    ``cursor`` evaluates any SELECT body and (re)creates the ``delta_scan`` view. Every handled
    form goes through delta_rs, which carries ``storage_options``. ``default_schema`` resolves
    an unqualified table name (the connection API has a current database; the dbt path renders
    fully-qualified names and passes None).

    Returns True if the statement was handled (the caller must NOT also run it on DuckDB) and
    False to pass through: no ``root_path``, an unrecognized statement, or (for the mutate
    forms) a target that is not a Delta table.
    """
    if not root_path:
        return False
    # Leading comments/whitespace would otherwise hide the verb.
    statement = _strip_leading(sql)
    # Peel a leading `WITH …` off an INSERT/UPDATE/DELETE so the verb is at the front.
    cte_prefix, remainder = _split_leading_with(statement)
    # Cheap pre-filter: only the candidate DML verbs go on to the regex matchers.
    candidate_verbs = ("delete", "update", "insert", "create", "alter", "drop")
    verb_window = remainder[:7].lower()
    if not verb_window.startswith(candidate_verbs):
        return False
    attempt = _DeltaDML(cursor, root_path, storage_options, default_schema)
    attempt._with_clause = cte_prefix
    return attempt.try_handle(remainder)
@@ -732,6 +732,23 @@ def write_delta(
732
732
  _maintain(dt, compaction_threshold)
733
733
 
734
734
 
735
def create_empty_delta(
    path: str,
    schema,
    *,
    mode: str = "error",
    storage_options: Optional[Dict[str, str]] = None,
) -> None:
    """Create an EMPTY Delta table (no data files) at ``path`` from an Arrow ``schema``.

    This backs the connection API's bare ``CREATE TABLE (col defs)``. Going through
    ``DeltaTable.create`` logs a ``CREATE TABLE`` operation instead of a ``WRITE``/``Overwrite``,
    which is what a create (as opposed to an overwrite) should record.

    ``mode`` is forwarded to delta-rs unchanged:
      * ``error``     - fail if the table already exists (the default),
      * ``overwrite`` - replace an existing table or drop-tombstone,
      * ``ignore``    - do nothing if the table already exists.

    ``storage_options`` is forwarded as-is to delta-rs.
    """
    DeltaTable.create(
        path,
        schema,
        mode=mode,
        storage_options=storage_options,
    )
750
+
751
+
735
752
  def append_if_unchanged(
736
753
  path: str,
737
754
  data,
@@ -27,20 +27,63 @@ _WRITE_KEYWORD_RE = re.compile(r"^(insert|update|delete|merge)\b", re.IGNORECASE
27
27
  _CREATE_TABLE_RE = re.compile(r"^create\s+(or\s+replace\s+)?table\b", re.IGNORECASE)
28
28
  _CREATE_TEMP_RE = re.compile(r"^create\s+(or\s+replace\s+)?(temp|temporary)\b", re.IGNORECASE)
29
29
 
30
-
31
- def _strip_leading(query: str) -> str:
32
- """Drop leading whitespace and ``--`` / ``/* */`` comments so the first keyword is visible."""
33
- s = query
34
- while True:
35
- t = s.lstrip()
36
- if t.startswith("--"):
37
- nl = t.find("\n")
38
- s = "" if nl == -1 else t[nl + 1:]
39
- elif t.startswith("/*"):
40
- end = t.find("*/")
41
- s = "" if end == -1 else t[end + 2:]
42
- else:
43
- return t
30
# Statement shapes that delta_rs cannot express, so delta_dml.handle never applies them. They are
# rejected up front with a form-specific pointer; otherwise DuckDB would raise a cryptic error on
# the read-only delta_scan view (or, for UPDATE FROM, the SET clause would be silently mangled).
# The patterns carry a leading `\b` because _find_top_level probes every depth-0 index
# (see delta_dml._find_top_level).
_TOP_FROM = re.compile(r"\bfrom\b", re.I)
_TOP_USING = re.compile(r"\busing\b", re.I)
_strip_leading = delta_dml._strip_leading  # shared comment/whitespace stripper

_MERGE_MSG = (
    "conn.sql() can't run a SQL MERGE via delta_rs. "
    "Use the Spark write API: df.write.saveAsTable(...) to create/append, "
    "or conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
)
_UPDATE_FROM_MSG = (
    "conn.sql() can't run UPDATE … FROM via delta_rs. "
    "Rewrite the SET values as correlated subqueries, "
    "or use conn.delta_table(name).update(...)/.merge(...)."
)
_DELETE_USING_MSG = (
    "conn.sql() can't run DELETE … USING via delta_rs. "
    "Rewrite the predicate as a correlated subquery (DELETE … WHERE … IN (SELECT …)), "
    "or use conn.delta_table(name).delete(...)/.merge(...)."
)
_MULTI_MSG = (
    "conn.sql() runs one statement at a time — "
    "split the batch into separate conn.sql() calls."
)
54
+
55
+
56
def _unsupported_dml(query: str) -> Optional[str]:
    """Return an error message if ``query`` is a DML form duckrun can't route to delta_rs.

    Returns None when the statement is not one of the rejected forms. The rejected forms, in the
    order they are checked:

    * ``MERGE``: always rejected.
    * a multi-statement batch whose first statement is a DML/DDL verb.
    * ``UPDATE … FROM`` (a top-level ``FROM`` in an UPDATE).
    * ``DELETE … USING`` (a top-level ``USING`` in a DELETE).
    """
    s = _strip_leading(query)
    low = s.lower()
    if low.startswith("merge"):
        return _MERGE_MSG
    # Detect a batch BEFORE probing for FROM/USING. The top-level probe is handed the whole text,
    # so a FROM/USING belonging to a *later* statement (e.g. `update t set a = 1; select * from u`)
    # would otherwise be reported as "UPDATE … FROM" and send the user to the wrong fix.
    if re.match(r"(insert|update|delete|merge|create|alter|drop)\b", low) and _is_multi_statement(s):
        return _MULTI_MSG
    if low.startswith("update") and delta_dml._find_top_level(s, _TOP_FROM) != -1:
        return _UPDATE_FROM_MSG
    if low.startswith("delete") and delta_dml._find_top_level(s, _TOP_USING) != -1:
        return _DELETE_USING_MSG
    return None
69
+
70
+
71
+ def _is_multi_statement(s: str) -> bool:
72
+ """True if ``s`` holds more than one statement (a top-level ``;`` with anything after it)."""
73
+ depth, quote = 0, None
74
+ for i, ch in enumerate(s):
75
+ if quote:
76
+ if ch == quote:
77
+ quote = None
78
+ elif ch in ("'", '"'):
79
+ quote = ch
80
+ elif ch in "([":
81
+ depth += 1
82
+ elif ch in ")]":
83
+ depth -= 1
84
+ elif ch == ";" and depth == 0 and s[i + 1:].strip():
85
+ return True
86
+ return False
44
87
 
45
88
 
46
89
  def _is_delta_write(query: str) -> bool:
@@ -224,13 +267,16 @@ class DuckSession:
224
267
  ``conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()``.
225
268
  ``CREATE TEMP/VIEW`` and other DuckDB-local scratch DDL pass through to DuckDB.
226
269
  """
270
+ unsupported = _unsupported_dml(query)
271
+ if unsupported:
272
+ raise ValueError(unsupported)
227
273
  if delta_dml.handle(self.con, self.root_path, self.storage_options, query,
228
274
  default_schema=self._current_database):
229
275
  self.refresh(quiet=True)
230
276
  return DataFrame(self.con.sql("SELECT 'ok' AS status"), self)
231
277
  if _is_delta_write(query):
232
278
  raise ValueError(
233
- "conn.sql() can't run a SQL MERGE via delta_rs. "
279
+ "conn.sql() can't write a Delta table from raw SQL here. "
234
280
  "Use the Spark write API: df.write.saveAsTable(...) to create/append, or "
235
281
  "conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
236
282
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev3
3
+ Version: 0.3.17.dev5
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
291
291
  ## Connection API (notebook)
292
292
 
293
293
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
294
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
295
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
296
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
297
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
298
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
294
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
295
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
296
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
297
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
298
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
299
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
300
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
301
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
302
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
299
303
 
300
304
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
301
305
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
347
351
 
348
352
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
349
353
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
350
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
354
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
351
355
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
352
356
  in those docs are rendered live by CI, so they always reflect the latest `main`.
353
357
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.3.17.dev3"
7
+ version = "0.3.17.dev5"
8
8
  description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -1 +0,0 @@
1
- version = "0.3.17.dev3"
@@ -1,297 +0,0 @@
1
- """Route raw SQL DML against duckrun-managed (Delta-backed) relations to delta_rs.
2
-
3
- duckrun intercepts writes at the dbt *materialization* layer (a model/seed/snapshot goes through
4
- the materialization macros -> store_relation -> delta_rs). But a duckrun relation is surfaced as a
5
- read-only ``delta_scan`` view, so *raw* DML sent straight to the connection — ``delete from``,
6
- ``update``, ``insert into ... select``, ``alter table ... add column``, ``create table ... as
7
- select`` — lands on a view and fails ("Can only delete from base table"), or would create a native
8
- DuckDB table that bypasses Delta entirely.
9
-
10
- This module intercepts those statements (at the cursor, see environment.DuckrunCursorWrapper) and
11
- applies them to the Delta table **via delta_rs only**, then refreshes the ``delta_scan`` view — so
12
- nothing relies on a native, mutable DuckDB table, and every op works on local AND abfss/OneLake
13
- stores (delta_rs carries ``storage_options``). ``create table ... as`` writes a new Delta table;
14
- the mutate forms (delete/update/insert/alter) apply only when a Delta table already exists at the
15
- target (otherwise the statement passes through — e.g. the test's native ``fact``/``seed``).
16
-
17
- ``drop table`` unregisters the ``delta_scan`` view AND overwrites the table (via delta_rs) to a
18
- one-column ``TOMBSTONE_COLUMN`` marker, which discovery recognizes and hides. It does NOT delete
19
- data: delta_rs has no drop, and removing the Delta files would be a filesystem hack that fails on
20
- object stores. The directory persists until a human purges it; a later ``create table ... as``
21
- overwrites the tombstone with real data and the table is live again.
22
-
23
- The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
24
- on a native DuckDB table, not a Delta table: ``create table (<col defs>)`` doesn't match the
25
- ``... as select`` form, and while ``insert ... values`` now *does* match a form here, the mutate
26
- guard only applies it when a Delta table already exists at the target — the seed's native table has
27
- none, so it falls through untouched. duckrun's own materializations emit ``create ... view`` (not
28
- ``table``), so they pass through too.
29
- """
30
- import re
31
- from typing import List, Optional, Tuple
32
-
33
- from . import engine
34
-
35
- # `drop table` tombstone: a dropped relation is overwritten (via delta_rs) to a table whose ONLY
36
- # column is this marker, so (a) discovery recognizes it as dropped and hides it, and (b) anyone who
37
- # opens the files sees an obviously-not-a-real-table schema rather than a plausible empty table. No
38
- # data is deleted — the directory stays until a human purges it; a later `create table ... as`
39
- # overwrites the marker schema with real data and the table is live again.
40
- TOMBSTONE_COLUMN = "__duckrun_deleted__"
41
-
42
-
43
- def _columns_are_tombstone(colnames) -> bool:
44
- return [str(c).lower() for c in colnames] == [TOMBSTONE_COLUMN]
45
-
46
-
47
def is_dropped(con, location: str, storage_options=None) -> bool:
    """Report whether the Delta table at ``location`` is a duckrun drop-tombstone.

    A tombstone is a table whose only column is the marker column. Discovery (dbt + connection
    API) uses this to hide dropped tables. The check is best-effort: if the table can't be
    opened or scanned for any reason the answer is False, leaving normal handling to deal
    with it. ``storage_options`` is accepted but not used by this probe.
    """
    escaped = str(location).replace("'", "''")
    probe = f"select * from delta_scan('{escaped}') limit 0"
    try:
        # `limit 0` fetches no rows; only the result's column description is inspected.
        result = con.execute(probe)
        names = [column[0] for column in result.description]
        return _columns_are_tombstone(names)
    except Exception:
        return False
59
-
60
# --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
# Every pattern is applied with fullmatch on the stripped statement (see _fullmatch), is
# case-insensitive (re.I), and lets `.` span newlines (re.S). `rel` is always the target relation.

# create table [if not exists] <rel> as select …   -> groups: rel, body (the SELECT to the end).
_CREATE_AS = re.compile(
    r"\s*create\s+table\s+(?:if\s+not\s+exists\s+)?(?P<rel>.+?)\s+as\s+(?P<body>select\b.*)",
    re.I | re.S,
)
# insert into <rel> select …   -> groups: rel, body (the SELECT to the end).
_INSERT_SELECT = re.compile(
    r"\s*insert\s+into\s+(?P<rel>.+?)\s+(?P<body>select\b.*)", re.I | re.S
)
# insert into <rel> [(<cols>)] values (…)   -> groups: rel, cols (optional), body (from the
# first opening parenthesis after VALUES to the end).
_INSERT_VALUES = re.compile(
    r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
    re.I | re.S,
)
# delete from <rel> [where <where>] [;]   -> groups: rel, where (optional).
_DELETE = re.compile(
    r"\s*delete\s+from\s+(?P<rel>.+?)(?:\s+where\s+(?P<where>.+))?\s*;?\s*", re.I | re.S
)
# update <rel> set <set> [where <where>] [;]   -> groups: rel, set, where (optional).
_UPDATE = re.compile(
    r"\s*update\s+(?P<rel>.+?)\s+set\s+(?P<set>.+?)(?:\s+where\s+(?P<where>.+?))?\s*;?\s*",
    re.I | re.S,
)
# alter table <rel> add column <col> <def> [;]   -> groups: rel, col (one whitespace-free
# token), def (the remainder: the type plus any trailing clauses).
_ALTER_ADD = re.compile(
    r"\s*alter\s+table\s+(?P<rel>.+?)\s+add\s+column\s+(?P<col>\S+)\s+(?P<def>.+?)\s*;?\s*",
    re.I | re.S,
)
# drop table [if exists] <rel> [;]   -> group: rel (a single token with no whitespace or `;`).
_DROP = re.compile(
    r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
)
86
-
87
-
88
- def _fullmatch(pattern, sql):
89
- return pattern.fullmatch(sql.strip())
90
-
91
-
92
- def _split_relation(rel: str) -> Tuple[Optional[str], Optional[str]]:
93
- """`"db"."schema"."tbl"` / `schema.tbl` / `tbl` -> (schema, identifier), quotes stripped."""
94
- parts = [p.strip().strip('"') for p in rel.strip().split(".")]
95
- if not parts or not parts[-1]:
96
- return None, None
97
- identifier = parts[-1]
98
- schema = parts[-2] if len(parts) >= 2 else None
99
- return schema, identifier
100
-
101
-
102
- def _split_top_level_commas(s: str) -> List[str]:
103
- """Split on commas that aren't inside parentheses or quotes (so ``left(email, 3)`` stays whole)."""
104
- out, depth, start, quote = [], 0, 0, None
105
- for i, ch in enumerate(s):
106
- if quote:
107
- if ch == quote:
108
- quote = None
109
- elif ch in ("'", '"'):
110
- quote = ch
111
- elif ch in "([":
112
- depth += 1
113
- elif ch in ")]":
114
- depth -= 1
115
- elif ch == "," and depth == 0:
116
- out.append(s[start:i])
117
- start = i + 1
118
- out.append(s[start:])
119
- return [p.strip() for p in out if p.strip()]
120
-
121
-
122
class _DeltaDML:
    """One attempt to handle a statement; ``try_handle()`` returns True if it was applied to Delta.

    Holds the cursor used to evaluate SQL and manage views, the root path under which tables
    live as ``<root>/<schema>/<identifier>``, the storage options forwarded to the engine calls,
    and the schema used for unqualified relation names.
    """

    def __init__(self, cursor, root_path: str, storage_options, default_schema=None):
        self.cursor = cursor
        # Trailing slashes are removed so _loc can join with a single "/".
        self.root_path = root_path.rstrip("/")
        self.so = storage_options
        self.default_schema = default_schema

    def _loc(self, schema: str, identifier: str) -> str:
        """Storage location of ``schema.identifier`` under the root path."""
        return f"{self.root_path}/{schema}/{identifier}"

    def _resolve(self, rel: str):
        """(schema, identifier, location) for ``rel``, falling back to default_schema for an
        unqualified name (the connection API relies on a current database). (None, None, None) when
        no schema can be determined."""
        schema, identifier = _split_relation(rel)
        schema = schema or self.default_schema
        if not schema or not identifier:
            return None, None, None
        return schema, identifier, self._loc(schema, identifier)

    def _exists(self, loc: str) -> bool:
        """Whether the engine reports a table at ``loc``."""
        return engine.table_exists(loc, self.so)

    def _refresh_view(self, rel: str, schema: str, loc: str) -> None:
        """Ensure ``schema`` exists and (re)point the view ``rel`` at ``delta_scan(loc)``."""
        # Single quotes are doubled so the location is a valid SQL string literal.
        loc_sql = loc.replace("'", "''")
        self.cursor.execute(f'create schema if not exists "{schema}"')
        self.cursor.execute(
            f"create or replace view {rel} as select * from delta_scan('{loc_sql}')"
        )

    def try_handle(self, sql: str) -> bool:
        """Try each statement matcher in turn and dispatch the first one that matches.

        Returns the handler's result, or False when no matcher applies.
        """
        m = _fullmatch(_CREATE_AS, sql)
        # A relation whose text contains "__duckrun" is not materialized here; the statement
        # then falls through to the remaining matchers.
        if m and "__duckrun" not in m.group("rel"):
            return self._create_as(m)
        m = _fullmatch(_INSERT_SELECT, sql)
        if m:
            return self._mutate(m, self._insert_select)
        m = _fullmatch(_INSERT_VALUES, sql)
        if m:
            return self._mutate(m, self._insert_values)
        m = _fullmatch(_DELETE, sql)
        if m:
            return self._mutate(m, self._delete)
        m = _fullmatch(_UPDATE, sql)
        if m:
            return self._mutate(m, self._update)
        m = _fullmatch(_ALTER_ADD, sql)
        if m:
            return self._mutate(m, self._alter_add)
        m = _fullmatch(_DROP, sql)
        if m:
            return self._drop(m)
        return False

    # -- create table <rel> as <select>: always materialize as a duckrun Delta table -----------
    def _create_as(self, m) -> bool:
        """Evaluate the SELECT body and write it to the relation's location as an overwrite.

        Returns False (pass through) only when no location can be resolved for the relation.
        """
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc:
            return False
        data = self.cursor.sql(m.group("body"))
        # overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
        # table is recreated with the real schema, clearing any tombstone marker.
        engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
        self._refresh_view(rel, schema, loc)
        return True

    # -- forms that only apply when a Delta table already exists at the target ------------------
    def _mutate(self, m, op) -> bool:
        """Run ``op(m, rel, schema, loc)`` if a table exists at the target, then refresh the view.

        Returns False without calling ``op`` when the relation can't be resolved or no table
        exists at its location.
        """
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc or not self._exists(loc):
            return False  # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
        op(m, rel, schema, loc)
        self._refresh_view(rel, schema, loc)
        return True

    def _delete(self, m, rel, schema, loc) -> None:
        """Delete rows, passing the stripped WHERE text as the predicate (None without WHERE)."""
        where = m.group("where")
        engine._delta_table(loc, self.so).delete(predicate=where.strip() if where else None)

    def _update(self, m, rel, schema, loc) -> None:
        """Apply the SET assignments, passing the stripped WHERE text as the predicate."""
        updates = {}
        for assign in _split_top_level_commas(m.group("set")):
            # Split at the FIRST "=": the left side is the column, the rest is the expression.
            col, _, expr = assign.partition("=")
            updates[col.strip().strip('"')] = expr.strip()
        where = m.group("where")
        engine._delta_table(loc, self.so).update(
            updates=updates, predicate=where.strip() if where else None
        )

    def _insert_select(self, m, rel, schema, loc) -> None:
        """Evaluate the SELECT body through the cursor and append the result to the table."""
        data = self.cursor.sql(m.group("body"))
        engine.write_delta(loc, data, "append", storage_options=self.so)

    def _insert_values(self, m, rel, schema, loc) -> None:
        # `insert into <rel> [(<cols>)] values (...)`: evaluate the VALUES tuples through DuckDB and
        # project them onto the FULL target Delta schema (so append schemas match) — supplied columns
        # come from the literals, any unsupplied target column is filled with a typed NULL.
        loc_sql = loc.replace("'", "''")
        # Zero-row scan: used only to read the target's column names and types.
        template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
        target_cols = list(template.columns)
        target_types = [str(t) for t in template.types]
        by_lower = {c.lower(): c for c in target_cols}

        cols = m.group("cols")
        if cols:  # explicit column list → canonicalize to the target's casing
            provided = [by_lower.get(c.strip().strip('"').lower(), c.strip().strip('"'))
                        for c in cols.split(",")]
        else:  # positional → the literals supply every target column, in order
            provided = target_cols
        provided_set = {c for c in provided}

        quoted = ", ".join('"' + c + '"' for c in provided)
        inner = f"(values {m.group('body')}) v({quoted})"
        # Cast every projected column to the TARGET column's type — both supplied values and the
        # typed NULLs — so the appended Arrow schema matches the table exactly. This is also what a
        # plain SQL INSERT does (a literal is coerced to the column type), and it stops a literal
        # whose inferred type is wider than the column (e.g. a ::timestamp into a DATE column) from
        # forcing delta_rs to add a new writer feature on append (TimestampWithoutTimezone).
        exprs = [
            f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
            else f'cast(null as {typ}) as "{col}"'
            for col, typ in zip(target_cols, target_types)
        ]
        data = self.cursor.sql(f"select {', '.join(exprs)} from {inner}")
        engine.write_delta(loc, data, "append", storage_options=self.so)

    def _alter_add(self, m, rel, schema, loc) -> None:
        """Add a column by rewriting the table with an extra all-NULL column of the given type."""
        col = m.group("col").strip().strip('"')
        # Keep only the column type (drop any DEFAULT/NULL clause); add it as an all-null column by
        # rewriting the table with overwrite_schema so delta_rs accepts the widened schema.
        coltype = re.split(r"\s+default\b|\s+null\b", m.group("def"), flags=re.I)[0].strip() or "VARCHAR"
        loc_sql = loc.replace("'", "''")
        data = self.cursor.sql(
            f'select *, cast(null as {coltype}) as "{col}" from delta_scan(\'{loc_sql}\')'
        )
        engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)

    def _drop(self, m) -> bool:
        # `drop table` on a duckrun relation: unregister the delta_scan view AND, via delta_rs,
        # overwrite the table to a one-column tombstone (TOMBSTONE_COLUMN) so a later glob discovery
        # hides it. NO data is deleted — delta_rs has no drop, and removing the Delta files would be
        # a filesystem hack that fails on object stores. The directory persists until a human purges
        # it; a later `create table ... as` overwrites the tombstone with real data. If the relation
        # isn't a duckrun-managed Delta table, fall through and let DuckDB drop the native table.
        rel = m.group("rel").strip()
        schema, identifier, loc = self._resolve(rel)
        if not loc or not self._exists(loc):
            return False
        tombstone = self.cursor.sql(f"select true as {TOMBSTONE_COLUMN}")
        engine.write_delta(loc, tombstone, "overwrite", overwrite_schema=True, storage_options=self.so)
        self.cursor.execute(f"drop view if exists {rel}")
        return True
278
-
279
-
280
def handle(cursor, root_path, storage_options, sql: str, default_schema=None) -> bool:
    """Route ``sql`` to delta_rs when it is a DML form aimed at a duckrun-managed relation.

    ``cursor`` is used to evaluate any SELECT body and to (re)create the ``delta_scan`` view.
    Handled forms go through delta_rs (``engine.write_delta`` / ``DeltaTable.delete`` /
    ``.update``), which carries ``storage_options`` and so works on local AND abfss/OneLake
    stores. ``default_schema`` resolves an unqualified table name: the connection API has a
    current database, while the dbt path always renders fully-qualified names and passes None.

    Returns True when the statement was handled (the caller must NOT also run it on DuckDB) and
    False to pass it through: no ``root_path``, an unrecognized statement, or (for the mutate
    forms) a target that isn't a Delta table.
    """
    candidate_verbs = ("delete", "update", "insert", "create", "alter", "drop")
    # Cheap pre-filter: the longest verb is six characters, so a seven-character window of the
    # left-trimmed statement is enough to recognise every candidate.
    if root_path and sql.lstrip()[:7].lower().startswith(candidate_verbs):
        return _DeltaDML(cursor, root_path, storage_options, default_schema).try_handle(sql)
    return False
File without changes
File without changes
File without changes