duckrun 0.3.17.dev2__tar.gz → 0.3.17.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {duckrun-0.3.17.dev2/duckrun.egg-info → duckrun-0.3.17.dev4}/PKG-INFO +11 -7
  2. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/README.md +10 -6
  3. duckrun-0.3.17.dev4/dbt/adapters/duckrun/__version__.py +1 -0
  4. duckrun-0.3.17.dev4/dbt/adapters/duckrun/delta_dml.py +457 -0
  5. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/delta_plugin.py +37 -4
  6. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/engine.py +31 -0
  7. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/environment.py +43 -1
  8. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/impl.py +85 -0
  9. duckrun-0.3.17.dev4/dbt/include/duckrun/macros/catalog.sql +122 -0
  10. duckrun-0.3.17.dev4/dbt/include/duckrun/macros/materializations/snapshot.sql +144 -0
  11. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun/session.py +85 -26
  12. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4/duckrun.egg-info}/PKG-INFO +11 -7
  13. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun.egg-info/SOURCES.txt +2 -0
  14. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/pyproject.toml +1 -1
  15. duckrun-0.3.17.dev2/dbt/adapters/duckrun/__version__.py +0 -1
  16. duckrun-0.3.17.dev2/dbt/include/duckrun/macros/catalog.sql +0 -59
  17. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/LICENSE +0 -0
  18. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/MANIFEST.in +0 -0
  19. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/__init__.py +0 -0
  20. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/credentials.py +0 -0
  21. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/remote.py +0 -0
  22. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/secret.py +0 -0
  23. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/include/duckrun/__init__.py +0 -0
  24. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/include/duckrun/dbt_project.yml +0 -0
  25. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
  26. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
  27. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
  28. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
  29. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun/__init__.py +0 -0
  30. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun/auth.py +0 -0
  31. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun/delta_table.py +0 -0
  32. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
  33. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun.egg-info/requires.txt +0 -0
  34. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/duckrun.egg-info/top_level.txt +0 -0
  35. {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev2
3
+ Version: 0.3.17.dev4
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
291
291
  ## Connection API (notebook)
292
292
 
293
293
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
294
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
295
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
296
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
297
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
298
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
294
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
295
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
296
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
297
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
298
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, multi-statement) are rejected with a pointer to the
299
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
300
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
301
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
302
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
299
303
 
300
304
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
301
305
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
347
351
 
348
352
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
349
353
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
350
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
354
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
351
355
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
352
356
  in those docs are rendered live by CI, so they always reflect the latest `main`.
353
357
 
@@ -262,11 +262,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
262
262
  ## Connection API (notebook)
263
263
 
264
264
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
265
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
266
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
267
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
268
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
269
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
265
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
266
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
267
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
268
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
269
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, multi-statement) are rejected with a pointer to the
270
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
271
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
272
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
273
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
270
274
 
271
275
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
272
276
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -318,7 +322,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
318
322
 
319
323
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
320
324
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
321
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
325
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
322
326
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
323
327
  in those docs are rendered live by CI, so they always reflect the latest `main`.
324
328
 
@@ -0,0 +1 @@
1
+ version = "0.3.17.dev4"
@@ -0,0 +1,457 @@
1
+ """Route raw SQL DML against duckrun-managed (Delta-backed) relations to delta_rs.
2
+
3
+ duckrun intercepts writes at the dbt *materialization* layer (a model/seed/snapshot goes through
4
+ the materialization macros -> store_relation -> delta_rs). But a duckrun relation is surfaced as a
5
+ read-only ``delta_scan`` view, so *raw* DML sent straight to the connection — ``delete from``,
6
+ ``update``, ``insert into ... select``, ``alter table ... add column``, ``create table ... as
7
+ select`` — lands on a view and fails ("Can only delete from base table"), or would create a native
8
+ DuckDB table that bypasses Delta entirely.
9
+
10
+ This module intercepts those statements (at the cursor, see environment.DuckrunCursorWrapper) and
11
+ applies them to the Delta table **via delta_rs only**, then refreshes the ``delta_scan`` view — so
12
+ nothing relies on a native, mutable DuckDB table, and every op works on local AND abfss/OneLake
13
+ stores (delta_rs carries ``storage_options``). ``create table ... as`` writes a new Delta table;
14
+ the mutate forms (delete/update/insert/alter) apply only when a Delta table already exists at the
15
+ target (otherwise the statement passes through — e.g. the test's native ``fact``/``seed``).
16
+
17
+ ``drop table`` unregisters the ``delta_scan`` view AND overwrites the table (via delta_rs) to a
18
+ one-column ``TOMBSTONE_COLUMN`` marker, which discovery recognizes and hides. It does NOT delete
19
+ data: delta_rs has no drop, and removing the Delta files would be a filesystem hack that fails on
20
+ object stores. The directory persists until a human purges it; a later ``create table ... as``
21
+ overwrites the tombstone with real data and the table is live again.
22
+
23
+ The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
24
+ on a native DuckDB table, not a Delta table: bare ``create table (<col defs>)`` becomes a Delta
25
+ table only when a ``default_schema`` is set (the connection API), and the dbt/cursor path passes
26
+ None — so the seed's table stays native. ``insert ... values`` *does* match a form here, but the
27
+ mutate guard only applies it when a Delta table already exists at the target — the seed's native
28
+ table has none, so it falls through untouched. duckrun's own materializations emit ``create ...
29
+ view`` (not ``table``), so they pass through too.
30
+
31
+ Supported / unsupported (what reaches delta_rs):
32
+
33
+ create [or replace] table [if not exists] x as <query> Delta overwrite (query: select/with/(…))
34
+ create table x (<col defs>) empty Delta table (connection API only)
35
+ create temp/temporary table … native DuckDB (pass through) ── invariant:
36
+ create view … native DuckDB (pass through) ── only TEMP
37
+ and VIEW are
38
+ native; every
39
+ other CREATE
40
+ TABLE is Delta
41
+ insert into x [(cols)] select … Delta append (projected onto target schema)
42
+ insert into x [(cols)] values … Delta append (projected onto target schema)
43
+ [with …] insert into x select … Delta append (CTE re-attached to the body)
44
+ delete from x [where …] delta_rs delete
45
+ update x set … [where …] delta_rs update
46
+ alter table x add column … Delta overwrite (widen schema)
47
+ drop table x tombstone (no data deleted)
48
+ merge … / update … from / delete … using / multi-stmt NOT handled here — the connection API
49
+ (session.sql) rejects them with a clear
50
+ error; the dbt path never emits them.
51
+ """
52
+ import re
53
+ from typing import List, Optional, Tuple
54
+
55
+ from . import engine
56
+
57
+ # `drop table` tombstone: a dropped relation is overwritten (via delta_rs) to a table whose ONLY
58
+ # column is this marker, so (a) discovery recognizes it as dropped and hides it, and (b) anyone who
59
+ # opens the files sees an obviously-not-a-real-table schema rather than a plausible empty table. No
60
+ # data is deleted — the directory stays until a human purges it; a later `create table ... as`
61
+ # overwrites the marker schema with real data and the table is live again.
62
+ TOMBSTONE_COLUMN = "__duckrun_deleted__"
63
+
64
+
65
+ def _columns_are_tombstone(colnames) -> bool:
66
+ return [str(c).lower() for c in colnames] == [TOMBSTONE_COLUMN]
67
+
68
+
69
def is_dropped(con, location: str, storage_options=None) -> bool:
    """Report whether the Delta table at ``location`` is a duckrun drop-tombstone.

    A tombstone is a table whose only column is the marker column. Discovery (dbt and the
    connection API) uses this to hide dropped tables. The check is best-effort: if the table
    cannot be opened or scanned for any reason the answer is False, leaving normal handling to
    deal with it. ``storage_options`` is accepted for call-site symmetry but is not read here.
    """
    escaped = str(location).replace("'", "''")  # single quotes doubled for the SQL literal
    probe = f"select * from delta_scan('{escaped}') limit 0"
    try:
        result = con.execute(probe)
        names = [column[0] for column in result.description]
        return _columns_are_tombstone(names)
    except Exception:
        # Deliberate catch-all: an unreadable location is simply "not a tombstone".
        return False
81
+
82
+ # --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
83
+ # `create [or replace] table [if not exists] <rel> as <query>`. The body is ANY query text (a bare
84
+ # `select …`, a `with … select …` CTE, or a parenthesised `(select …)`); it's handed to DuckDB
85
+ # verbatim so anything DuckDB accepts after `as` works.
86
+ _CREATE_AS = re.compile(
87
+ r"\s*create\s+(?P<orrep>or\s+replace\s+)?table\s+(?P<ine>if\s+not\s+exists\s+)?"
88
+ r"(?P<rel>.+?)\s+as\s+(?P<body>.+)",
89
+ re.I | re.S,
90
+ )
91
+ # `create [or replace] table [if not exists] <rel> (<col defs>)` — no `as`. Connection-API only
92
+ # (see _create_coldefs): materializes an EMPTY Delta table so `CREATE TABLE` is always Delta-backed.
93
+ _CREATE_COLDEFS = re.compile(
94
+ r"\s*create\s+(?:or\s+replace\s+)?table\s+(?:if\s+not\s+exists\s+)?"
95
+ r"(?P<rel>.+?)\s*\((?P<defs>.+)\)\s*;?\s*",
96
+ re.I | re.S,
97
+ )
98
+ _INSERT_SELECT = re.compile(
99
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s+(?P<body>select\b.*)",
100
+ re.I | re.S,
101
+ )
102
+ _INSERT_VALUES = re.compile(
103
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
104
+ re.I | re.S,
105
+ )
106
+ _DELETE = re.compile(
107
+ r"\s*delete\s+from\s+(?P<rel>.+?)(?:\s+where\s+(?P<where>.+))?\s*;?\s*", re.I | re.S
108
+ )
109
+ _UPDATE = re.compile(
110
+ r"\s*update\s+(?P<rel>.+?)\s+set\s+(?P<set>.+?)(?:\s+where\s+(?P<where>.+?))?\s*;?\s*",
111
+ re.I | re.S,
112
+ )
113
+ _ALTER_ADD = re.compile(
114
+ r"\s*alter\s+table\s+(?P<rel>.+?)\s+add\s+column\s+(?P<col>\S+)\s+(?P<def>.+?)\s*;?\s*",
115
+ re.I | re.S,
116
+ )
117
+ _DROP = re.compile(
118
+ r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
119
+ )
120
+ # `create temp/temporary table …` is DuckDB-local scratch by design and must NEVER be captured —
121
+ # checked first in try_handle so it always passes through to native DuckDB (the invariant: only
122
+ # CREATE TEMP TABLE is native; every other CREATE TABLE is Delta-backed).
123
+ _CREATE_TEMP_RE = re.compile(r"\s*create\s+(?:or\s+replace\s+)?(?:temp|temporary)\b", re.I)
124
+ # CTE/whitespace handling: a leading `with …` block followed by a top-level INSERT/UPDATE/DELETE.
125
+ # leading `\b` is load-bearing: _find_top_level tries this at every depth-0 index, so without it the
126
+ # verb would match inside an identifier (e.g. `update` within `last_update`).
127
+ _LEADING_WITH = re.compile(r"\s*with\b", re.I)
128
+ _DRIVING_DML = re.compile(r"\b(?:insert\s+into|update|delete\s+from)\b", re.I)
129
+
130
+
131
+ def _strip_leading(query: str) -> str:
132
+ """Drop leading whitespace and ``--`` / ``/* */`` comments so the first keyword is visible."""
133
+ s = query
134
+ while True:
135
+ t = s.lstrip()
136
+ if t.startswith("--"):
137
+ nl = t.find("\n")
138
+ s = "" if nl == -1 else t[nl + 1:]
139
+ elif t.startswith("/*"):
140
+ end = t.find("*/")
141
+ s = "" if end == -1 else t[end + 2:]
142
+ else:
143
+ return t
144
+
145
+
146
+ def _find_top_level(s: str, pattern) -> int:
147
+ """Index of the first ``pattern`` match at paren-depth 0 and outside quotes, else -1.
148
+
149
+ Lets us tell a top-level clause (the ``FROM`` of ``UPDATE … FROM``, the verb after a leading
150
+ ``WITH``) from the same keyword nested in a subquery, without a full SQL parser."""
151
+ depth, quote, i, n = 0, None, 0, len(s)
152
+ while i < n:
153
+ ch = s[i]
154
+ if quote:
155
+ if ch == quote:
156
+ quote = None
157
+ elif ch in ("'", '"'):
158
+ quote = ch
159
+ elif ch in "([":
160
+ depth += 1
161
+ elif ch in ")]":
162
+ depth -= 1
163
+ elif depth == 0 and pattern.match(s, i):
164
+ return i
165
+ i += 1
166
+ return -1
167
+
168
+
169
def _split_leading_with(sql: str) -> Tuple[str, str]:
    """Split ``WITH … <INSERT/UPDATE/DELETE> …`` into ``(with_clause, remainder)``.

    Returns ``('', sql)`` when ``sql`` does not start with ``WITH`` or when no top-level
    INSERT/UPDATE/DELETE verb follows it — so a ``WITH`` that drives a plain ``SELECT`` (a read)
    is left untouched. Peeling the CTE off lets the verb-anchored matchers see the statement,
    while the returned clause can be re-attached when the body is evaluated.
    """
    if _LEADING_WITH.match(sql):
        verb_at = _find_top_level(sql, _DRIVING_DML)
        if verb_at > 0:
            return sql[:verb_at].rstrip(), sql[verb_at:]
    return "", sql
181
+
182
+
183
+ def _fullmatch(pattern, sql):
184
+ return pattern.fullmatch(sql.strip())
185
+
186
+
187
+ def _split_relation(rel: str) -> Tuple[Optional[str], Optional[str]]:
188
+ """`"db"."schema"."tbl"` / `schema.tbl` / `tbl` -> (schema, identifier), quotes stripped."""
189
+ parts = [p.strip().strip('"') for p in rel.strip().split(".")]
190
+ if not parts or not parts[-1]:
191
+ return None, None
192
+ identifier = parts[-1]
193
+ schema = parts[-2] if len(parts) >= 2 else None
194
+ return schema, identifier
195
+
196
+
197
+ def _split_top_level_commas(s: str) -> List[str]:
198
+ """Split on commas that aren't inside parentheses or quotes (so ``left(email, 3)`` stays whole)."""
199
+ out, depth, start, quote = [], 0, 0, None
200
+ for i, ch in enumerate(s):
201
+ if quote:
202
+ if ch == quote:
203
+ quote = None
204
+ elif ch in ("'", '"'):
205
+ quote = ch
206
+ elif ch in "([":
207
+ depth += 1
208
+ elif ch in ")]":
209
+ depth -= 1
210
+ elif ch == "," and depth == 0:
211
+ out.append(s[start:i])
212
+ start = i + 1
213
+ out.append(s[start:])
214
+ return [p.strip() for p in out if p.strip()]
215
+
216
+
217
+ class _DeltaDML:
218
+ """One attempt to handle a statement; ``run()`` returns True if it was applied to Delta."""
219
+
220
+ def __init__(self, cursor, root_path: str, storage_options, default_schema=None):
221
+ self.cursor = cursor
222
+ self.root_path = root_path.rstrip("/")
223
+ self.so = storage_options
224
+ self.default_schema = default_schema
225
+ self._with_clause = "" # a leading `WITH …` preceding an INSERT, prepended to the body
226
+
227
+ def _loc(self, schema: str, identifier: str) -> str:
228
+ return f"{self.root_path}/{schema}/{identifier}"
229
+
230
+ def _resolve(self, rel: str):
231
+ """(schema, identifier, location) for ``rel``, falling back to default_schema for an
232
+ unqualified name (the connection API relies on a current database). (None, None, None) when
233
+ no schema can be determined."""
234
+ schema, identifier = _split_relation(rel)
235
+ schema = schema or self.default_schema
236
+ if not schema or not identifier:
237
+ return None, None, None
238
+ return schema, identifier, self._loc(schema, identifier)
239
+
240
+ def _exists(self, loc: str) -> bool:
241
+ return engine.table_exists(loc, self.so)
242
+
243
+ def _refresh_view(self, rel: str, schema: str, loc: str) -> None:
244
+ loc_sql = loc.replace("'", "''")
245
+ self.cursor.execute(f'create schema if not exists "{schema}"')
246
+ self.cursor.execute(
247
+ f"create or replace view {rel} as select * from delta_scan('{loc_sql}')"
248
+ )
249
+
250
+ def try_handle(self, sql: str) -> bool:
251
+ # CREATE TEMP/TEMPORARY TABLE is native DuckDB scratch by design — never capture it.
252
+ if _CREATE_TEMP_RE.match(sql):
253
+ return False
254
+ m = _fullmatch(_CREATE_AS, sql)
255
+ if m and "__duckrun" not in m.group("rel"):
256
+ return self._create_as(m)
257
+ m = _fullmatch(_CREATE_COLDEFS, sql)
258
+ if m and "__duckrun" not in m.group("rel"):
259
+ return self._create_coldefs(m)
260
+ m = _fullmatch(_INSERT_SELECT, sql)
261
+ if m:
262
+ return self._mutate(m, self._insert_select)
263
+ m = _fullmatch(_INSERT_VALUES, sql)
264
+ if m:
265
+ return self._mutate(m, self._insert_values)
266
+ m = _fullmatch(_DELETE, sql)
267
+ if m:
268
+ return self._mutate(m, self._delete)
269
+ m = _fullmatch(_UPDATE, sql)
270
+ if m:
271
+ return self._mutate(m, self._update)
272
+ m = _fullmatch(_ALTER_ADD, sql)
273
+ if m:
274
+ return self._mutate(m, self._alter_add)
275
+ m = _fullmatch(_DROP, sql)
276
+ if m:
277
+ return self._drop(m)
278
+ return False
279
+
280
+ # -- create table <rel> as <query>: always materialize as a duckrun Delta table ------------
281
+ def _create_as(self, m) -> bool:
282
+ rel = m.group("rel").strip()
283
+ schema, identifier, loc = self._resolve(rel)
284
+ if not loc:
285
+ return False
286
+ # dbt/cursor path (no default_schema): keep the ORIGINAL narrow interception — only a plain
287
+ # `create table … as select …` routes to Delta. The wider forms (`or replace`, a CTE or a
288
+ # parenthesised body) are a connection-API affordance; on the dbt path they must stay native
289
+ # so dbt keeps owning the relation. dbt-internal CTAS like store_failures' `create table … as
290
+ # (select …)` is a real TABLE dbt later drops/recreates — turning it into a delta_scan VIEW
291
+ # breaks that ("Existing object … is of type View, trying to drop type Table").
292
+ if self.default_schema is None and (
293
+ m.group("orrep") or not re.match(r"select\b", m.group("body").lstrip(), re.I)
294
+ ):
295
+ return False
296
+ # `if not exists` over a live (non-tombstone) table is a no-op — just (re)surface the view.
297
+ if m.group("ine") and self._exists(loc) and not is_dropped(self.cursor, loc, self.so):
298
+ self._refresh_view(rel, schema, loc)
299
+ return True
300
+ data = self.cursor.sql(m.group("body"))
301
+ # overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
302
+ # table is recreated with the real schema, clearing any tombstone marker.
303
+ engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
304
+ self._refresh_view(rel, schema, loc)
305
+ return True
306
+
307
+ # -- create table <rel> (<col defs>): an EMPTY Delta table (connection API only) -----------
308
+ def _create_coldefs(self, m) -> bool:
309
+ # Only the connection API (which carries a current database) makes a bare `CREATE TABLE
310
+ # (col defs)` a Delta table — so `CREATE TABLE` is always Delta-backed there. The dbt/cursor
311
+ # path passes default_schema=None: the seed loader emits this exact form and RELIES on it
312
+ # landing as a native DuckDB table, so we pass through untouched.
313
+ if self.default_schema is None:
314
+ return False
315
+ rel = m.group("rel").strip()
316
+ schema, identifier, loc = self._resolve(rel)
317
+ if not loc:
318
+ return False
319
+ # Let DuckDB parse the column defs (types, constraints, nested parens) by building the table
320
+ # as a TEMP, then take a 0-row typed relation from it and write that as an empty Delta table.
321
+ tmp = f"__duckrun_empty_{abs(hash((schema, identifier))) & 0xFFFFFFFF}"
322
+ self.cursor.execute(f'create or replace temp table "{tmp}" ({m.group("defs")})')
323
+ try:
324
+ empty = self.cursor.sql(f'select * from "{tmp}" limit 0')
325
+ engine.write_delta(loc, empty, "overwrite", overwrite_schema=True, storage_options=self.so)
326
+ finally:
327
+ self.cursor.execute(f'drop table if exists "{tmp}"')
328
+ self._refresh_view(rel, schema, loc)
329
+ return True
330
+
331
+ # -- forms that only apply when a Delta table already exists at the target ------------------
332
+ def _mutate(self, m, op) -> bool:
333
+ rel = m.group("rel").strip()
334
+ schema, identifier, loc = self._resolve(rel)
335
+ if not loc or not self._exists(loc):
336
+ return False # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
337
+ if self._with_clause and op != self._insert_select:
338
+ return False # `WITH … UPDATE/DELETE` can't be expressed through a delta_rs predicate
339
+ op(m, rel, schema, loc)
340
+ self._refresh_view(rel, schema, loc)
341
+ return True
342
+
343
+ def _delete(self, m, rel, schema, loc) -> None:
344
+ where = m.group("where")
345
+ engine._delta_table(loc, self.so).delete(predicate=where.strip() if where else None)
346
+
347
+ def _update(self, m, rel, schema, loc) -> None:
348
+ updates = {}
349
+ for assign in _split_top_level_commas(m.group("set")):
350
+ col, _, expr = assign.partition("=")
351
+ updates[col.strip().strip('"')] = expr.strip()
352
+ where = m.group("where")
353
+ engine._delta_table(loc, self.so).update(
354
+ updates=updates, predicate=where.strip() if where else None
355
+ )
356
+
357
+ def _insert_select(self, m, rel, schema, loc) -> None:
358
+ body = m.group("body")
359
+ if self._with_clause: # `WITH … INSERT INTO t SELECT …`: re-attach the CTE to the body
360
+ body = f"{self._with_clause} {body}"
361
+ cols = m.group("cols")
362
+ if cols: # `insert into t (a, b) select …` → project the query onto the named columns
363
+ self._append_projected(loc, self._provided(cols), f"({body})")
364
+ else: # column count/order already matches the target → append as-is
365
+ engine.write_delta(loc, self.cursor.sql(body), "append", storage_options=self.so)
366
+
367
+ def _insert_values(self, m, rel, schema, loc) -> None:
368
+ # `insert into <rel> [(<cols>)] values (...)`: the literals supply every target column when
369
+ # no list is given, in order; otherwise the named columns.
370
+ cols = m.group("cols")
371
+ provided = self._provided(cols) if cols else None
372
+ self._append_projected(loc, provided, f"(values {m.group('body')})")
373
+
374
+ @staticmethod
375
+ def _provided(cols: str) -> List[str]:
376
+ return [c.strip().strip('"') for c in cols.split(",")]
377
+
378
+ def _append_projected(self, loc, provided, derived: str) -> None:
379
+ """Append a ``derived`` table (a ``(values …)`` tuple list or a ``(select …)`` subquery) to
380
+ the Delta table at ``loc``, projecting its columns onto the FULL target schema: supplied
381
+ columns come from ``derived`` (positional when ``provided`` is None), any unsupplied target
382
+ column is a typed NULL, and every projected column is cast to the target column's type so
383
+ the appended Arrow schema matches the table exactly (what a plain SQL INSERT does, and it
384
+ stops a literal wider than the column from forcing delta_rs to add a new writer feature on
385
+ append)."""
386
+ loc_sql = loc.replace("'", "''")
387
+ template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
388
+ target_cols = list(template.columns)
389
+ target_types = [str(t) for t in template.types]
390
+ by_lower = {c.lower(): c for c in target_cols}
391
+
392
+ if provided is None: # positional → every target column, in order
393
+ provided = target_cols
394
+ else: # explicit column list → canonicalize to the target's casing
395
+ provided = [by_lower.get(c.lower(), c) for c in provided]
396
+ provided_set = set(provided)
397
+
398
+ quoted = ", ".join('"' + c + '"' for c in provided)
399
+ inner = f"{derived} v({quoted})"
400
+ exprs = [
401
+ f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
402
+ else f'cast(null as {typ}) as "{col}"'
403
+ for col, typ in zip(target_cols, target_types)
404
+ ]
405
+ data = self.cursor.sql(f"select {', '.join(exprs)} from {inner}")
406
+ engine.write_delta(loc, data, "append", storage_options=self.so)
407
+
408
+ def _alter_add(self, m, rel, schema, loc) -> None:
409
+ col = m.group("col").strip().strip('"')
410
+ # Keep only the column type (drop any DEFAULT/NULL clause); add it as an all-null column by
411
+ # rewriting the table with overwrite_schema so delta_rs accepts the widened schema.
412
+ coltype = re.split(r"\s+default\b|\s+null\b", m.group("def"), flags=re.I)[0].strip() or "VARCHAR"
413
+ loc_sql = loc.replace("'", "''")
414
+ data = self.cursor.sql(
415
+ f'select *, cast(null as {coltype}) as "{col}" from delta_scan(\'{loc_sql}\')'
416
+ )
417
+ engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
418
+
419
+ def _drop(self, m) -> bool:
420
+ # `drop table` on a duckrun relation: unregister the delta_scan view AND, via delta_rs,
421
+ # overwrite the table to a one-column tombstone (TOMBSTONE_COLUMN) so a later glob discovery
422
+ # hides it. NO data is deleted — delta_rs has no drop, and removing the Delta files would be
423
+ # a filesystem hack that fails on object stores. The directory persists until a human purges
424
+ # it; a later `create table ... as` overwrites the tombstone with real data. If the relation
425
+ # isn't a duckrun-managed Delta table, fall through and let DuckDB drop the native table.
426
+ rel = m.group("rel").strip()
427
+ schema, identifier, loc = self._resolve(rel)
428
+ if not loc or not self._exists(loc):
429
+ return False
430
+ tombstone = self.cursor.sql(f"select true as {TOMBSTONE_COLUMN}")
431
+ engine.write_delta(loc, tombstone, "overwrite", overwrite_schema=True, storage_options=self.so)
432
+ self.cursor.execute(f"drop view if exists {rel}")
433
+ return True
434
+
435
+
436
def handle(cursor, root_path, storage_options, sql: str, default_schema=None) -> bool:
    """Apply ``sql`` to Delta when it is a DML form aimed at a duckrun-managed relation.

    ``cursor`` evaluates any SELECT body and (re)creates the ``delta_scan`` view. Every handled
    form goes through delta_rs (``engine.write_delta`` / ``DeltaTable.delete`` / ``.update``),
    which carries ``storage_options`` and so works on local AND abfss/OneLake stores.
    ``default_schema`` resolves an unqualified table name (the connection API has a current
    database; the dbt path always renders fully-qualified names so passes None).

    Returns True if handled (the caller must NOT also run it on DuckDB), False to pass through —
    no ``root_path``, anything unrecognized, or (for the mutate forms) a target that isn't a
    Delta table.
    """
    if not root_path:
        return False
    # Leading comments/whitespace must not hide the verb; a leading `WITH …` is peeled off too.
    stripped = _strip_leading(sql)
    with_clause, statement = _split_leading_with(stripped)
    # Cheap pre-filter: only the candidate DML verbs are worth building a handler for.
    verb = statement[:7].lower()
    candidates = ("delete", "update", "insert", "create", "alter", "drop")
    if verb.startswith(candidates):
        worker = _DeltaDML(cursor, root_path, storage_options, default_schema)
        worker._with_clause = with_clause
        return worker.try_handle(statement)
    return False
@@ -6,6 +6,7 @@ connection (``configure_connection``), and on ``store()`` hands the model relati
6
6
  straight to delta_rs. DuckDB relations expose the Arrow C-stream interface, which
7
7
  deltalake 1.x consumes directly, so there is no pyarrow dependency.
8
8
  """
9
+ import re
9
10
  from typing import Any, Optional
10
11
 
11
12
  from dbt.adapters.duckdb.plugins import BasePlugin
@@ -155,11 +156,22 @@ class Plugin(BasePlugin):
155
156
  # Table-like (non-incremental) models always overwrite. Incremental models
156
157
  # overwrite on first run / full-refresh, then apply the incremental strategy.
157
158
  if not incremental or full_refresh or not exists:
159
+ # This branch is a CREATE OR REPLACE: a table model, a --full-refresh, or a first run.
160
+ # When we are REPLACING an existing table (exists), allow delta_rs to replace the schema
161
+ # wholesale (schema_mode="overwrite") — the model SQL defines the new schema, exactly as
162
+ # `CREATE OR REPLACE TABLE` does on every other warehouse. Without it, delta_rs's strict
163
+ # overwrite keeps the OLD schema/protocol and so can't change a column's type or write a
164
+ # column needing a new writer feature the old table lacks (e.g. retyping to ::timestamp /
165
+ # timestampNtz). This is scoped to the full-rebuild replace ONLY — NOT append, safeappend,
166
+ # merge, or microbatch, which must keep their strict, schema-stable writes. A fresh create
167
+ # (not exists) doesn't need it. A user's explicit merge_schema still wins.
168
+ overwrite_schema = exists and not merge_schema
158
169
  with engine.mem_profile("overwrite", con=cur):
159
170
  engine.write_delta(
160
171
  path, data, "overwrite",
161
172
  partition_by=partition_by,
162
173
  merge_schema=merge_schema,
174
+ overwrite_schema=overwrite_schema,
163
175
  storage_options=storage_options,
164
176
  compaction_threshold=self._compaction_threshold,
165
177
  )
@@ -201,7 +213,7 @@ class Plugin(BasePlugin):
201
213
  insert_only=(strategy == "insert"),
202
214
  update_columns=cfg.get("merge_update_columns"),
203
215
  exclude_columns=cfg.get("merge_exclude_columns"),
204
- predicates=self._merge_predicates(cfg),
216
+ predicates=self._merge_predicates(cfg, data.columns),
205
217
  update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
206
218
  insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
207
219
  merge_schema=evolve_schema,
@@ -470,16 +482,37 @@ class Plugin(BasePlugin):
470
482
  return None
471
483
  return str(expr).replace("DBT_INTERNAL_DEST", "target").replace("DBT_INTERNAL_SOURCE", "source")
472
484
 
485
+ @staticmethod
486
+ def _qualify_predicate(expr, columns):
487
+ """Prefix bare references to known target columns with ``target.``.
488
+
489
+ duckrun folds ``incremental_predicates`` into the merge condition
490
+ (``target.k = source.k AND <predicate>``). A bare column there (e.g. ``id != 2``) exists
491
+ on BOTH the source and target, so delta_rs rejects it as an ambiguous reference. dbt's
492
+ ``incremental_predicates`` constrain the existing/target rows (the delete+insert delete, the
493
+ merge ON), so we qualify bare column tokens to ``target.``. Only exact column-name tokens
494
+ that aren't already qualified (preceded by ``.``) or quoted/literal are rewritten — literals
495
+ and functions (e.g. ``current_date``, which is not a column) are left untouched."""
496
+ if not expr or not columns:
497
+ return expr
498
+ # Longest names first so a column that's a prefix of another isn't partially matched.
499
+ for col in sorted({str(c) for c in columns}, key=len, reverse=True):
500
+ # whole-word col, not preceded by '.', a word char, or a quote (already qualified/quoted).
501
+ pattern = re.compile(r'(?<![.\w"\'])' + re.escape(col) + r'\b', re.I)
502
+ expr = pattern.sub(lambda m: "target." + m.group(0), expr)
503
+ return expr
504
+
473
505
@classmethod
def _merge_predicates(cls, cfg: dict, columns=None):
    """Return the model's dbt ``incremental_predicates`` (falling back to ``predicates``).

    Each predicate has dbt's standard merge aliases rewritten to the ones delta_rs uses
    here, and its bare column references qualified to ``target.`` using ``columns``
    (see ``_qualify_predicate``). A single string predicate is treated as a one-item list.
    Returns None when no predicates are configured.
    """
    configured = cfg.get("incremental_predicates") or cfg.get("predicates")
    if not configured:
        return None
    items = [configured] if isinstance(configured, str) else configured
    rewritten = []
    for predicate in items:
        aliased = cls._rewrite_merge_aliases(predicate)
        rewritten.append(cls._qualify_predicate(aliased, columns))
    return rewritten
483
516
 
484
517
  @staticmethod
485
518
  def _resolve_schema_change(on_schema_change, path, data, storage_options) -> bool:
@@ -536,6 +536,37 @@ def table_exists(path: str, storage_options: Optional[Dict[str, str]] = None) ->
536
536
  return False
537
537
 
538
538
 
539
def delta_stats(cur, path: str, storage_options: Optional[Dict[str, str]] = None):
    """Cheap table statistics for ``dbt docs generate``, read from the Delta **log** (no data scan).

    ``DeltaTable.get_add_actions()`` carries per-file ``num_records`` / ``size_bytes`` /
    ``modification_time``. Summing rows and bytes and taking the latest mtime yields the
    whole table's stats without opening any data file. The aggregation runs on the DuckDB
    cursor ``cur`` via a replacement scan over the arro3 table — no pyarrow dependency.

    Returns:
        ``{"num_rows", "bytes", "last_modified"}`` (``last_modified`` is epoch milliseconds,
        or None when the log has no add actions), or ``None`` on ANY failure — a
        drop-tombstone, a missing table, an unreachable/credential-less remote store.
        Best-effort by design: a catalog without stats is fine, but a docs build must
        never break.
    """
    query = (
        "select coalesce(sum(num_records), 0)::bigint, "
        "coalesce(sum(size_bytes), 0)::bigint, "
        "max(modification_time)::bigint from add_actions"
    )
    try:
        # `add_actions` looks unused, but the query refers to it by name through DuckDB's
        # replacement scan, so it must stay a local of this very function.
        add_actions = _delta_table(path, storage_options).get_add_actions()  # noqa: F841 (replacement scan)
        totals = cur.sql(query).fetchone()
    except Exception as exc:  # best-effort: docs stats must never fail catalog generation
        logger.debug(f"duckrun: no Delta stats for {path!r}: {exc}")
        return None

    if totals is None:
        return None

    latest = totals[2]
    return {
        "num_rows": int(totals[0]),
        "bytes": int(totals[1]),
        "last_modified": None if latest is None else int(latest),
    }
568
+
569
+
539
570
  # Delta column-metadata key under which we stash a dbt column description, and the dollar-quote
540
571
  # label used to embed arbitrary comment text (newlines, quotes, dollar signs) in COMMENT ON SQL.
541
572
  _DELTA_COMMENT_KEY = "comment"