duckrun 0.3.17.dev3__tar.gz → 0.3.17.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {duckrun-0.3.17.dev3/duckrun.egg-info → duckrun-0.3.17.dev4}/PKG-INFO +11 -7
  2. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/README.md +10 -6
  3. duckrun-0.3.17.dev4/dbt/adapters/duckrun/__version__.py +1 -0
  4. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/delta_dml.py +187 -27
  5. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/session.py +61 -15
  6. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4/duckrun.egg-info}/PKG-INFO +11 -7
  7. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/pyproject.toml +1 -1
  8. duckrun-0.3.17.dev3/dbt/adapters/duckrun/__version__.py +0 -1
  9. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/LICENSE +0 -0
  10. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/MANIFEST.in +0 -0
  11. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/__init__.py +0 -0
  12. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/credentials.py +0 -0
  13. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/delta_plugin.py +0 -0
  14. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/engine.py +0 -0
  15. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/environment.py +0 -0
  16. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/impl.py +0 -0
  17. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/remote.py +0 -0
  18. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/adapters/duckrun/secret.py +0 -0
  19. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/__init__.py +0 -0
  20. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/dbt_project.yml +0 -0
  21. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/catalog.sql +0 -0
  22. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
  23. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
  24. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
  25. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/snapshot.sql +0 -0
  26. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
  27. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/__init__.py +0 -0
  28. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/auth.py +0 -0
  29. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun/delta_table.py +0 -0
  30. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/SOURCES.txt +0 -0
  31. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/dependency_links.txt +0 -0
  32. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/requires.txt +0 -0
  33. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/duckrun.egg-info/top_level.txt +0 -0
  34. {duckrun-0.3.17.dev3 → duckrun-0.3.17.dev4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev3
3
+ Version: 0.3.17.dev4
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
291
291
  ## Connection API (notebook)
292
292
 
293
293
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
294
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
295
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
296
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
297
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
298
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
294
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
295
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
296
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
297
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
298
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
299
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
300
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
301
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
302
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
299
303
 
300
304
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
301
305
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
347
351
 
348
352
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
349
353
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
350
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
354
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
351
355
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
352
356
  in those docs are rendered live by CI, so they always reflect the latest `main`.
353
357
 
@@ -262,11 +262,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
262
262
  ## Connection API (notebook)
263
263
 
264
264
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
265
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
266
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
267
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
268
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
269
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
265
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
266
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
267
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
268
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
269
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
270
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
271
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
272
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
273
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
270
274
 
271
275
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
272
276
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -318,7 +322,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
318
322
 
319
323
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
320
324
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
321
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
325
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
322
326
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
323
327
  in those docs are rendered live by CI, so they always reflect the latest `main`.
324
328
 
@@ -0,0 +1 @@
1
+ version = "0.3.17.dev4"
@@ -21,11 +21,33 @@ object stores. The directory persists until a human purges it; a later ``create
21
21
  overwrites the tombstone with real data and the table is live again.
22
22
 
23
23
  The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
24
- on a native DuckDB table, not a Delta table: ``create table (<col defs>)`` doesn't match the
25
- ``... as select`` form, and while ``insert ... values`` now *does* match a form here, the mutate
26
- guard only applies it when a Delta table already exists at the target the seed's native table has
27
- none, so it falls through untouched. duckrun's own materializations emit ``create ... view`` (not
28
- ``table``), so they pass through too.
24
+ on a native DuckDB table, not a Delta table: bare ``create table (<col defs>)`` becomes a Delta
25
+ table only when a ``default_schema`` is set (the connection API), and the dbt/cursor path passes
26
+ None so the seed's table stays native. ``insert ... values`` *does* match a form here, but the
27
+ mutate guard only applies it when a Delta table already exists at the target — the seed's native
28
+ table has none, so it falls through untouched. duckrun's own materializations emit ``create ...
29
+ view`` (not ``table``), so they pass through too.
30
+
31
+ Supported / unsupported (what reaches delta_rs):
32
+
33
+ create [or replace] table [if not exists] x as <query> Delta overwrite (query: select/with/(…))
34
+ create table x (<col defs>) empty Delta table (connection API only)
35
+ create temp/temporary table … native DuckDB (pass through) ── invariant:
36
+ create view … native DuckDB (pass through) ── only TEMP
37
+ and VIEW are
38
+ native; every
39
+ other CREATE
40
+ TABLE is Delta
41
+ insert into x [(cols)] select … Delta append (projected onto target schema)
42
+ insert into x [(cols)] values … Delta append (projected onto target schema)
43
+ [with …] insert into x select … Delta append (CTE re-attached to the body)
44
+ delete from x [where …] delta_rs delete
45
+ update x set … [where …] delta_rs update
46
+ alter table x add column … Delta overwrite (widen schema)
47
+ drop table x tombstone (no data deleted)
48
+ merge … / update … from / delete … using / multi-stmt NOT handled here — the connection API
49
+ (session.sql) rejects them with a clear
50
+ error; the dbt path never emits them.
29
51
  """
30
52
  import re
31
53
  from typing import List, Optional, Tuple
@@ -58,12 +80,24 @@ def is_dropped(con, location: str, storage_options=None) -> bool:
58
80
  return False
59
81
 
60
82
  # --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
83
+ # `create [or replace] table [if not exists] <rel> as <query>`. The body is ANY query text (a bare
84
+ # `select …`, a `with … select …` CTE, or a parenthesised `(select …)`); it's handed to DuckDB
85
+ # verbatim so anything DuckDB accepts after `as` works.
61
86
  _CREATE_AS = re.compile(
62
- r"\s*create\s+table\s+(?:if\s+not\s+exists\s+)?(?P<rel>.+?)\s+as\s+(?P<body>select\b.*)",
87
+ r"\s*create\s+(?P<orrep>or\s+replace\s+)?table\s+(?P<ine>if\s+not\s+exists\s+)?"
88
+ r"(?P<rel>.+?)\s+as\s+(?P<body>.+)",
89
+ re.I | re.S,
90
+ )
91
+ # `create [or replace] table [if not exists] <rel> (<col defs>)` — no `as`. Connection-API only
92
+ # (see _create_coldefs): materializes an EMPTY Delta table so `CREATE TABLE` is always Delta-backed.
93
+ _CREATE_COLDEFS = re.compile(
94
+ r"\s*create\s+(?:or\s+replace\s+)?table\s+(?:if\s+not\s+exists\s+)?"
95
+ r"(?P<rel>.+?)\s*\((?P<defs>.+)\)\s*;?\s*",
63
96
  re.I | re.S,
64
97
  )
65
98
  _INSERT_SELECT = re.compile(
66
- r"\s*insert\s+into\s+(?P<rel>.+?)\s+(?P<body>select\b.*)", re.I | re.S
99
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s+(?P<body>select\b.*)",
100
+ re.I | re.S,
67
101
  )
68
102
  _INSERT_VALUES = re.compile(
69
103
  r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
@@ -83,6 +117,67 @@ _ALTER_ADD = re.compile(
83
117
  _DROP = re.compile(
84
118
  r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
85
119
  )
120
+ # `create temp/temporary table …` is DuckDB-local scratch by design and must NEVER be captured —
121
+ # checked first in try_handle so it always passes through to native DuckDB (the invariant: only
122
+ # CREATE TEMP TABLE is native; every other CREATE TABLE is Delta-backed).
123
+ _CREATE_TEMP_RE = re.compile(r"\s*create\s+(?:or\s+replace\s+)?(?:temp|temporary)\b", re.I)
124
+ # CTE/whitespace handling: a leading `with …` block followed by a top-level INSERT/UPDATE/DELETE.
125
+ # The leading `\b` in _DRIVING_DML is load-bearing: _find_top_level tries it at every depth-0 index, so without it the
126
+ # verb would match inside an identifier (e.g. `update` within `last_update`).
127
+ _LEADING_WITH = re.compile(r"\s*with\b", re.I)
128
+ _DRIVING_DML = re.compile(r"\b(?:insert\s+into|update|delete\s+from)\b", re.I)
129
+
130
+
131
+ def _strip_leading(query: str) -> str:
132
+ """Drop leading whitespace and ``--`` / ``/* */`` comments so the first keyword is visible."""
133
+ s = query
134
+ while True:
135
+ t = s.lstrip()
136
+ if t.startswith("--"):
137
+ nl = t.find("\n")
138
+ s = "" if nl == -1 else t[nl + 1:]
139
+ elif t.startswith("/*"):
140
+ end = t.find("*/")
141
+ s = "" if end == -1 else t[end + 2:]
142
+ else:
143
+ return t
144
+
145
+
146
+ def _find_top_level(s: str, pattern) -> int:
147
+ """Index of the first ``pattern`` match at paren-depth 0 and outside quotes, else -1.
148
+
149
+ Lets us tell a top-level clause (the ``FROM`` of ``UPDATE … FROM``, the verb after a leading
150
+ ``WITH``) from the same keyword nested in a subquery, without a full SQL parser."""
151
+ depth, quote, i, n = 0, None, 0, len(s)
152
+ while i < n:
153
+ ch = s[i]
154
+ if quote:
155
+ if ch == quote:
156
+ quote = None
157
+ elif ch in ("'", '"'):
158
+ quote = ch
159
+ elif ch in "([":
160
+ depth += 1
161
+ elif ch in ")]":
162
+ depth -= 1
163
+ elif depth == 0 and pattern.match(s, i):
164
+ return i
165
+ i += 1
166
+ return -1
167
+
168
+
169
+ def _split_leading_with(sql: str) -> Tuple[str, str]:
170
+ """``(with_clause, remainder)`` for ``WITH … <INSERT/UPDATE/DELETE> …``; ``('', sql)`` otherwise.
171
+
172
+ So ``WITH c AS (…) INSERT INTO t SELECT … FROM c`` reaches the matchers (which anchor on the
173
+ verb) and the CTE is preserved when the body is evaluated. A leading ``WITH`` that drives a
174
+ plain ``SELECT`` (a read) is left untouched."""
175
+ if not _LEADING_WITH.match(sql):
176
+ return "", sql
177
+ idx = _find_top_level(sql, _DRIVING_DML)
178
+ if idx <= 0:
179
+ return "", sql
180
+ return sql[:idx].rstrip(), sql[idx:]
86
181
 
87
182
 
88
183
  def _fullmatch(pattern, sql):
@@ -127,6 +222,7 @@ class _DeltaDML:
127
222
  self.root_path = root_path.rstrip("/")
128
223
  self.so = storage_options
129
224
  self.default_schema = default_schema
225
+ self._with_clause = "" # a leading `WITH …` preceding an INSERT, prepended to the body
130
226
 
131
227
  def _loc(self, schema: str, identifier: str) -> str:
132
228
  return f"{self.root_path}/{schema}/{identifier}"
@@ -152,9 +248,15 @@ class _DeltaDML:
152
248
  )
153
249
 
154
250
  def try_handle(self, sql: str) -> bool:
251
+ # CREATE TEMP/TEMPORARY TABLE is native DuckDB scratch by design — never capture it.
252
+ if _CREATE_TEMP_RE.match(sql):
253
+ return False
155
254
  m = _fullmatch(_CREATE_AS, sql)
156
255
  if m and "__duckrun" not in m.group("rel"):
157
256
  return self._create_as(m)
257
+ m = _fullmatch(_CREATE_COLDEFS, sql)
258
+ if m and "__duckrun" not in m.group("rel"):
259
+ return self._create_coldefs(m)
158
260
  m = _fullmatch(_INSERT_SELECT, sql)
159
261
  if m:
160
262
  return self._mutate(m, self._insert_select)
@@ -175,12 +277,26 @@ class _DeltaDML:
175
277
  return self._drop(m)
176
278
  return False
177
279
 
178
- # -- create table <rel> as <select>: always materialize as a duckrun Delta table -----------
280
+ # -- create table <rel> as <query>: always materialize as a duckrun Delta table ------------
179
281
  def _create_as(self, m) -> bool:
180
282
  rel = m.group("rel").strip()
181
283
  schema, identifier, loc = self._resolve(rel)
182
284
  if not loc:
183
285
  return False
286
+ # dbt/cursor path (no default_schema): keep the ORIGINAL narrow interception — only a plain
287
+ # `create table … as select …` routes to Delta. The wider forms (`or replace`, a CTE or a
288
+ # parenthesised body) are a connection-API affordance; on the dbt path they must stay native
289
+ # so dbt keeps owning the relation. dbt-internal CTAS like store_failures' `create table … as
290
+ # (select …)` is a real TABLE dbt later drops/recreates — turning it into a delta_scan VIEW
291
+ # breaks that ("Existing object … is of type View, trying to drop type Table").
292
+ if self.default_schema is None and (
293
+ m.group("orrep") or not re.match(r"select\b", m.group("body").lstrip(), re.I)
294
+ ):
295
+ return False
296
+ # `if not exists` over a live (non-tombstone) table is a no-op — just (re)surface the view.
297
+ if m.group("ine") and self._exists(loc) and not is_dropped(self.cursor, loc, self.so):
298
+ self._refresh_view(rel, schema, loc)
299
+ return True
184
300
  data = self.cursor.sql(m.group("body"))
185
301
  # overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
186
302
  # table is recreated with the real schema, clearing any tombstone marker.
@@ -188,12 +304,38 @@ class _DeltaDML:
188
304
  self._refresh_view(rel, schema, loc)
189
305
  return True
190
306
 
307
+ # -- create table <rel> (<col defs>): an EMPTY Delta table (connection API only) -----------
308
+ def _create_coldefs(self, m) -> bool:
309
+ # Only the connection API (which carries a current database) makes a bare `CREATE TABLE
310
+ # (col defs)` a Delta table — so `CREATE TABLE` is always Delta-backed there. The dbt/cursor
311
+ # path passes default_schema=None: the seed loader emits this exact form and RELIES on it
312
+ # landing as a native DuckDB table, so we pass through untouched.
313
+ if self.default_schema is None:
314
+ return False
315
+ rel = m.group("rel").strip()
316
+ schema, identifier, loc = self._resolve(rel)
317
+ if not loc:
318
+ return False
319
+ # Let DuckDB parse the column defs (types, constraints, nested parens) by building the table
320
+ # as a TEMP, then take a 0-row typed relation from it and write that as an empty Delta table.
321
+ tmp = f"__duckrun_empty_{abs(hash((schema, identifier))) & 0xFFFFFFFF}"
322
+ self.cursor.execute(f'create or replace temp table "{tmp}" ({m.group("defs")})')
323
+ try:
324
+ empty = self.cursor.sql(f'select * from "{tmp}" limit 0')
325
+ engine.write_delta(loc, empty, "overwrite", overwrite_schema=True, storage_options=self.so)
326
+ finally:
327
+ self.cursor.execute(f'drop table if exists "{tmp}"')
328
+ self._refresh_view(rel, schema, loc)
329
+ return True
330
+
191
331
  # -- forms that only apply when a Delta table already exists at the target ------------------
192
332
  def _mutate(self, m, op) -> bool:
193
333
  rel = m.group("rel").strip()
194
334
  schema, identifier, loc = self._resolve(rel)
195
335
  if not loc or not self._exists(loc):
196
336
  return False # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
337
+ if self._with_clause and op != self._insert_select:
338
+ return False # `WITH … UPDATE/DELETE` can't be expressed through a delta_rs predicate
197
339
  op(m, rel, schema, loc)
198
340
  self._refresh_view(rel, schema, loc)
199
341
  return True
@@ -213,34 +355,48 @@ class _DeltaDML:
213
355
  )
214
356
 
215
357
  def _insert_select(self, m, rel, schema, loc) -> None:
216
- data = self.cursor.sql(m.group("body"))
217
- engine.write_delta(loc, data, "append", storage_options=self.so)
358
+ body = m.group("body")
359
+ if self._with_clause: # `WITH … INSERT INTO t SELECT …`: re-attach the CTE to the body
360
+ body = f"{self._with_clause} {body}"
361
+ cols = m.group("cols")
362
+ if cols: # `insert into t (a, b) select …` → project the query onto the named columns
363
+ self._append_projected(loc, self._provided(cols), f"({body})")
364
+ else: # column count/order already matches the target → append as-is
365
+ engine.write_delta(loc, self.cursor.sql(body), "append", storage_options=self.so)
218
366
 
219
367
  def _insert_values(self, m, rel, schema, loc) -> None:
220
- # `insert into <rel> [(<cols>)] values (...)`: evaluate the VALUES tuples through DuckDB and
221
- # project them onto the FULL target Delta schema (so append schemas match) — supplied columns
222
- # come from the literals, any unsupplied target column is filled with a typed NULL.
368
+ # `insert into <rel> [(<cols>)] values (...)`: the literals supply every target column when
369
+ # no list is given, in order; otherwise the named columns.
370
+ cols = m.group("cols")
371
+ provided = self._provided(cols) if cols else None
372
+ self._append_projected(loc, provided, f"(values {m.group('body')})")
373
+
374
+ @staticmethod
375
+ def _provided(cols: str) -> List[str]:
376
+ return [c.strip().strip('"') for c in cols.split(",")]
377
+
378
+ def _append_projected(self, loc, provided, derived: str) -> None:
379
+ """Append a ``derived`` table (a ``(values …)`` tuple list or a ``(select …)`` subquery) to
380
+ the Delta table at ``loc``, projecting its columns onto the FULL target schema: supplied
381
+ columns come from ``derived`` (positional when ``provided`` is None), any unsupplied target
382
+ column is a typed NULL, and every projected column is cast to the target column's type so
383
+ the appended Arrow schema matches the table exactly (what a plain SQL INSERT does, and it
384
+ stops a literal wider than the column from forcing delta_rs to add a new writer feature on
385
+ append)."""
223
386
  loc_sql = loc.replace("'", "''")
224
387
  template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
225
388
  target_cols = list(template.columns)
226
389
  target_types = [str(t) for t in template.types]
227
390
  by_lower = {c.lower(): c for c in target_cols}
228
391
 
229
- cols = m.group("cols")
230
- if cols: # explicit column list → canonicalize to the target's casing
231
- provided = [by_lower.get(c.strip().strip('"').lower(), c.strip().strip('"'))
232
- for c in cols.split(",")]
233
- else: # positional → the literals supply every target column, in order
392
+ if provided is None: # positional → every target column, in order
234
393
  provided = target_cols
235
- provided_set = {c for c in provided}
394
+ else: # explicit column list → canonicalize to the target's casing
395
+ provided = [by_lower.get(c.lower(), c) for c in provided]
396
+ provided_set = set(provided)
236
397
 
237
398
  quoted = ", ".join('"' + c + '"' for c in provided)
238
- inner = f"(values {m.group('body')}) v({quoted})"
239
- # Cast every projected column to the TARGET column's type — both supplied values and the
240
- # typed NULLs — so the appended Arrow schema matches the table exactly. This is also what a
241
- # plain SQL INSERT does (a literal is coerced to the column type), and it stops a literal
242
- # whose inferred type is wider than the column (e.g. a ::timestamp into a DATE column) from
243
- # forcing delta_rs to add a new writer feature on append (TimestampWithoutTimezone).
399
+ inner = f"{derived} v({quoted})"
244
400
  exprs = [
245
401
  f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
246
402
  else f'cast(null as {typ}) as "{col}"'
@@ -290,8 +446,12 @@ def handle(cursor, root_path, storage_options, sql: str, default_schema=None) ->
290
446
  """
291
447
  if not root_path:
292
448
  return False
449
+ sql = _strip_leading(sql) # so leading comments/whitespace don't hide the verb
450
+ with_clause, body = _split_leading_with(sql) # peel a leading `WITH …` off an INSERT/etc.
293
451
  # Cheap pre-filter: only the candidate DML verbs.
294
- head = sql.lstrip()[:7].lower()
452
+ head = body[:7].lower()
295
453
  if not head.startswith(("delete", "update", "insert", "create", "alter", "drop")):
296
454
  return False
297
- return _DeltaDML(cursor, root_path, storage_options, default_schema).try_handle(sql)
455
+ dml = _DeltaDML(cursor, root_path, storage_options, default_schema)
456
+ dml._with_clause = with_clause
457
+ return dml.try_handle(body)
@@ -27,20 +27,63 @@ _WRITE_KEYWORD_RE = re.compile(r"^(insert|update|delete|merge)\b", re.IGNORECASE
27
27
  _CREATE_TABLE_RE = re.compile(r"^create\s+(or\s+replace\s+)?table\b", re.IGNORECASE)
28
28
  _CREATE_TEMP_RE = re.compile(r"^create\s+(or\s+replace\s+)?(temp|temporary)\b", re.IGNORECASE)
29
29
 
30
-
31
- def _strip_leading(query: str) -> str:
32
- """Drop leading whitespace and ``--`` / ``/* */`` comments so the first keyword is visible."""
33
- s = query
34
- while True:
35
- t = s.lstrip()
36
- if t.startswith("--"):
37
- nl = t.find("\n")
38
- s = "" if nl == -1 else t[nl + 1:]
39
- elif t.startswith("/*"):
40
- end = t.find("*/")
41
- s = "" if end == -1 else t[end + 2:]
42
- else:
43
- return t
30
+ # DML forms that genuinely can't be expressed through delta_rs (delta_dml.handle never applies them):
31
+ # rejected up front with a form-specific pointer rather than letting DuckDB raise a cryptic error on
32
+ # the read-only delta_scan view (or, for UPDATE FROM, silently mangling the SET clause).
33
+ # leading `\b`: _find_top_level probes every depth-0 index (see delta_dml._find_top_level).
34
+ _TOP_FROM = re.compile(r"\bfrom\b", re.IGNORECASE)
35
+ _TOP_USING = re.compile(r"\busing\b", re.IGNORECASE)
36
+ _strip_leading = delta_dml._strip_leading # shared comment/whitespace stripper
37
+
38
+ _MERGE_MSG = (
39
+ "conn.sql() can't run a SQL MERGE via delta_rs. Use the Spark write API: "
40
+ "df.write.saveAsTable(...) to create/append, or "
41
+ "conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
42
+ )
43
+ _UPDATE_FROM_MSG = (
44
+ "conn.sql() can't run UPDATE … FROM via delta_rs. Rewrite the SET values as correlated "
45
+ "subqueries, or use conn.delta_table(name).update(...)/.merge(...)."
46
+ )
47
+ _DELETE_USING_MSG = (
48
+ "conn.sql() can't run DELETE … USING via delta_rs. Rewrite the predicate as a correlated "
49
+ "subquery (DELETE … WHERE … IN (SELECT …)), or use conn.delta_table(name).delete(...)/.merge(...)."
50
+ )
51
+ _MULTI_MSG = (
52
+ "conn.sql() runs one statement at a time — split the batch into separate conn.sql() calls."
53
+ )
54
+
55
+
56
+ def _unsupported_dml(query: str) -> Optional[str]:
57
+ """An error message if ``query`` is a DML form duckrun can't route to delta_rs, else None."""
58
+ s = _strip_leading(query)
59
+ low = s.lower()
60
+ if low.startswith("merge"):
61
+ return _MERGE_MSG
62
+ if low.startswith("update") and delta_dml._find_top_level(s, _TOP_FROM) != -1:
63
+ return _UPDATE_FROM_MSG
64
+ if low.startswith("delete") and delta_dml._find_top_level(s, _TOP_USING) != -1:
65
+ return _DELETE_USING_MSG
66
+ if re.match(r"(insert|update|delete|merge|create|alter|drop)\b", low) and _is_multi_statement(s):
67
+ return _MULTI_MSG
68
+ return None
69
+
70
+
71
+ def _is_multi_statement(s: str) -> bool:
72
+ """True if ``s`` holds more than one statement (a top-level ``;`` with anything after it)."""
73
+ depth, quote = 0, None
74
+ for i, ch in enumerate(s):
75
+ if quote:
76
+ if ch == quote:
77
+ quote = None
78
+ elif ch in ("'", '"'):
79
+ quote = ch
80
+ elif ch in "([":
81
+ depth += 1
82
+ elif ch in ")]":
83
+ depth -= 1
84
+ elif ch == ";" and depth == 0 and s[i + 1:].strip():
85
+ return True
86
+ return False
44
87
 
45
88
 
46
89
  def _is_delta_write(query: str) -> bool:
@@ -224,13 +267,16 @@ class DuckSession:
224
267
  ``conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()``.
225
268
  ``CREATE TEMP/VIEW`` and other DuckDB-local scratch DDL pass through to DuckDB.
226
269
  """
270
+ unsupported = _unsupported_dml(query)
271
+ if unsupported:
272
+ raise ValueError(unsupported)
227
273
  if delta_dml.handle(self.con, self.root_path, self.storage_options, query,
228
274
  default_schema=self._current_database):
229
275
  self.refresh(quiet=True)
230
276
  return DataFrame(self.con.sql("SELECT 'ok' AS status"), self)
231
277
  if _is_delta_write(query):
232
278
  raise ValueError(
233
- "conn.sql() can't run a SQL MERGE via delta_rs. "
279
+ "conn.sql() can't write a Delta table from raw SQL here. "
234
280
  "Use the Spark write API: df.write.saveAsTable(...) to create/append, or "
235
281
  "conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
236
282
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev3
3
+ Version: 0.3.17.dev4
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -291,11 +291,15 @@ reclaimed — duckrun favors read-safety over immediate disk savings.
291
291
  ## Connection API (notebook)
292
292
 
293
293
  Besides the dbt adapter, duckrun ships a storage-neutral, PySpark-shaped `duckrun.connect()` for
294
- interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` is **read-only** (including
295
- time travel — `delta_scan('…', version => N)`); writes go through the Spark surface: a `DataFrame`
296
- with `.write…saveAsTable()` (modes `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle
297
- (`conn.delta_table(name)` / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`,
298
- `.replaceWhere()`, `.version()`, plus `conn.read` and `conn.catalog`.
294
+ interactive/notebook use (local, S3, GCS, ADLS, OneLake). `conn.sql(...)` runs reads (including time
295
+ travel — `delta_scan('…', version => N)`) and applies **raw SQL DML** (`create table … as`, `insert`,
296
+ `update`, `delete`, `alter add column`, `drop`) straight to the Delta table via delta_rs — every
297
+ `CREATE TABLE` is Delta-backed, only `CREATE TEMP TABLE`/`CREATE VIEW` stay native DuckDB, and forms
298
+ delta_rs can't express (`MERGE`, `UPDATE … FROM`, `DELETE … USING`, multi-statement) are rejected with a pointer to the
299
+ write API. Writes also go through the Spark surface: a `DataFrame` with `.write…saveAsTable()` (modes
300
+ `overwrite` / `append` / `safeappend` / `ignore`) and a `DeltaTable` handle (`conn.delta_table(name)`
301
+ / `DeltaTable.forName`) with `.merge(...)`, `.delete()`, `.update()`, `.replaceWhere()`, `.version()`,
302
+ plus `conn.read` and `conn.catalog`. See [the DML matrix](docs/connection-api.md#raw-sql-dml-through-connsql).
299
303
 
300
304
  `merge` is **snapshot-pinned by default** — Spark's single-snapshot MERGE, with no extra arguments:
301
305
  the target version is captured and the commit is validated against it, so a concurrent writer fails
@@ -347,7 +351,7 @@ None of this is required to use duckrun — `pip install duckrun` is unaffected.
347
351
 
348
352
  **Testing.** `tests/integration_tests/aemo/` is a small dbt project built against OneLake, and
349
353
  `tests/integration_tests/coffee/` is the connection-API coffee-shop scenario / stress test (CI:
350
- [`integration.yml`](.github/workflows/integration.yml)); `tests/conformance/`
354
+ [`integration_tests_onelake.yml`](.github/workflows/integration_tests_onelake.yml)); `tests/conformance/`
351
355
  runs the official suite (above); `tests/correctness/` proves the concurrency guarantees. The cards
352
356
  in those docs are rendered live by CI, so they always reflect the latest `main`.
353
357
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.3.17.dev3"
7
+ version = "0.3.17.dev4"
8
8
  description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -1 +0,0 @@
1
- version = "0.3.17.dev3"
File without changes
File without changes
File without changes