duckrun 0.3.17.dev2__tar.gz → 0.3.17.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.3.17.dev2/duckrun.egg-info → duckrun-0.3.17.dev3}/PKG-INFO +1 -1
- duckrun-0.3.17.dev3/dbt/adapters/duckrun/__version__.py +1 -0
- duckrun-0.3.17.dev3/dbt/adapters/duckrun/delta_dml.py +297 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/delta_plugin.py +37 -4
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/engine.py +31 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/environment.py +43 -1
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/impl.py +85 -0
- duckrun-0.3.17.dev3/dbt/include/duckrun/macros/catalog.sql +122 -0
- duckrun-0.3.17.dev3/dbt/include/duckrun/macros/materializations/snapshot.sql +144 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun/session.py +25 -12
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3/duckrun.egg-info}/PKG-INFO +1 -1
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun.egg-info/SOURCES.txt +2 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/pyproject.toml +1 -1
- duckrun-0.3.17.dev2/dbt/adapters/duckrun/__version__.py +0 -1
- duckrun-0.3.17.dev2/dbt/include/duckrun/macros/catalog.sql +0 -59
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/LICENSE +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/MANIFEST.in +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/README.md +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/credentials.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/remote.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/secret.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/dbt_project.yml +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun/auth.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun/delta_table.py +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/setup.cfg +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version = "0.3.17.dev3"
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""Route raw SQL DML against duckrun-managed (Delta-backed) relations to delta_rs.
|
|
2
|
+
|
|
3
|
+
duckrun intercepts writes at the dbt *materialization* layer (a model/seed/snapshot goes through
|
|
4
|
+
the materialization macros -> store_relation -> delta_rs). But a duckrun relation is surfaced as a
|
|
5
|
+
read-only ``delta_scan`` view, so *raw* DML sent straight to the connection — ``delete from``,
|
|
6
|
+
``update``, ``insert into ... select``, ``alter table ... add column``, ``create table ... as
|
|
7
|
+
select`` — lands on a view and fails ("Can only delete from base table"), or would create a native
|
|
8
|
+
DuckDB table that bypasses Delta entirely.
|
|
9
|
+
|
|
10
|
+
This module intercepts those statements (at the cursor, see environment.DuckrunCursorWrapper) and
|
|
11
|
+
applies them to the Delta table **via delta_rs only**, then refreshes the ``delta_scan`` view — so
|
|
12
|
+
nothing relies on a native, mutable DuckDB table, and every op works on local AND abfss/OneLake
|
|
13
|
+
stores (delta_rs carries ``storage_options``). ``create table ... as`` writes a new Delta table;
|
|
14
|
+
the mutate forms (delete/update/insert/alter) apply only when a Delta table already exists at the
|
|
15
|
+
target (otherwise the statement passes through — e.g. the test's native ``fact``/``seed``).
|
|
16
|
+
|
|
17
|
+
``drop table`` unregisters the ``delta_scan`` view AND overwrites the table (via delta_rs) to a
|
|
18
|
+
one-column ``TOMBSTONE_COLUMN`` marker, which discovery recognizes and hides. It does NOT delete
|
|
19
|
+
data: delta_rs has no drop, and removing the Delta files would be a filesystem hack that fails on
|
|
20
|
+
object stores. The directory persists until a human purges it; a later ``create table ... as``
|
|
21
|
+
overwrites the tombstone with real data and the table is live again.
|
|
22
|
+
|
|
23
|
+
The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
|
|
24
|
+
on a native DuckDB table, not a Delta table: ``create table (<col defs>)`` doesn't match the
|
|
25
|
+
``... as select`` form, and while ``insert ... values`` now *does* match a form here, the mutate
|
|
26
|
+
guard only applies it when a Delta table already exists at the target — the seed's native table has
|
|
27
|
+
none, so it falls through untouched. duckrun's own materializations emit ``create ... view`` (not
|
|
28
|
+
``table``), so they pass through too.
|
|
29
|
+
"""
|
|
30
|
+
import re
|
|
31
|
+
from typing import List, Optional, Tuple
|
|
32
|
+
|
|
33
|
+
from . import engine
|
|
34
|
+
|
|
35
|
+
# `drop table` tombstone: a dropped relation is overwritten (via delta_rs) to a table whose ONLY
|
|
36
|
+
# column is this marker, so (a) discovery recognizes it as dropped and hides it, and (b) anyone who
|
|
37
|
+
# opens the files sees an obviously-not-a-real-table schema rather than a plausible empty table. No
|
|
38
|
+
# data is deleted — the directory stays until a human purges it; a later `create table ... as`
|
|
39
|
+
# overwrites the marker schema with real data and the table is live again.
|
|
40
|
+
TOMBSTONE_COLUMN = "__duckrun_deleted__"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _columns_are_tombstone(colnames) -> bool:
|
|
44
|
+
return [str(c).lower() for c in colnames] == [TOMBSTONE_COLUMN]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def is_dropped(con, location: str, storage_options=None) -> bool:
    """Report whether the Delta table at ``location`` is a duckrun drop-tombstone.

    A tombstone is a table whose only column is the marker column. The check runs a zero-row
    ``delta_scan`` through ``con`` and inspects the result's column names.

    Best-effort: if the scan cannot be executed for any reason the table is reported as
    "not a tombstone" so that normal handling deals with it. ``storage_options`` is accepted for
    signature parity with the other helpers but is not consulted here.
    """
    escaped = str(location).replace("'", "''")
    probe_sql = f"select * from delta_scan('{escaped}') limit 0"
    try:
        result = con.execute(probe_sql)
        names = [column[0] for column in result.description]
        return _columns_are_tombstone(names)
    except Exception:
        return False
|
|
59
|
+
|
|
60
|
+
# --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
|
|
61
|
+
_CREATE_AS = re.compile(
|
|
62
|
+
r"\s*create\s+table\s+(?:if\s+not\s+exists\s+)?(?P<rel>.+?)\s+as\s+(?P<body>select\b.*)",
|
|
63
|
+
re.I | re.S,
|
|
64
|
+
)
|
|
65
|
+
_INSERT_SELECT = re.compile(
|
|
66
|
+
r"\s*insert\s+into\s+(?P<rel>.+?)\s+(?P<body>select\b.*)", re.I | re.S
|
|
67
|
+
)
|
|
68
|
+
_INSERT_VALUES = re.compile(
|
|
69
|
+
r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
|
|
70
|
+
re.I | re.S,
|
|
71
|
+
)
|
|
72
|
+
_DELETE = re.compile(
|
|
73
|
+
r"\s*delete\s+from\s+(?P<rel>.+?)(?:\s+where\s+(?P<where>.+))?\s*;?\s*", re.I | re.S
|
|
74
|
+
)
|
|
75
|
+
_UPDATE = re.compile(
|
|
76
|
+
r"\s*update\s+(?P<rel>.+?)\s+set\s+(?P<set>.+?)(?:\s+where\s+(?P<where>.+?))?\s*;?\s*",
|
|
77
|
+
re.I | re.S,
|
|
78
|
+
)
|
|
79
|
+
_ALTER_ADD = re.compile(
|
|
80
|
+
r"\s*alter\s+table\s+(?P<rel>.+?)\s+add\s+column\s+(?P<col>\S+)\s+(?P<def>.+?)\s*;?\s*",
|
|
81
|
+
re.I | re.S,
|
|
82
|
+
)
|
|
83
|
+
_DROP = re.compile(
|
|
84
|
+
r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _fullmatch(pattern, sql):
|
|
89
|
+
return pattern.fullmatch(sql.strip())
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _split_relation(rel: str) -> Tuple[Optional[str], Optional[str]]:
|
|
93
|
+
"""`"db"."schema"."tbl"` / `schema.tbl` / `tbl` -> (schema, identifier), quotes stripped."""
|
|
94
|
+
parts = [p.strip().strip('"') for p in rel.strip().split(".")]
|
|
95
|
+
if not parts or not parts[-1]:
|
|
96
|
+
return None, None
|
|
97
|
+
identifier = parts[-1]
|
|
98
|
+
schema = parts[-2] if len(parts) >= 2 else None
|
|
99
|
+
return schema, identifier
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _split_top_level_commas(s: str) -> List[str]:
|
|
103
|
+
"""Split on commas that aren't inside parentheses or quotes (so ``left(email, 3)`` stays whole)."""
|
|
104
|
+
out, depth, start, quote = [], 0, 0, None
|
|
105
|
+
for i, ch in enumerate(s):
|
|
106
|
+
if quote:
|
|
107
|
+
if ch == quote:
|
|
108
|
+
quote = None
|
|
109
|
+
elif ch in ("'", '"'):
|
|
110
|
+
quote = ch
|
|
111
|
+
elif ch in "([":
|
|
112
|
+
depth += 1
|
|
113
|
+
elif ch in ")]":
|
|
114
|
+
depth -= 1
|
|
115
|
+
elif ch == "," and depth == 0:
|
|
116
|
+
out.append(s[start:i])
|
|
117
|
+
start = i + 1
|
|
118
|
+
out.append(s[start:])
|
|
119
|
+
return [p.strip() for p in out if p.strip()]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class _DeltaDML:
|
|
123
|
+
"""One attempt to handle a statement; ``run()`` returns True if it was applied to Delta."""
|
|
124
|
+
|
|
125
|
+
def __init__(self, cursor, root_path: str, storage_options, default_schema=None):
|
|
126
|
+
self.cursor = cursor
|
|
127
|
+
self.root_path = root_path.rstrip("/")
|
|
128
|
+
self.so = storage_options
|
|
129
|
+
self.default_schema = default_schema
|
|
130
|
+
|
|
131
|
+
def _loc(self, schema: str, identifier: str) -> str:
|
|
132
|
+
return f"{self.root_path}/{schema}/{identifier}"
|
|
133
|
+
|
|
134
|
+
def _resolve(self, rel: str):
|
|
135
|
+
"""(schema, identifier, location) for ``rel``, falling back to default_schema for an
|
|
136
|
+
unqualified name (the connection API relies on a current database). (None, None, None) when
|
|
137
|
+
no schema can be determined."""
|
|
138
|
+
schema, identifier = _split_relation(rel)
|
|
139
|
+
schema = schema or self.default_schema
|
|
140
|
+
if not schema or not identifier:
|
|
141
|
+
return None, None, None
|
|
142
|
+
return schema, identifier, self._loc(schema, identifier)
|
|
143
|
+
|
|
144
|
+
def _exists(self, loc: str) -> bool:
|
|
145
|
+
return engine.table_exists(loc, self.so)
|
|
146
|
+
|
|
147
|
+
def _refresh_view(self, rel: str, schema: str, loc: str) -> None:
|
|
148
|
+
loc_sql = loc.replace("'", "''")
|
|
149
|
+
self.cursor.execute(f'create schema if not exists "{schema}"')
|
|
150
|
+
self.cursor.execute(
|
|
151
|
+
f"create or replace view {rel} as select * from delta_scan('{loc_sql}')"
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
def try_handle(self, sql: str) -> bool:
|
|
155
|
+
m = _fullmatch(_CREATE_AS, sql)
|
|
156
|
+
if m and "__duckrun" not in m.group("rel"):
|
|
157
|
+
return self._create_as(m)
|
|
158
|
+
m = _fullmatch(_INSERT_SELECT, sql)
|
|
159
|
+
if m:
|
|
160
|
+
return self._mutate(m, self._insert_select)
|
|
161
|
+
m = _fullmatch(_INSERT_VALUES, sql)
|
|
162
|
+
if m:
|
|
163
|
+
return self._mutate(m, self._insert_values)
|
|
164
|
+
m = _fullmatch(_DELETE, sql)
|
|
165
|
+
if m:
|
|
166
|
+
return self._mutate(m, self._delete)
|
|
167
|
+
m = _fullmatch(_UPDATE, sql)
|
|
168
|
+
if m:
|
|
169
|
+
return self._mutate(m, self._update)
|
|
170
|
+
m = _fullmatch(_ALTER_ADD, sql)
|
|
171
|
+
if m:
|
|
172
|
+
return self._mutate(m, self._alter_add)
|
|
173
|
+
m = _fullmatch(_DROP, sql)
|
|
174
|
+
if m:
|
|
175
|
+
return self._drop(m)
|
|
176
|
+
return False
|
|
177
|
+
|
|
178
|
+
# -- create table <rel> as <select>: always materialize as a duckrun Delta table -----------
|
|
179
|
+
def _create_as(self, m) -> bool:
|
|
180
|
+
rel = m.group("rel").strip()
|
|
181
|
+
schema, identifier, loc = self._resolve(rel)
|
|
182
|
+
if not loc:
|
|
183
|
+
return False
|
|
184
|
+
data = self.cursor.sql(m.group("body"))
|
|
185
|
+
# overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
|
|
186
|
+
# table is recreated with the real schema, clearing any tombstone marker.
|
|
187
|
+
engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
|
|
188
|
+
self._refresh_view(rel, schema, loc)
|
|
189
|
+
return True
|
|
190
|
+
|
|
191
|
+
# -- forms that only apply when a Delta table already exists at the target ------------------
|
|
192
|
+
def _mutate(self, m, op) -> bool:
|
|
193
|
+
rel = m.group("rel").strip()
|
|
194
|
+
schema, identifier, loc = self._resolve(rel)
|
|
195
|
+
if not loc or not self._exists(loc):
|
|
196
|
+
return False # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
|
|
197
|
+
op(m, rel, schema, loc)
|
|
198
|
+
self._refresh_view(rel, schema, loc)
|
|
199
|
+
return True
|
|
200
|
+
|
|
201
|
+
def _delete(self, m, rel, schema, loc) -> None:
|
|
202
|
+
where = m.group("where")
|
|
203
|
+
engine._delta_table(loc, self.so).delete(predicate=where.strip() if where else None)
|
|
204
|
+
|
|
205
|
+
def _update(self, m, rel, schema, loc) -> None:
|
|
206
|
+
updates = {}
|
|
207
|
+
for assign in _split_top_level_commas(m.group("set")):
|
|
208
|
+
col, _, expr = assign.partition("=")
|
|
209
|
+
updates[col.strip().strip('"')] = expr.strip()
|
|
210
|
+
where = m.group("where")
|
|
211
|
+
engine._delta_table(loc, self.so).update(
|
|
212
|
+
updates=updates, predicate=where.strip() if where else None
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
def _insert_select(self, m, rel, schema, loc) -> None:
|
|
216
|
+
data = self.cursor.sql(m.group("body"))
|
|
217
|
+
engine.write_delta(loc, data, "append", storage_options=self.so)
|
|
218
|
+
|
|
219
|
+
def _insert_values(self, m, rel, schema, loc) -> None:
|
|
220
|
+
# `insert into <rel> [(<cols>)] values (...)`: evaluate the VALUES tuples through DuckDB and
|
|
221
|
+
# project them onto the FULL target Delta schema (so append schemas match) — supplied columns
|
|
222
|
+
# come from the literals, any unsupplied target column is filled with a typed NULL.
|
|
223
|
+
loc_sql = loc.replace("'", "''")
|
|
224
|
+
template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
|
|
225
|
+
target_cols = list(template.columns)
|
|
226
|
+
target_types = [str(t) for t in template.types]
|
|
227
|
+
by_lower = {c.lower(): c for c in target_cols}
|
|
228
|
+
|
|
229
|
+
cols = m.group("cols")
|
|
230
|
+
if cols: # explicit column list → canonicalize to the target's casing
|
|
231
|
+
provided = [by_lower.get(c.strip().strip('"').lower(), c.strip().strip('"'))
|
|
232
|
+
for c in cols.split(",")]
|
|
233
|
+
else: # positional → the literals supply every target column, in order
|
|
234
|
+
provided = target_cols
|
|
235
|
+
provided_set = {c for c in provided}
|
|
236
|
+
|
|
237
|
+
quoted = ", ".join('"' + c + '"' for c in provided)
|
|
238
|
+
inner = f"(values {m.group('body')}) v({quoted})"
|
|
239
|
+
# Cast every projected column to the TARGET column's type — both supplied values and the
|
|
240
|
+
# typed NULLs — so the appended Arrow schema matches the table exactly. This is also what a
|
|
241
|
+
# plain SQL INSERT does (a literal is coerced to the column type), and it stops a literal
|
|
242
|
+
# whose inferred type is wider than the column (e.g. a ::timestamp into a DATE column) from
|
|
243
|
+
# forcing delta_rs to add a new writer feature on append (TimestampWithoutTimezone).
|
|
244
|
+
exprs = [
|
|
245
|
+
f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
|
|
246
|
+
else f'cast(null as {typ}) as "{col}"'
|
|
247
|
+
for col, typ in zip(target_cols, target_types)
|
|
248
|
+
]
|
|
249
|
+
data = self.cursor.sql(f"select {', '.join(exprs)} from {inner}")
|
|
250
|
+
engine.write_delta(loc, data, "append", storage_options=self.so)
|
|
251
|
+
|
|
252
|
+
def _alter_add(self, m, rel, schema, loc) -> None:
|
|
253
|
+
col = m.group("col").strip().strip('"')
|
|
254
|
+
# Keep only the column type (drop any DEFAULT/NULL clause); add it as an all-null column by
|
|
255
|
+
# rewriting the table with overwrite_schema so delta_rs accepts the widened schema.
|
|
256
|
+
coltype = re.split(r"\s+default\b|\s+null\b", m.group("def"), flags=re.I)[0].strip() or "VARCHAR"
|
|
257
|
+
loc_sql = loc.replace("'", "''")
|
|
258
|
+
data = self.cursor.sql(
|
|
259
|
+
f'select *, cast(null as {coltype}) as "{col}" from delta_scan(\'{loc_sql}\')'
|
|
260
|
+
)
|
|
261
|
+
engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
|
|
262
|
+
|
|
263
|
+
def _drop(self, m) -> bool:
|
|
264
|
+
# `drop table` on a duckrun relation: unregister the delta_scan view AND, via delta_rs,
|
|
265
|
+
# overwrite the table to a one-column tombstone (TOMBSTONE_COLUMN) so a later glob discovery
|
|
266
|
+
# hides it. NO data is deleted — delta_rs has no drop, and removing the Delta files would be
|
|
267
|
+
# a filesystem hack that fails on object stores. The directory persists until a human purges
|
|
268
|
+
# it; a later `create table ... as` overwrites the tombstone with real data. If the relation
|
|
269
|
+
# isn't a duckrun-managed Delta table, fall through and let DuckDB drop the native table.
|
|
270
|
+
rel = m.group("rel").strip()
|
|
271
|
+
schema, identifier, loc = self._resolve(rel)
|
|
272
|
+
if not loc or not self._exists(loc):
|
|
273
|
+
return False
|
|
274
|
+
tombstone = self.cursor.sql(f"select true as {TOMBSTONE_COLUMN}")
|
|
275
|
+
engine.write_delta(loc, tombstone, "overwrite", overwrite_schema=True, storage_options=self.so)
|
|
276
|
+
self.cursor.execute(f"drop view if exists {rel}")
|
|
277
|
+
return True
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def handle(cursor, root_path, storage_options, sql: str, default_schema=None) -> bool:
    """Route ``sql`` to Delta when it is a DML form aimed at a duckrun-managed relation.

    ``cursor`` evaluates any SELECT body and (re)creates the ``delta_scan`` view. Every handled
    form goes through delta_rs (``engine.write_delta`` / ``DeltaTable.delete`` / ``.update``),
    which carries ``storage_options`` and so works on local AND abfss/OneLake stores.
    ``default_schema`` resolves an unqualified table name (the connection API has a current
    database; the dbt path always renders fully-qualified names so passes None).

    Returns True if handled (the caller must NOT also run it on DuckDB), False to pass through —
    no root path configured, anything unrecognized, or (for the mutate forms) a target that isn't
    a Delta table.
    """
    candidate_verbs = ("delete", "update", "insert", "create", "alter", "drop")
    if not root_path:
        return False
    # Cheap pre-filter on the statement's first few characters before any regex work.
    prefix = sql.lstrip()[:7].lower()
    if prefix.startswith(candidate_verbs):
        attempt = _DeltaDML(cursor, root_path, storage_options, default_schema)
        return attempt.try_handle(sql)
    return False
|
|
@@ -6,6 +6,7 @@ connection (``configure_connection``), and on ``store()`` hands the model relati
|
|
|
6
6
|
straight to delta_rs. DuckDB relations expose the Arrow C-stream interface, which
|
|
7
7
|
deltalake 1.x consumes directly, so there is no pyarrow dependency.
|
|
8
8
|
"""
|
|
9
|
+
import re
|
|
9
10
|
from typing import Any, Optional
|
|
10
11
|
|
|
11
12
|
from dbt.adapters.duckdb.plugins import BasePlugin
|
|
@@ -155,11 +156,22 @@ class Plugin(BasePlugin):
|
|
|
155
156
|
# Table-like (non-incremental) models always overwrite. Incremental models
|
|
156
157
|
# overwrite on first run / full-refresh, then apply the incremental strategy.
|
|
157
158
|
if not incremental or full_refresh or not exists:
|
|
159
|
+
# This branch is a CREATE OR REPLACE: a table model, a --full-refresh, or a first run.
|
|
160
|
+
# When we are REPLACING an existing table (exists), allow delta_rs to replace the schema
|
|
161
|
+
# wholesale (schema_mode="overwrite") — the model SQL defines the new schema, exactly as
|
|
162
|
+
# `CREATE OR REPLACE TABLE` does on every other warehouse. Without it, delta_rs's strict
|
|
163
|
+
# overwrite keeps the OLD schema/protocol and so can't change a column's type or write a
|
|
164
|
+
# column needing a new writer feature the old table lacks (e.g. retyping to ::timestamp /
|
|
165
|
+
# timestampNtz). This is scoped to the full-rebuild replace ONLY — NOT append, safeappend,
|
|
166
|
+
# merge, or microbatch, which must keep their strict, schema-stable writes. A fresh create
|
|
167
|
+
# (not exists) doesn't need it. A user's explicit merge_schema still wins.
|
|
168
|
+
overwrite_schema = exists and not merge_schema
|
|
158
169
|
with engine.mem_profile("overwrite", con=cur):
|
|
159
170
|
engine.write_delta(
|
|
160
171
|
path, data, "overwrite",
|
|
161
172
|
partition_by=partition_by,
|
|
162
173
|
merge_schema=merge_schema,
|
|
174
|
+
overwrite_schema=overwrite_schema,
|
|
163
175
|
storage_options=storage_options,
|
|
164
176
|
compaction_threshold=self._compaction_threshold,
|
|
165
177
|
)
|
|
@@ -201,7 +213,7 @@ class Plugin(BasePlugin):
|
|
|
201
213
|
insert_only=(strategy == "insert"),
|
|
202
214
|
update_columns=cfg.get("merge_update_columns"),
|
|
203
215
|
exclude_columns=cfg.get("merge_exclude_columns"),
|
|
204
|
-
predicates=self._merge_predicates(cfg),
|
|
216
|
+
predicates=self._merge_predicates(cfg, data.columns),
|
|
205
217
|
update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
|
|
206
218
|
insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
|
|
207
219
|
merge_schema=evolve_schema,
|
|
@@ -470,16 +482,37 @@ class Plugin(BasePlugin):
|
|
|
470
482
|
return None
|
|
471
483
|
return str(expr).replace("DBT_INTERNAL_DEST", "target").replace("DBT_INTERNAL_SOURCE", "source")
|
|
472
484
|
|
|
485
|
+
@staticmethod
|
|
486
|
+
def _qualify_predicate(expr, columns):
|
|
487
|
+
"""Prefix bare references to known target columns with ``target.``.
|
|
488
|
+
|
|
489
|
+
duckrun folds ``incremental_predicates`` into the merge condition
|
|
490
|
+
(``target.k = source.k AND <predicate>``). A bare column there (e.g. ``id != 2``) exists
|
|
491
|
+
on BOTH the source and target, so delta_rs rejects it as an ambiguous reference. dbt's
|
|
492
|
+
``incremental_predicates`` constrain the existing/target rows (the delete+insert delete, the
|
|
493
|
+
merge ON), so we qualify bare column tokens to ``target.``. Only exact column-name tokens
|
|
494
|
+
that aren't already qualified (preceded by ``.``) or quoted/literal are rewritten — literals
|
|
495
|
+
and functions (e.g. ``current_date``, which is not a column) are left untouched."""
|
|
496
|
+
if not expr or not columns:
|
|
497
|
+
return expr
|
|
498
|
+
# Longest names first so a column that's a prefix of another isn't partially matched.
|
|
499
|
+
for col in sorted({str(c) for c in columns}, key=len, reverse=True):
|
|
500
|
+
# whole-word col, not preceded by '.', a word char, or a quote (already qualified/quoted).
|
|
501
|
+
pattern = re.compile(r'(?<![.\w"\'])' + re.escape(col) + r'\b', re.I)
|
|
502
|
+
expr = pattern.sub(lambda m: "target." + m.group(0), expr)
|
|
503
|
+
return expr
|
|
504
|
+
|
|
473
505
|
@classmethod
|
|
474
|
-
def _merge_predicates(cls, cfg: dict):
|
|
506
|
+
def _merge_predicates(cls, cfg: dict, columns=None):
|
|
475
507
|
"""dbt ``incremental_predicates`` (or ``predicates``), with dbt's standard merge
|
|
476
|
-
aliases rewritten to the ones delta_rs uses here
|
|
508
|
+
aliases rewritten to the ones delta_rs uses here and bare column refs qualified to
|
|
509
|
+
``target.`` (see ``_qualify_predicate``)."""
|
|
477
510
|
preds = cfg.get("incremental_predicates") or cfg.get("predicates")
|
|
478
511
|
if not preds:
|
|
479
512
|
return None
|
|
480
513
|
if isinstance(preds, str):
|
|
481
514
|
preds = [preds]
|
|
482
|
-
return [cls._rewrite_merge_aliases(p) for p in preds]
|
|
515
|
+
return [cls._qualify_predicate(cls._rewrite_merge_aliases(p), columns) for p in preds]
|
|
483
516
|
|
|
484
517
|
@staticmethod
|
|
485
518
|
def _resolve_schema_change(on_schema_change, path, data, storage_options) -> bool:
|
|
@@ -536,6 +536,37 @@ def table_exists(path: str, storage_options: Optional[Dict[str, str]] = None) ->
|
|
|
536
536
|
return False
|
|
537
537
|
|
|
538
538
|
|
|
539
|
+
def delta_stats(cur, path: str, storage_options: Optional[Dict[str, str]] = None) -> Optional[Dict[str, Optional[int]]]:
    """Cheap table statistics for ``dbt docs generate``, read from the Delta **log** (no data scan).

    ``DeltaTable.get_add_actions()`` carries per-file ``num_records`` / ``size_bytes`` /
    ``modification_time``; summing rows+bytes and taking the latest mtime gives the whole table's
    stats without opening any data file. Aggregation goes through the DuckDB cursor (``cur``) via a
    replacement scan over the arro3 table — no pyarrow dependency.

    Args:
        cur: DuckDB cursor used to run the aggregation query.
        path: Location of the Delta table.
        storage_options: Passed through to ``_delta_table`` when opening the table.

    Returns ``{"num_rows", "bytes", "last_modified"}`` (last_modified = epoch milliseconds), or
    ``None`` on ANY failure (a drop-tombstone, a missing table, an unreachable/credential-less remote
    store). Best-effort by design: a catalog without stats is fine, but a docs build must never break.
    """
    try:
        # The local name matters: the SQL below refers to ``add_actions`` by name, which is why
        # the "unused variable" lint is silenced. Do not rename or inline it.
        add_actions = _delta_table(path, storage_options).get_add_actions()  # noqa: F841 (replacement scan)
        row = cur.sql(
            "select coalesce(sum(num_records), 0)::bigint, "
            "coalesce(sum(size_bytes), 0)::bigint, "
            "max(modification_time)::bigint from add_actions"
        ).fetchone()
    except Exception as exc:  # best-effort: docs stats must never fail catalog generation
        logger.debug(f"duckrun: no Delta stats for {path!r}: {exc}")
        return None
    if row is None:
        return None
    return {
        "num_rows": int(row[0]),
        "bytes": int(row[1]),
        # max() over zero add-actions is NULL, so the timestamp may legitimately be absent.
        "last_modified": int(row[2]) if row[2] is not None else None,
    }
|
|
568
|
+
|
|
569
|
+
|
|
539
570
|
# Delta column-metadata key under which we stash a dbt column description, and the dollar-quote
|
|
540
571
|
# label used to embed arbitrary comment text (newlines, quotes, dollar signs) in COMMENT ON SQL.
|
|
541
572
|
_DELTA_COMMENT_KEY = "comment"
|
|
@@ -12,10 +12,52 @@ process. We do the same for plugin sources here: instead of registering a Python
|
|
|
12
12
|
``CREATE OR REPLACE VIEW <source> AS <scan sql>``. No pyarrow, no copying the source into a table,
|
|
13
13
|
and no dependence on dbt-duckdb's per-cursor relation re-registration.
|
|
14
14
|
"""
|
|
15
|
-
from dbt.adapters.duckdb.environments.local import
|
|
15
|
+
from dbt.adapters.duckdb.environments.local import (
|
|
16
|
+
DuckDBConnectionWrapper,
|
|
17
|
+
DuckDBCursorWrapper,
|
|
18
|
+
LocalEnvironment,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from . import delta_dml
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DuckrunCursorWrapper(DuckDBCursorWrapper):
    """Cursor wrapper that diverts raw DML on duckrun-managed (Delta-backed) relations to delta_rs
    rather than letting it hit the read-only ``delta_scan`` view.

    Every SQL statement — whether issued by dbt's connection manager or by the adapter-test
    harness (which goes straight to ``conn.handle.cursor().execute``) — funnels through here, so
    this is the single production interception point. Non-matching statements, parameterized
    statements (the seed loader's ``insert ... values (?)``), and DML against native relations all
    fall through to DuckDB unchanged. See delta_dml.handle.
    """

    def __init__(self, cursor, credentials):
        super().__init__(cursor)
        self._duckrun_creds = credentials

    def execute(self, sql, bindings=None):
        """Run ``sql``, applying it to Delta instead of DuckDB when ``delta_dml.handle`` takes it.

        Statements with bindings are never intercepted. When the statement was applied to Delta
        the underlying cursor is returned and nothing further is run on DuckDB.
        """
        if bindings is not None:
            return super().execute(sql, bindings)

        creds = self._duckrun_creds
        root_path = getattr(creds, "root_path", None)
        storage_options = getattr(creds, "storage_options", None)
        handled = delta_dml.handle(self._cursor, root_path, storage_options, sql)
        if handled:
            return self._cursor  # applied to Delta; nothing to run on DuckDB
        return super().execute(sql, bindings)
|
|
16
50
|
|
|
17
51
|
|
|
18
52
|
class DuckrunEnvironment(LocalEnvironment):
|
|
53
|
+
def handle(self):
    """Return the parent's connection handle with its cursor wrapper replaced by duckrun's.

    Swapping dbt-duckdb's cursor wrapper for ``DuckrunCursorWrapper`` means raw DML on Delta
    relations is intercepted on every cursor (connection-manager AND test-harness paths). A
    handle that is not a ``DuckDBConnectionWrapper`` is returned untouched.
    """
    conn = super().handle()
    if not isinstance(conn, DuckDBConnectionWrapper):
        return conn
    raw_cursor = conn._cursor._cursor
    conn._cursor = DuckrunCursorWrapper(raw_cursor, self.creds)
    return conn
|
|
60
|
+
|
|
19
61
|
def load_source(self, plugin_name: str, source_config):
|
|
20
62
|
plugin = self._plugins.get(plugin_name)
|
|
21
63
|
# Only special-case the duckrun plugin (it knows how to turn a source into scan SQL).
|
|
@@ -13,6 +13,7 @@ from dbt.adapters.events.logging import AdapterLogger
|
|
|
13
13
|
from dbt.adapters.duckdb.connections import DuckDBConnectionManager
|
|
14
14
|
from dbt.adapters.duckdb.impl import DuckDBAdapter
|
|
15
15
|
|
|
16
|
+
from dbt.adapters.duckrun import delta_dml
|
|
16
17
|
from dbt.adapters.duckrun import remote
|
|
17
18
|
from dbt.adapters.duckrun import secret
|
|
18
19
|
from dbt.adapters.duckrun.credentials import DuckrunCredentials
|
|
@@ -248,6 +249,22 @@ class DuckrunAdapter(DuckDBAdapter):
|
|
|
248
249
|
if not discovered:
|
|
249
250
|
return in_memory
|
|
250
251
|
|
|
252
|
+
# Hide drop-tombstones: a `drop table` overwrites the table to a one-column marker (no data
|
|
253
|
+
# deleted). Such a table must not surface as a relation. Check before registering.
|
|
254
|
+
root_path = getattr(self.config.credentials, "root_path", "") or ""
|
|
255
|
+
so = getattr(self.config.credentials, "storage_options", None)
|
|
256
|
+
cur = self._cursor()
|
|
257
|
+
live = []
|
|
258
|
+
for rel in discovered:
|
|
259
|
+
loc = (root_path.rstrip("/") + "/" + str(rel.schema).strip('"')
|
|
260
|
+
+ "/" + str(rel.identifier).strip('"'))
|
|
261
|
+
if delta_dml.is_dropped(cur, loc, so):
|
|
262
|
+
continue
|
|
263
|
+
live.append(rel)
|
|
264
|
+
discovered = live
|
|
265
|
+
if not discovered:
|
|
266
|
+
return in_memory
|
|
267
|
+
|
|
251
268
|
# Physically register each discovered Delta table as a delta_scan view so read-only
|
|
252
269
|
# commands (dbt test/show/docs) can query models without a prior in-process run.
|
|
253
270
|
for rel in discovered:
|
|
@@ -268,3 +285,71 @@ class DuckrunAdapter(DuckDBAdapter):
|
|
|
268
285
|
]
|
|
269
286
|
merged.extend(discovered)
|
|
270
287
|
return merged
|
|
288
|
+
|
|
289
|
+
# --- dbt docs: table stats from the Delta log -------------------------------------------------
# The stock catalog query (duckrun__get_catalog) emits only column metadata, so dbt-docs shows an
# empty Stats panel (issue #3). dbt assembles the panel from columns named
# stats:<key>:{label,value,description,include}; we enrich the catalog agate table with those,
# sourced from each relation's Delta log (engine.delta_stats — no data scan). Done in Python here
# rather than in SQL because byte size / last-modified live in the Delta log, not DuckDB metadata.
#
# Each entry is (stats key, display label, description). The key is the <key> part of the
# stats:<key>:* column names.
_STATS_SPEC = (
    ("num_rows", "Row Count", "Number of rows in the table"),
    ("bytes", "Approximate Size", "Approximate size of the table on disk (bytes)"),
    ("last_modified", "Last Modified", "Time of the most recent Delta commit (UTC)"),
)
|
|
300
|
+
|
|
301
|
+
def get_catalog(self, *args, **kwargs):
    """Build the catalog via the parent adapter, then append Delta-log stats columns.

    All arguments are forwarded untouched to the parent ``get_catalog``. The parent's
    second return value (its exceptions) is passed through unchanged next to the
    enriched table.
    """
    catalog, errors = super().get_catalog(*args, **kwargs)
    enriched = self._with_delta_stats(catalog)
    return enriched, errors
|
|
304
|
+
|
|
305
|
+
def get_catalog_by_relations(self, *args, **kwargs):
    """Relation-scoped catalog from the parent adapter, with Delta-log stats columns appended.

    All arguments are forwarded untouched to the parent ``get_catalog_by_relations``. The
    parent's second return value (its exceptions) is passed through unchanged next to the
    enriched table.
    """
    catalog, errors = super().get_catalog_by_relations(*args, **kwargs)
    enriched = self._with_delta_stats(catalog)
    return enriched, errors
|
|
308
|
+
|
|
309
|
+
def _with_delta_stats(self, table):
    """Return ``table`` with stats:* columns appended, sourced per-relation from the Delta log.

    The Delta location of each catalog row is derived as ``root_path/schema/name`` and looked
    up once per ``(schema, name)``; the catalog repeats a relation on several rows, so results
    are cached. A relation for which no stats are available (a native ``view``, a
    drop-tombstone) gets ``include=False`` stats, so dbt leaves it statless.

    Best-effort: if anything goes wrong (missing dependency, unreadable Delta log, unexpected
    value) the failure is logged and the original table is returned unchanged — docs must
    never break.

    Args:
        table: catalog agate table; rows are expected to carry ``table_schema`` and
            ``table_name`` columns.

    Returns:
        A new agate table with ``stats:<key>:{label,value,description,include}`` columns for
        every entry of ``_STATS_SPEC``, or ``table`` itself when there is no ``root_path``,
        no rows, or enrichment failed.
    """
    root_path = getattr(self.config.credentials, "root_path", "") or ""
    if not root_path or len(table.rows) == 0:
        return table

    try:
        # Imported inside the guarded region so a missing/broken dependency degrades to
        # "no stats" instead of failing catalog generation.
        from datetime import datetime, timezone
        from dbt_common.clients.agate_helper import table_from_data_flat
        from . import delta_dml, engine

        so = getattr(self.config.credentials, "storage_options", None)
        cur = self._cursor()
        base = root_path.rstrip("/")

        cache = {}

        def stats_for(schema, name):
            # One Delta-log lookup per relation, however many catalog rows it spans.
            key = (schema, name)
            if key not in cache:
                loc = base + "/" + str(schema).strip('"') + "/" + str(name).strip('"')
                # A drop-tombstone is treated as "no stats" without consulting delta_stats.
                cache[key] = (None if delta_dml.is_dropped(cur, loc, so)
                              else engine.delta_stats(cur, loc, so))
            return cache[key]

        cols = list(table.column_names)
        stat_cols = [f"stats:{k}:{p}" for k, _, _ in self._STATS_SPEC
                     for p in ("label", "value", "description", "include")]
        rows = []
        for r in table.rows:
            d = dict(zip(cols, r))
            st = stats_for(d.get("table_schema"), d.get("table_name"))
            for k, label, desc in self._STATS_SPEC:
                val = st.get(k) if st is not None else None
                present = val is not None
                if present and k == "last_modified":
                    # The stored value is divided by 1000 before conversion, i.e. it is
                    # treated as epoch milliseconds; rendered as an ISO-8601 UTC string.
                    val = datetime.fromtimestamp(val / 1000, tz=timezone.utc).isoformat()
                d[f"stats:{k}:label"] = label
                d[f"stats:{k}:value"] = val
                d[f"stats:{k}:description"] = desc
                d[f"stats:{k}:include"] = present
            rows.append(d)
        return table_from_data_flat(rows, cols + stat_cols)
    except Exception:
        # Deliberate catch-all boundary: stats are optional decoration on the catalog, so any
        # failure falls back to the un-enriched table rather than aborting `dbt docs`.
        import logging

        logging.getLogger(__name__).warning(
            "duckrun: could not attach Delta stats to the catalog; returning it without stats",
            exc_info=True,
        )
        return table
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
{#-- duckrun catalog: report Delta-backed relations as BASE TABLE.
|
|
2
|
+
|
|
3
|
+
Every duckrun model is physically a DuckDB *view* over `delta_scan('<location>')`, so the stock
|
|
4
|
+
dbt-duckdb catalog (duckdb_views() -> 'VIEW') reports them as views — which is dishonest: they are
|
|
5
|
+
Delta *tables*, and `dbt docs generate` / is_incremental() treat them as tables. We override the
|
|
6
|
+
catalog so a view whose definition reads from `delta_scan(...)` is reported as `BASE TABLE`, while
|
|
7
|
+
genuine `view`-materialized models (no delta_scan) stay `VIEW`. Comments (table + column) come
|
|
8
|
+
through unchanged from WS4's COMMENT ON, which get_catalog already reads.
|
|
9
|
+
|
|
10
|
+
Stats are intentionally not synthesized here: the duckrun conformance catalog fixtures use
|
|
11
|
+
`no_stats()`, so row/byte counts from the Delta log aren't required to pass — and computing them
|
|
12
|
+
per relation would re-open every table during docs generate. (Stats for dbt docs are instead
|
|
13
|
+
appended in Python by the adapter's get_catalog override, which reads them from the Delta log.)
|
|
14
|
+
#}
|
|
15
|
+
{% macro duckrun__get_catalog(information_schema, schemas) -%}
  {#-- Build the dbt catalog (one row per column) for the given schemas of one database.
       Native DuckDB tables, and views whose SQL contains `delta_scan(`, are reported as
       BASE TABLE; every other view stays VIEW. --#}
  {#-- Take the database from the information_schema relation being cataloged, exactly as
       duckrun__get_catalog_relations does, instead of relying on the ambient `database`. --#}
  {%- set database = information_schema.database -%}
  {%- call statement('catalog', fetch_result=True) -%}
    with relations AS (
      select
        t.table_name
        , t.database_name
        , t.schema_name
        , 'BASE TABLE' as table_type
        , t.comment as table_comment
      from duckdb_tables() t
      WHERE t.database_name = '{{ database }}'
      UNION ALL
      SELECT v.view_name as table_name
      , v.database_name
      , v.schema_name
      -- A delta_scan view is a Delta table surfaced for reads; report it as a table, not a view.
      , case when v.sql ilike '%delta_scan(%' then 'BASE TABLE' else 'VIEW' end as table_type
      , v.comment as table_comment
      from duckdb_views() v
      WHERE v.database_name = '{{ database }}'
    )
    select
        '{{ database }}' as table_database,
        r.schema_name as table_schema,
        r.table_name,
        r.table_type,
        r.table_comment,
        c.column_name,
        c.column_index as column_index,
        c.data_type as column_type,
        c.comment as column_comment,
        NULL as table_owner
    -- join on database too: an attached DB can hold a same-named table in the same schema, and
    -- without this the columns of that shadow relation would bleed into the result.
    FROM relations r JOIN duckdb_columns() c ON r.database_name = c.database_name AND r.schema_name = c.schema_name AND r.table_name = c.table_name
    WHERE (
        {%- for schema in schemas -%}
          upper(r.schema_name) = upper('{{ schema }}'){%- if not loop.last %} or {% endif -%}
        {%- else -%}
          {#-- No schemas requested: match nothing rather than render an empty, invalid `()`. --#}
          false
        {%- endfor -%}
    )
    ORDER BY
        r.schema_name,
        r.table_name,
        c.column_index
  {%- endcall -%}
  {{ return(load_result('catalog').table) }}
{%- endmacro %}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
{#-- duckrun catalog, scoped to a specific set of relations (the `get_catalog_relations`
|
|
65
|
+
capability dbt uses when it can ask for just the relations it cares about — see
|
|
66
|
+
BaseAdapter._get_one_catalog_by_relations). Same delta_scan-view → BASE TABLE rule as
|
|
67
|
+
duckrun__get_catalog above; the only difference is the WHERE clause filters to the passed
|
|
68
|
+
relations (by schema, and by identifier when one is given) instead of to whole schemas.
|
|
69
|
+
Without this, dbt falls back to default__get_catalog_relations, which just raises
|
|
70
|
+
"not implemented for duckrun". #}
|
|
71
|
+
{% macro duckrun__get_catalog_relations(information_schema, relations) -%}
  {#-- Build the dbt catalog (one row per column) restricted to the passed relations.
       Each relation filters by schema, and additionally by identifier when it has one.
       Native DuckDB tables, and views whose SQL contains `delta_scan(`, are reported as
       BASE TABLE; every other view stays VIEW. --#}
  {%- set database = information_schema.database -%}
  {%- call statement('catalog', fetch_result=True) -%}
    with relations AS (
      select
        t.table_name
        , t.database_name
        , t.schema_name
        , 'BASE TABLE' as table_type
        , t.comment as table_comment
      from duckdb_tables() t
      WHERE t.database_name = '{{ database }}'
      UNION ALL
      SELECT v.view_name as table_name
      , v.database_name
      , v.schema_name
      -- A delta_scan view is a Delta table surfaced for reads; report it as a table, not a view.
      , case when v.sql ilike '%delta_scan(%' then 'BASE TABLE' else 'VIEW' end as table_type
      , v.comment as table_comment
      from duckdb_views() v
      WHERE v.database_name = '{{ database }}'
    )
    select
        '{{ database }}' as table_database,
        r.schema_name as table_schema,
        r.table_name,
        r.table_type,
        r.table_comment,
        c.column_name,
        c.column_index as column_index,
        c.data_type as column_type,
        c.comment as column_comment,
        NULL as table_owner
    -- join on database too: an attached DB can hold a same-named table in the same schema, and
    -- without this the columns of that shadow relation would bleed into the result.
    FROM relations r JOIN duckdb_columns() c ON r.database_name = c.database_name AND r.schema_name = c.schema_name AND r.table_name = c.table_name
    WHERE (
        {%- for relation in relations -%}
          (
            upper(r.schema_name) = upper('{{ relation.schema }}')
            {%- if relation.identifier %} and upper(r.table_name) = upper('{{ relation.identifier }}'){%- endif -%}
          )
          {%- if not loop.last %} or {% endif -%}
        {%- else -%}
          {#-- No relations requested: match nothing rather than render an empty, invalid `()`. --#}
          false
        {%- endfor -%}
    )
    ORDER BY
        r.schema_name,
        r.table_name,
        c.column_index
  {%- endcall -%}
  {{ return(load_result('catalog').table) }}
{%- endmacro %}
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
{#
|
|
2
|
+
dbt `snapshot` materialization, backed by Delta Lake.
|
|
3
|
+
|
|
4
|
+
Why duckrun needs its own: dbt's default snapshot materialization runs `create table` /
|
|
5
|
+
`merge` as SQL DDL/DML against the warehouse. On duckrun the warehouse is Delta + an
|
|
6
|
+
in-memory DuckDB session, so the default snapshot lands only in the in-memory catalog and is
|
|
7
|
+
LOST across processes — a snapshot appears to work within one `dbt` invocation but never
|
|
8
|
+
persists (real SCD2 is cross-invocation). This materialization persists the snapshot to Delta
|
|
9
|
+
via the same store path the table/incremental materializations use.
|
|
10
|
+
|
|
11
|
+
How it maps onto duckrun's supported merge (no SQL MERGE, no merge_clauses):
|
|
12
|
+
|
|
13
|
+
* First run / --full-refresh: `build_snapshot_table` SELECT -> overwrite the Delta table.
|
|
14
|
+
* Subsequent runs: `snapshot_staging_table` (dbt's own SCD2 change detection) produces the
|
|
15
|
+
'insert' rows (new versions, fresh dbt_scd_id) and the 'update'/'delete' rows (close the
|
|
16
|
+
open version, carry the new dbt_valid_to under the *existing* dbt_scd_id). We project away
|
|
17
|
+
the dbt_change_type / dbt_unique_key helper columns and MERGE on dbt_scd_id with
|
|
18
|
+
merge_update_columns=[dbt_valid_to]:
|
|
19
|
+
- close rows -> matched -> update dbt_valid_to (only that column)
|
|
20
|
+
- insert rows -> not matched -> insert the new version
|
|
21
|
+
The change_type partition is implicit: closes share the open version's scd_id (so they
|
|
22
|
+
match), inserts get a brand-new scd_id (so they don't). This reproduces dbt's
|
|
23
|
+
default__snapshot_merge_sql exactly with the merge controls delta_rs can express.
|
|
24
|
+
#}
|
|
25
|
+
{% materialization snapshot, adapter='duckrun' %}

  {#-- Snapshot materialization that persists to the Delta table at `location`.
       First run / full refresh: build the initial snapshot and overwrite the Delta table.
       Later runs: stage dbt's change set and merge it on dbt_scd_id, updating only
       dbt_valid_to on matched rows. Returns the target relation. --#}

  {%- set strategy_name = config.get('strategy') -%}
  {%- set unique_key = config.get('unique_key') -%}

  {%- set p = duckrun__delta_paths() -%}
  {%- set target_relation = p['target'] -%}
  {%- set tmp_relation = p['tmp'] -%}
  {%- set location = p['location'] -%}
  {#-- Location escaped for embedding in a single-quoted SQL string literal. --#}
  {%- set _loc_sql = location | replace("'", "''") -%}

  {#-- Version/existence of the Delta table, captured before the model reads anything, so the
       merge can pin OCC to it (single snapshot for the staging read and the merge commit). --#}
  {%- set read_version = adapter.delta_version(location) -%}
  {%- set exists = adapter.delta_table_exists(location) -%}

  {{ run_hooks(pre_hooks, inside_transaction=False) }}
  {%- do adapter.create_schema(target_relation) -%}
  {{ run_hooks(pre_hooks, inside_transaction=True) }}

  {% set strategy_macro = strategy_dispatch(strategy_name) %}
  {% set strategy = strategy_macro(model, "snapshotted_data", "source_data", model['config'], exists) %}

  {% if not exists or should_full_refresh() %}

    {#-- First run (or full refresh): build the initial snapshot and overwrite the Delta table. --#}
    {% set build_sql = build_snapshot_table(strategy, model['compiled_code']) %}
    {{ check_time_data_types(build_sql) }}

    {% call statement('stage_model') -%}
      create or replace view {{ tmp_relation }} as {{ build_sql }}
    {%- endcall %}
    {{ adapter.commit() }}
    {%- set columns = adapter.get_columns_in_relation(tmp_relation) -%}
    {% do adapter.store_relation('duckrun', tmp_relation, columns, location, 'delta', {
        'incremental': false,
        'full_refresh': true,
        'invocation_id': invocation_id,
    }) %}
    {% call statement('drop_stage') -%}
      drop view if exists {{ tmp_relation }}
    {%- endcall %}

  {% else %}

    {% set snapshot_cols = config.get("snapshot_table_column_names") or get_snapshot_table_column_names() %}
    {{ adapter.assert_valid_snapshot_target_given_strategy(target_relation, snapshot_cols, strategy) }}

    {#-- Pin the existing snapshot read to read_version so the staging change-detection sees one
         consistent version (matches the merge's OCC pin below). --#}
    {% call statement('register_this') -%}
      create or replace view {{ target_relation }} as
        select * from delta_scan('{{ _loc_sql }}'{% if read_version is not none %}, version => {{ read_version }}{% endif %})
    {%- endcall %}

    {% set staging_sql = snapshot_staging_table(strategy, sql, target_relation) %}
    {{ check_time_data_types(staging_sql) }}
    {% call statement('stage_model') -%}
      create or replace view {{ tmp_relation }} as {{ staging_sql }}
    {%- endcall %}
    {{ adapter.commit() }}

    {#-- Drop dbt's staging-only helper columns; the merge source must match the snapshot table. --#}
    {% set remove_columns = ['dbt_change_type', 'DBT_CHANGE_TYPE', 'dbt_unique_key', 'DBT_UNIQUE_KEY'] %}
    {% if unique_key | is_list %}
      {% for key in strategy.unique_key %}
        {% do remove_columns.append('dbt_unique_key_' + loop.index|string) %}
        {% do remove_columns.append('DBT_UNIQUE_KEY_' + loop.index|string) %}
      {% endfor %}
    {% endif %}
    {% set source_columns = adapter.get_columns_in_relation(tmp_relation)
                                 | rejectattr('name', 'in', remove_columns) | list %}
    {#-- Quote every projected column: a bare name that is a reserved word or contains special
         characters would otherwise break the projection below. --#}
    {% set quoted_columns = [] %}
    {% for column in source_columns %}
      {% do quoted_columns.append(adapter.quote(column.name)) %}
    {% endfor %}
    {% set col_csv = quoted_columns | join(', ') %}

    {%- set merge_src = api.Relation.create(
          database=target_relation.database,
          schema=target_relation.schema,
          identifier=target_relation.identifier ~ '__duckrun_snap_src',
          type='view') -%}
    {% call statement('stage_merge_src') -%}
      create or replace view {{ merge_src }} as select {{ col_csv }} from {{ tmp_relation }}
    {%- endcall %}
    {{ adapter.commit() }}
    {%- set columns = adapter.get_columns_in_relation(merge_src) -%}

    {#-- Merge on dbt_scd_id; matched (closes) update only dbt_valid_to, unmatched (new versions)
         insert. read_version pins OCC to the version the staging read. --#}
    {% do adapter.store_relation('duckrun', merge_src, columns, location, 'delta', {
        'incremental': true,
        'incremental_strategy': 'merge',
        'unique_key': snapshot_cols.dbt_scd_id,
        'merge_update_columns': [snapshot_cols.dbt_valid_to],
        'read_version': read_version,
        'dbt_believes_exists': true,
        'full_refresh': false,
        'on_schema_change': 'ignore',
        'invocation_id': invocation_id,
    }) %}

    {% call statement('drop_stage') -%}
      drop view if exists {{ merge_src }}; drop view if exists {{ tmp_relation }}
    {%- endcall %}

  {% endif %}

  {#-- Surface the snapshot as a delta_scan view over the freshly written HEAD. --#}
  {%- do adapter.create_schema(target_relation) -%}
  {% call statement('main') -%}
    create or replace view {{ target_relation }} as select * from delta_scan('{{ _loc_sql }}')
  {%- endcall %}

  {% do persist_docs(target_relation, model) %}

  {{ run_hooks(post_hooks, inside_transaction=True) }}
  {{ adapter.commit() }}
  {{ run_hooks(post_hooks, inside_transaction=False) }}

  {{ return({'relations': [target_relation]}) }}

{% endmaterialization %}
|
|
@@ -14,7 +14,7 @@ from typing import Dict, List, Optional
|
|
|
14
14
|
|
|
15
15
|
import duckdb
|
|
16
16
|
|
|
17
|
-
from dbt.adapters.duckrun import engine, remote, secret
|
|
17
|
+
from dbt.adapters.duckrun import delta_dml, engine, remote, secret
|
|
18
18
|
from . import auth
|
|
19
19
|
|
|
20
20
|
|
|
@@ -164,6 +164,10 @@ class DuckSession:
|
|
|
164
164
|
continue
|
|
165
165
|
self.con.execute(f"CREATE SCHEMA IF NOT EXISTS {_qid(schema)}")
|
|
166
166
|
for table in tables:
|
|
167
|
+
# Hide drop-tombstones (a `drop table` overwrites the table to a one-column marker;
|
|
168
|
+
# no data is deleted, the files persist, but the table must not surface).
|
|
169
|
+
if delta_dml.is_dropped(self.con, self.table_path(schema, table), self.storage_options):
|
|
170
|
+
continue
|
|
167
171
|
self._register_view(schema, table)
|
|
168
172
|
registered.append(f"{schema}.{table}")
|
|
169
173
|
|
|
@@ -204,20 +208,29 @@ class DuckSession:
|
|
|
204
208
|
# ---- Spark-shaped surface --------------------------------------------------------------
|
|
205
209
|
|
|
206
210
|
def sql(self, query: str) -> "DataFrame":
|
|
207
|
-
"""Run a
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
211
|
+
"""Run a query and return a :class:`DataFrame`.
|
|
212
|
+
|
|
213
|
+
Reads pass straight through to DuckDB over the ``delta_scan`` views (time-travel works for
|
|
214
|
+
free — ``conn.sql("from delta_scan('path', version => 0)")``).
|
|
215
|
+
|
|
216
|
+
Delta **DML** is applied to the Delta table via delta_rs (works local AND on OneLake):
|
|
217
|
+
``create table … as select`` (overwrite), ``insert into … select``/``insert into … values``
|
|
218
|
+
(append), ``delete``/``update`` (delta_rs delete/update), ``alter table … add column``, and
|
|
219
|
+
``drop table`` (tombstone — marks the table dropped without deleting data; a human purges
|
|
220
|
+
the files). After a DML statement the catalog is refreshed.
|
|
221
|
+
|
|
222
|
+
``merge`` isn't expressible via delta_rs DML here — use the Spark write surface instead:
|
|
223
|
+
``df.write.saveAsTable(...)`` or
|
|
224
|
+
``conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()``.
|
|
225
|
+
``CREATE TEMP/VIEW`` and other DuckDB-local scratch DDL pass through to DuckDB.
|
|
217
226
|
"""
|
|
227
|
+
if delta_dml.handle(self.con, self.root_path, self.storage_options, query,
|
|
228
|
+
default_schema=self._current_database):
|
|
229
|
+
self.refresh(quiet=True)
|
|
230
|
+
return DataFrame(self.con.sql("SELECT 'ok' AS status"), self)
|
|
218
231
|
if _is_delta_write(query):
|
|
219
232
|
raise ValueError(
|
|
220
|
-
"conn.sql()
|
|
233
|
+
"conn.sql() can't run a SQL MERGE via delta_rs. "
|
|
221
234
|
"Use the Spark write API: df.write.saveAsTable(...) to create/append, or "
|
|
222
235
|
"conn.delta_table(name).merge(...)/.delete()/.update()/.replaceWhere()."
|
|
223
236
|
)
|
|
@@ -5,6 +5,7 @@ pyproject.toml
|
|
|
5
5
|
dbt/adapters/duckrun/__init__.py
|
|
6
6
|
dbt/adapters/duckrun/__version__.py
|
|
7
7
|
dbt/adapters/duckrun/credentials.py
|
|
8
|
+
dbt/adapters/duckrun/delta_dml.py
|
|
8
9
|
dbt/adapters/duckrun/delta_plugin.py
|
|
9
10
|
dbt/adapters/duckrun/engine.py
|
|
10
11
|
dbt/adapters/duckrun/environment.py
|
|
@@ -17,6 +18,7 @@ dbt/include/duckrun/macros/catalog.sql
|
|
|
17
18
|
dbt/include/duckrun/macros/materializations/_delta_core.sql
|
|
18
19
|
dbt/include/duckrun/macros/materializations/delta.sql
|
|
19
20
|
dbt/include/duckrun/macros/materializations/incremental.sql
|
|
21
|
+
dbt/include/duckrun/macros/materializations/snapshot.sql
|
|
20
22
|
dbt/include/duckrun/macros/materializations/table.sql
|
|
21
23
|
duckrun/__init__.py
|
|
22
24
|
duckrun/auth.py
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.3.17.
|
|
7
|
+
version = "0.3.17.dev3"
|
|
8
8
|
description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
version = "0.3.17.dev2"
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
{#-- duckrun catalog: report Delta-backed relations as BASE TABLE.
|
|
2
|
-
|
|
3
|
-
Every duckrun model is physically a DuckDB *view* over `delta_scan('<location>')`, so the stock
|
|
4
|
-
dbt-duckdb catalog (duckdb_views() -> 'VIEW') reports them as views — which is dishonest: they are
|
|
5
|
-
Delta *tables*, and `dbt docs generate` / is_incremental() treat them as tables. We override the
|
|
6
|
-
catalog so a view whose definition reads from `delta_scan(...)` is reported as `BASE TABLE`, while
|
|
7
|
-
genuine `view`-materialized models (no delta_scan) stay `VIEW`. Comments (table + column) come
|
|
8
|
-
through unchanged from WS4's COMMENT ON, which get_catalog already reads.
|
|
9
|
-
|
|
10
|
-
Stats are intentionally not synthesized here: the duckrun conformance catalog fixtures use
|
|
11
|
-
`no_stats()`, so row/byte counts from the Delta log aren't required to pass — and computing them
|
|
12
|
-
per relation would re-open every table during docs generate. (If stats are wanted later, pull
|
|
13
|
-
num_records / size_bytes from DeltaTable.get_add_actions and cache per build.)
|
|
14
|
-
#}
|
|
15
|
-
{% macro duckrun__get_catalog(information_schema, schemas) -%}
|
|
16
|
-
{%- call statement('catalog', fetch_result=True) -%}
|
|
17
|
-
with relations AS (
|
|
18
|
-
select
|
|
19
|
-
t.table_name
|
|
20
|
-
, t.database_name
|
|
21
|
-
, t.schema_name
|
|
22
|
-
, 'BASE TABLE' as table_type
|
|
23
|
-
, t.comment as table_comment
|
|
24
|
-
from duckdb_tables() t
|
|
25
|
-
WHERE t.database_name = '{{ database }}'
|
|
26
|
-
UNION ALL
|
|
27
|
-
SELECT v.view_name as table_name
|
|
28
|
-
, v.database_name
|
|
29
|
-
, v.schema_name
|
|
30
|
-
-- A delta_scan view is a Delta table surfaced for reads; report it as a table, not a view.
|
|
31
|
-
, case when v.sql ilike '%delta_scan(%' then 'BASE TABLE' else 'VIEW' end as table_type
|
|
32
|
-
, v.comment as table_comment
|
|
33
|
-
from duckdb_views() v
|
|
34
|
-
WHERE v.database_name = '{{ database }}'
|
|
35
|
-
)
|
|
36
|
-
select
|
|
37
|
-
'{{ database }}' as table_database,
|
|
38
|
-
r.schema_name as table_schema,
|
|
39
|
-
r.table_name,
|
|
40
|
-
r.table_type,
|
|
41
|
-
r.table_comment,
|
|
42
|
-
c.column_name,
|
|
43
|
-
c.column_index as column_index,
|
|
44
|
-
c.data_type as column_type,
|
|
45
|
-
c.comment as column_comment,
|
|
46
|
-
NULL as table_owner
|
|
47
|
-
FROM relations r JOIN duckdb_columns() c ON r.schema_name = c.schema_name AND r.table_name = c.table_name
|
|
48
|
-
WHERE (
|
|
49
|
-
{%- for schema in schemas -%}
|
|
50
|
-
upper(r.schema_name) = upper('{{ schema }}'){%- if not loop.last %} or {% endif -%}
|
|
51
|
-
{%- endfor -%}
|
|
52
|
-
)
|
|
53
|
-
ORDER BY
|
|
54
|
-
r.schema_name,
|
|
55
|
-
r.table_name,
|
|
56
|
-
c.column_index
|
|
57
|
-
{%- endcall -%}
|
|
58
|
-
{{ return(load_result('catalog').table) }}
|
|
59
|
-
{%- endmacro %}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/delta.sql
RENAMED
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev2 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/table.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|