duckrun 0.3.17.dev1__tar.gz → 0.3.17.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {duckrun-0.3.17.dev1/duckrun.egg-info → duckrun-0.3.17.dev3}/PKG-INFO +2 -1
  2. duckrun-0.3.17.dev3/dbt/adapters/duckrun/__version__.py +1 -0
  3. duckrun-0.3.17.dev3/dbt/adapters/duckrun/delta_dml.py +297 -0
  4. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/delta_plugin.py +79 -42
  5. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/engine.py +158 -0
  6. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/environment.py +43 -1
  7. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/impl.py +85 -0
  8. duckrun-0.3.17.dev3/dbt/include/duckrun/macros/catalog.sql +122 -0
  9. duckrun-0.3.17.dev3/dbt/include/duckrun/macros/materializations/snapshot.sql +144 -0
  10. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun/session.py +25 -12
  11. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3/duckrun.egg-info}/PKG-INFO +2 -1
  12. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun.egg-info/SOURCES.txt +2 -0
  13. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun.egg-info/requires.txt +1 -0
  14. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/pyproject.toml +8 -1
  15. duckrun-0.3.17.dev1/dbt/adapters/duckrun/__version__.py +0 -1
  16. duckrun-0.3.17.dev1/dbt/include/duckrun/macros/catalog.sql +0 -59
  17. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/LICENSE +0 -0
  18. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/MANIFEST.in +0 -0
  19. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/README.md +0 -0
  20. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/__init__.py +0 -0
  21. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/credentials.py +0 -0
  22. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/remote.py +0 -0
  23. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/adapters/duckrun/secret.py +0 -0
  24. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/include/duckrun/__init__.py +0 -0
  25. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/include/duckrun/dbt_project.yml +0 -0
  26. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
  27. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
  28. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
  29. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
  30. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun/__init__.py +0 -0
  31. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun/auth.py +0 -0
  32. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun/delta_table.py +0 -0
  33. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun.egg-info/dependency_links.txt +0 -0
  34. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/duckrun.egg-info/top_level.txt +0 -0
  35. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev1
3
+ Version: 0.3.17.dev3
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
13
  Requires-Dist: dbt-duckdb>=1.8
14
+ Requires-Dist: dbt-core<2.0,>=1.8
14
15
  Requires-Dist: duckdb==1.5.4.dev18
15
16
  Requires-Dist: deltalake<1.5.1,>=1.5.0
16
17
  Requires-Dist: requests
@@ -0,0 +1 @@
1
+ version = "0.3.17.dev3"
@@ -0,0 +1,297 @@
1
+ """Route raw SQL DML against duckrun-managed (Delta-backed) relations to delta_rs.
2
+
3
+ duckrun intercepts writes at the dbt *materialization* layer (a model/seed/snapshot goes through
4
+ the materialization macros -> store_relation -> delta_rs). But a duckrun relation is surfaced as a
5
+ read-only ``delta_scan`` view, so *raw* DML sent straight to the connection — ``delete from``,
6
+ ``update``, ``insert into ... select``, ``alter table ... add column``, ``create table ... as
7
+ select`` — lands on a view and fails ("Can only delete from base table"), or would create a native
8
+ DuckDB table that bypasses Delta entirely.
9
+
10
+ This module intercepts those statements (at the cursor, see environment.DuckrunCursorWrapper) and
11
+ applies them to the Delta table **via delta_rs only**, then refreshes the ``delta_scan`` view — so
12
+ nothing relies on a native, mutable DuckDB table, and every op works on local AND abfss/OneLake
13
+ stores (delta_rs carries ``storage_options``). ``create table ... as`` writes a new Delta table;
14
+ the mutate forms (delete/update/insert/alter) apply only when a Delta table already exists at the
15
+ target (otherwise the statement passes through — e.g. the test's native ``fact``/``seed``).
16
+
17
+ ``drop table`` unregisters the ``delta_scan`` view AND overwrites the table (via delta_rs) to a
18
+ one-column ``TOMBSTONE_COLUMN`` marker, which discovery recognizes and hides. It does NOT delete
19
+ data: delta_rs has no drop, and removing the Delta files would be a filesystem hack that fails on
20
+ object stores. The directory persists until a human purges it; a later ``create table ... as``
21
+ overwrites the tombstone with real data and the table is live again.
22
+
23
+ The seed loader's own SQL (``create table <t> (<col defs>)``, ``insert ... values``, ``COPY``) lands
24
+ on a native DuckDB table, not a Delta table: ``create table (<col defs>)`` doesn't match the
25
+ ``... as select`` form, and while ``insert ... values`` now *does* match a form here, the mutate
26
+ guard only applies it when a Delta table already exists at the target — the seed's native table has
27
+ none, so it falls through untouched. duckrun's own materializations emit ``create ... view`` (not
28
+ ``table``), so they pass through too.
29
+ """
30
+ import re
31
+ from typing import List, Optional, Tuple
32
+
33
+ from . import engine
34
+
35
+ # `drop table` tombstone: a dropped relation is overwritten (via delta_rs) to a table whose ONLY
36
+ # column is this marker, so (a) discovery recognizes it as dropped and hides it, and (b) anyone who
37
+ # opens the files sees an obviously-not-a-real-table schema rather than a plausible empty table. No
38
+ # data is deleted — the directory stays until a human purges it; a later `create table ... as`
39
+ # overwrites the marker schema with real data and the table is live again.
40
+ TOMBSTONE_COLUMN = "__duckrun_deleted__"
41
+
42
+
43
+ def _columns_are_tombstone(colnames) -> bool:
44
+ return [str(c).lower() for c in colnames] == [TOMBSTONE_COLUMN]
45
+
46
+
47
+ def is_dropped(con, location: str, storage_options=None) -> bool:
48
+ """True if the Delta table at ``location`` is a duckrun drop-tombstone (single marker column).
49
+
50
+ Used by discovery (dbt + connection API) to hide dropped tables. Best-effort: anything that
51
+ can't be opened/scanned is treated as 'not a tombstone' (let normal handling deal with it).
52
+ """
53
+ loc_sql = str(location).replace("'", "''")
54
+ try:
55
+ rel = con.execute(f"select * from delta_scan('{loc_sql}') limit 0")
56
+ return _columns_are_tombstone([d[0] for d in rel.description])
57
+ except Exception:
58
+ return False
59
+
60
+ # --- statement matchers (leading-anchored, DOTALL so multi-line bodies match) ----------------
61
+ _CREATE_AS = re.compile(
62
+ r"\s*create\s+table\s+(?:if\s+not\s+exists\s+)?(?P<rel>.+?)\s+as\s+(?P<body>select\b.*)",
63
+ re.I | re.S,
64
+ )
65
+ _INSERT_SELECT = re.compile(
66
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s+(?P<body>select\b.*)", re.I | re.S
67
+ )
68
+ _INSERT_VALUES = re.compile(
69
+ r"\s*insert\s+into\s+(?P<rel>.+?)\s*(?:\((?P<cols>[^)]*)\))?\s*values\s+(?P<body>\(.+)",
70
+ re.I | re.S,
71
+ )
72
+ _DELETE = re.compile(
73
+ r"\s*delete\s+from\s+(?P<rel>.+?)(?:\s+where\s+(?P<where>.+))?\s*;?\s*", re.I | re.S
74
+ )
75
+ _UPDATE = re.compile(
76
+ r"\s*update\s+(?P<rel>.+?)\s+set\s+(?P<set>.+?)(?:\s+where\s+(?P<where>.+?))?\s*;?\s*",
77
+ re.I | re.S,
78
+ )
79
+ _ALTER_ADD = re.compile(
80
+ r"\s*alter\s+table\s+(?P<rel>.+?)\s+add\s+column\s+(?P<col>\S+)\s+(?P<def>.+?)\s*;?\s*",
81
+ re.I | re.S,
82
+ )
83
+ _DROP = re.compile(
84
+ r"\s*drop\s+table\s+(?:if\s+exists\s+)?(?P<rel>[^\s;]+)\s*;?\s*", re.I | re.S
85
+ )
86
+
87
+
88
+ def _fullmatch(pattern, sql):
89
+ return pattern.fullmatch(sql.strip())
90
+
91
+
92
+ def _split_relation(rel: str) -> Tuple[Optional[str], Optional[str]]:
93
+ """`"db"."schema"."tbl"` / `schema.tbl` / `tbl` -> (schema, identifier), quotes stripped."""
94
+ parts = [p.strip().strip('"') for p in rel.strip().split(".")]
95
+ if not parts or not parts[-1]:
96
+ return None, None
97
+ identifier = parts[-1]
98
+ schema = parts[-2] if len(parts) >= 2 else None
99
+ return schema, identifier
100
+
101
+
102
+ def _split_top_level_commas(s: str) -> List[str]:
103
+ """Split on commas that aren't inside parentheses, square brackets, or quotes (so ``left(email, 3)`` stays whole)."""
104
+ out, depth, start, quote = [], 0, 0, None
105
+ for i, ch in enumerate(s):
106
+ if quote:
107
+ if ch == quote:
108
+ quote = None
109
+ elif ch in ("'", '"'):
110
+ quote = ch
111
+ elif ch in "([":
112
+ depth += 1
113
+ elif ch in ")]":
114
+ depth -= 1
115
+ elif ch == "," and depth == 0:
116
+ out.append(s[start:i])
117
+ start = i + 1
118
+ out.append(s[start:])
119
+ return [p.strip() for p in out if p.strip()]
120
+
121
+
122
+ class _DeltaDML:
123
+ """One attempt to handle a statement; ``try_handle()`` returns True if it was applied to Delta."""
124
+
125
+ def __init__(self, cursor, root_path: str, storage_options, default_schema=None):
126
+ self.cursor = cursor
127
+ self.root_path = root_path.rstrip("/")
128
+ self.so = storage_options
129
+ self.default_schema = default_schema
130
+
131
+ def _loc(self, schema: str, identifier: str) -> str:
132
+ return f"{self.root_path}/{schema}/{identifier}"
133
+
134
+ def _resolve(self, rel: str):
135
+ """(schema, identifier, location) for ``rel``, falling back to default_schema for an
136
+ unqualified name (the connection API relies on a current database). (None, None, None) when
137
+ no schema or identifier can be determined."""
138
+ schema, identifier = _split_relation(rel)
139
+ schema = schema or self.default_schema
140
+ if not schema or not identifier:
141
+ return None, None, None
142
+ return schema, identifier, self._loc(schema, identifier)
143
+
144
+ def _exists(self, loc: str) -> bool:
145
+ return engine.table_exists(loc, self.so)
146
+
147
+ def _refresh_view(self, rel: str, schema: str, loc: str) -> None:
148
+ loc_sql = loc.replace("'", "''")
149
+ self.cursor.execute(f'create schema if not exists "{schema}"')
150
+ self.cursor.execute(
151
+ f"create or replace view {rel} as select * from delta_scan('{loc_sql}')"
152
+ )
153
+
154
+ def try_handle(self, sql: str) -> bool:
155
+ m = _fullmatch(_CREATE_AS, sql)
156
+ if m and "__duckrun" not in m.group("rel"):
157
+ return self._create_as(m)
158
+ m = _fullmatch(_INSERT_SELECT, sql)
159
+ if m:
160
+ return self._mutate(m, self._insert_select)
161
+ m = _fullmatch(_INSERT_VALUES, sql)
162
+ if m:
163
+ return self._mutate(m, self._insert_values)
164
+ m = _fullmatch(_DELETE, sql)
165
+ if m:
166
+ return self._mutate(m, self._delete)
167
+ m = _fullmatch(_UPDATE, sql)
168
+ if m:
169
+ return self._mutate(m, self._update)
170
+ m = _fullmatch(_ALTER_ADD, sql)
171
+ if m:
172
+ return self._mutate(m, self._alter_add)
173
+ m = _fullmatch(_DROP, sql)
174
+ if m:
175
+ return self._drop(m)
176
+ return False
177
+
178
+ # -- create table <rel> as <select>: always materialize as a duckrun Delta table -----------
179
+ def _create_as(self, m) -> bool:
180
+ rel = m.group("rel").strip()
181
+ schema, identifier, loc = self._resolve(rel)
182
+ if not loc:
183
+ return False
184
+ data = self.cursor.sql(m.group("body"))
185
+ # overwrite_schema so this replaces a prior table (or a drop-tombstone) wholesale — a live
186
+ # table is recreated with the real schema, clearing any tombstone marker.
187
+ engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
188
+ self._refresh_view(rel, schema, loc)
189
+ return True
190
+
191
+ # -- forms that only apply when a Delta table already exists at the target ------------------
192
+ def _mutate(self, m, op) -> bool:
193
+ rel = m.group("rel").strip()
194
+ schema, identifier, loc = self._resolve(rel)
195
+ if not loc or not self._exists(loc):
196
+ return False # native relation (e.g. the test's `fact`/`seed`) -> let DuckDB handle it
197
+ op(m, rel, schema, loc)
198
+ self._refresh_view(rel, schema, loc)
199
+ return True
200
+
201
+ def _delete(self, m, rel, schema, loc) -> None:
202
+ where = m.group("where")
203
+ engine._delta_table(loc, self.so).delete(predicate=where.strip() if where else None)
204
+
205
+ def _update(self, m, rel, schema, loc) -> None:
206
+ updates = {}
207
+ for assign in _split_top_level_commas(m.group("set")):
208
+ col, _, expr = assign.partition("=")
209
+ updates[col.strip().strip('"')] = expr.strip()
210
+ where = m.group("where")
211
+ engine._delta_table(loc, self.so).update(
212
+ updates=updates, predicate=where.strip() if where else None
213
+ )
214
+
215
+ def _insert_select(self, m, rel, schema, loc) -> None:
216
+ data = self.cursor.sql(m.group("body"))
217
+ engine.write_delta(loc, data, "append", storage_options=self.so)
218
+
219
+ def _insert_values(self, m, rel, schema, loc) -> None:
220
+ # `insert into <rel> [(<cols>)] values (...)`: evaluate the VALUES tuples through DuckDB and
221
+ # project them onto the FULL target Delta schema (so append schemas match) — supplied columns
222
+ # come from the literals, any unsupplied target column is filled with a typed NULL.
223
+ loc_sql = loc.replace("'", "''")
224
+ template = self.cursor.sql(f"select * from delta_scan('{loc_sql}') limit 0")
225
+ target_cols = list(template.columns)
226
+ target_types = [str(t) for t in template.types]
227
+ by_lower = {c.lower(): c for c in target_cols}
228
+
229
+ cols = m.group("cols")
230
+ if cols: # explicit column list → canonicalize to the target's casing
231
+ provided = [by_lower.get(c.strip().strip('"').lower(), c.strip().strip('"'))
232
+ for c in cols.split(",")]
233
+ else: # positional → the literals supply every target column, in order
234
+ provided = target_cols
235
+ provided_set = {c for c in provided}
236
+
237
+ quoted = ", ".join('"' + c + '"' for c in provided)
238
+ inner = f"(values {m.group('body')}) v({quoted})"
239
+ # Cast every projected column to the TARGET column's type — both supplied values and the
240
+ # typed NULLs — so the appended Arrow schema matches the table exactly. This is also what a
241
+ # plain SQL INSERT does (a literal is coerced to the column type), and it stops a literal
242
+ # whose inferred type is wider than the column (e.g. a ::timestamp into a DATE column) from
243
+ # forcing delta_rs to add a new writer feature on append (TimestampWithoutTimezone).
244
+ exprs = [
245
+ f'cast(v."{col}" as {typ}) as "{col}"' if col in provided_set
246
+ else f'cast(null as {typ}) as "{col}"'
247
+ for col, typ in zip(target_cols, target_types)
248
+ ]
249
+ data = self.cursor.sql(f"select {', '.join(exprs)} from {inner}")
250
+ engine.write_delta(loc, data, "append", storage_options=self.so)
251
+
252
+ def _alter_add(self, m, rel, schema, loc) -> None:
253
+ col = m.group("col").strip().strip('"')
254
+ # Keep only the column type (drop any DEFAULT/NULL clause); add it as an all-null column by
255
+ # rewriting the table with overwrite_schema so delta_rs accepts the widened schema.
256
+ coltype = re.split(r"\s+default\b|\s+null\b", m.group("def"), flags=re.I)[0].strip() or "VARCHAR"
257
+ loc_sql = loc.replace("'", "''")
258
+ data = self.cursor.sql(
259
+ f'select *, cast(null as {coltype}) as "{col}" from delta_scan(\'{loc_sql}\')'
260
+ )
261
+ engine.write_delta(loc, data, "overwrite", overwrite_schema=True, storage_options=self.so)
262
+
263
+ def _drop(self, m) -> bool:
264
+ # `drop table` on a duckrun relation: unregister the delta_scan view AND, via delta_rs,
265
+ # overwrite the table to a one-column tombstone (TOMBSTONE_COLUMN) so a later glob discovery
266
+ # hides it. NO data is deleted — delta_rs has no drop, and removing the Delta files would be
267
+ # a filesystem hack that fails on object stores. The directory persists until a human purges
268
+ # it; a later `create table ... as` overwrites the tombstone with real data. If the relation
269
+ # isn't a duckrun-managed Delta table, fall through and let DuckDB drop the native table.
270
+ rel = m.group("rel").strip()
271
+ schema, identifier, loc = self._resolve(rel)
272
+ if not loc or not self._exists(loc):
273
+ return False
274
+ tombstone = self.cursor.sql(f"select true as {TOMBSTONE_COLUMN}")
275
+ engine.write_delta(loc, tombstone, "overwrite", overwrite_schema=True, storage_options=self.so)
276
+ self.cursor.execute(f"drop view if exists {rel}")
277
+ return True
278
+
279
+
280
+ def handle(cursor, root_path, storage_options, sql: str, default_schema=None) -> bool:
281
+ """Apply ``sql`` to Delta if it's a DML form targeting a duckrun-managed relation, using
282
+ ``cursor`` to evaluate any SELECT body and to (re)create the ``delta_scan`` view.
283
+
284
+ Every handled form goes through delta_rs (``engine.write_delta`` / ``DeltaTable.delete`` /
285
+ ``.update``), which carries ``storage_options`` and so works on local AND abfss/OneLake stores.
286
+ ``default_schema`` resolves an unqualified table name (the connection API has a current
287
+ database; the dbt path always renders fully-qualified names, so it passes None).
288
+ Returns True if handled (the caller must NOT also run it on DuckDB), False to pass through —
289
+ anything unrecognized, or (for the mutate forms) a target that isn't a Delta table.
290
+ """
291
+ if not root_path:
292
+ return False
293
+ # Cheap pre-filter: only the candidate DML verbs.
294
+ head = sql.lstrip()[:7].lower()
295
+ if not head.startswith(("delete", "update", "insert", "create", "alter", "drop")):
296
+ return False
297
+ return _DeltaDML(cursor, root_path, storage_options, default_schema).try_handle(sql)
@@ -6,6 +6,7 @@ connection (``configure_connection``), and on ``store()`` hands the model relati
6
6
  straight to delta_rs. DuckDB relations expose the Arrow C-stream interface, which
7
7
  deltalake 1.x consumes directly, so there is no pyarrow dependency.
8
8
  """
9
+ import re
9
10
  from typing import Any, Optional
10
11
 
11
12
  from dbt.adapters.duckdb.plugins import BasePlugin
@@ -155,13 +156,25 @@ class Plugin(BasePlugin):
155
156
  # Table-like (non-incremental) models always overwrite. Incremental models
156
157
  # overwrite on first run / full-refresh, then apply the incremental strategy.
157
158
  if not incremental or full_refresh or not exists:
158
- engine.write_delta(
159
- path, data, "overwrite",
160
- partition_by=partition_by,
161
- merge_schema=merge_schema,
162
- storage_options=storage_options,
163
- compaction_threshold=self._compaction_threshold,
164
- )
159
+ # This branch is a CREATE OR REPLACE: a table model, a --full-refresh, or a first run.
160
+ # When we are REPLACING an existing table (exists), allow delta_rs to replace the schema
161
+ # wholesale (schema_mode="overwrite") — the model SQL defines the new schema, exactly as
162
+ # `CREATE OR REPLACE TABLE` does on every other warehouse. Without it, delta_rs's strict
163
+ # overwrite keeps the OLD schema/protocol and so can't change a column's type or write a
164
+ # column needing a new writer feature the old table lacks (e.g. retyping to ::timestamp /
165
+ # timestampNtz). This is scoped to the full-rebuild replace ONLY — NOT append, safeappend,
166
+ # merge, or microbatch, which must keep their strict, schema-stable writes. A fresh create
167
+ # (not exists) doesn't need it. A user's explicit merge_schema still wins.
168
+ overwrite_schema = exists and not merge_schema
169
+ with engine.mem_profile("overwrite", con=cur):
170
+ engine.write_delta(
171
+ path, data, "overwrite",
172
+ partition_by=partition_by,
173
+ merge_schema=merge_schema,
174
+ overwrite_schema=overwrite_schema,
175
+ storage_options=storage_options,
176
+ compaction_threshold=self._compaction_threshold,
177
+ )
165
178
  return
166
179
 
167
180
  # Resolve the incremental strategy: default to merge when a unique_key is
@@ -194,31 +207,33 @@ class Plugin(BasePlugin):
194
207
  # prune the target (right for small incremental deltas into a large table). A model
195
208
  # whose source is itself huge can set merge_streamed_exec=true to stream it instead.
196
209
  sx = cfg.get("merge_streamed_exec")
197
- engine.merge_delta(
198
- path, data, unique_key,
199
- insert_only=(strategy == "insert"),
200
- update_columns=cfg.get("merge_update_columns"),
201
- exclude_columns=cfg.get("merge_exclude_columns"),
202
- predicates=self._merge_predicates(cfg),
203
- update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
204
- insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
205
- merge_schema=evolve_schema,
206
- max_spill_size=cfg.get("merge_max_spill_size"),
207
- streamed_exec=(False if sx is None else bool(sx)),
208
- # Pin the merge target to the version the model read (vB, captured before it read
209
- # {{ this }}), so OCC validates (vB, HEAD] the read and the commit are one snapshot.
210
- read_version=cfg.get("read_version"),
211
- storage_options=storage_options,
212
- compaction_threshold=self._compaction_threshold,
213
- )
210
+ with engine.mem_profile("merge", con=cur):
211
+ engine.merge_delta(
212
+ path, data, unique_key,
213
+ insert_only=(strategy == "insert"),
214
+ update_columns=cfg.get("merge_update_columns"),
215
+ exclude_columns=cfg.get("merge_exclude_columns"),
216
+ predicates=self._merge_predicates(cfg, data.columns),
217
+ update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
218
+ insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
219
+ merge_schema=evolve_schema,
220
+ max_spill_size=cfg.get("merge_max_spill_size"),
221
+ streamed_exec=(False if sx is None else bool(sx)),
222
+ # Pin the merge target to the version the model read (vB, captured before it read
223
+ # {{ this }}), so OCC validates (vB, HEAD] — read and commit are one snapshot.
224
+ read_version=cfg.get("read_version"),
225
+ storage_options=storage_options,
226
+ compaction_threshold=self._compaction_threshold,
227
+ )
214
228
  elif strategy == "append":
215
- engine.write_delta(
216
- path, data, "append",
217
- partition_by=partition_by,
218
- merge_schema=merge_schema,
219
- storage_options=storage_options,
220
- compaction_threshold=self._compaction_threshold,
221
- )
229
+ with engine.mem_profile("append", con=cur):
230
+ engine.write_delta(
231
+ path, data, "append",
232
+ partition_by=partition_by,
233
+ merge_schema=merge_schema,
234
+ storage_options=storage_options,
235
+ compaction_threshold=self._compaction_threshold,
236
+ )
222
237
  elif strategy == "safeappend":
223
238
  # Optimistic append: commit only if the table version has not moved since the model
224
239
  # *started* (read_version, captured before it read {{ this }}), else fail so dbt errors
@@ -226,14 +241,15 @@ class Plugin(BasePlugin):
226
241
  # is what closes the read→write gap: a writer that commits any time during the build
227
242
  # makes this fail instead of appending a duplicate. No dedup — that's the SQL's job.
228
243
  # Compare-and-swap via delta_rs max_commit_retries=0 (see engine).
229
- engine.append_if_unchanged(
230
- path, data,
231
- read_version=cfg.get("read_version"),
232
- partition_by=partition_by,
233
- merge_schema=merge_schema,
234
- storage_options=storage_options,
235
- compaction_threshold=self._compaction_threshold,
236
- )
244
+ with engine.mem_profile("safeappend", con=cur):
245
+ engine.append_if_unchanged(
246
+ path, data,
247
+ read_version=cfg.get("read_version"),
248
+ partition_by=partition_by,
249
+ merge_schema=merge_schema,
250
+ storage_options=storage_options,
251
+ compaction_threshold=self._compaction_threshold,
252
+ )
237
253
  else:
238
254
  raise ValueError(
239
255
  f"Unknown incremental_strategy '{strategy}'. "
@@ -466,16 +482,37 @@ class Plugin(BasePlugin):
466
482
  return None
467
483
  return str(expr).replace("DBT_INTERNAL_DEST", "target").replace("DBT_INTERNAL_SOURCE", "source")
468
484
 
485
+ @staticmethod
486
+ def _qualify_predicate(expr, columns):
487
+ """Prefix bare references to known target columns with ``target.``.
488
+
489
+ duckrun folds ``incremental_predicates`` into the merge condition
490
+ (``target.k = source.k AND <predicate>``). A bare column there (e.g. ``id != 2``) exists
491
+ on BOTH the source and target, so delta_rs rejects it as an ambiguous reference. dbt's
492
+ ``incremental_predicates`` constrain the existing/target rows (the delete+insert delete, the
493
+ merge ON), so we qualify bare column tokens to ``target.``. Only exact column-name tokens
494
+ that aren't already qualified (preceded by ``.``) or quoted/literal are rewritten — literals
495
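+ and functions (e.g. ``current_date``, which is not a column) are left untouched. Caveat: only the character immediately before a token is checked, so a column name in the middle of a multi-word string literal (e.g. ``'my id'``) is still rewritten."""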
496
+ if not expr or not columns:
497
+ return expr
498
+ # Longest names first so a column that's a prefix of another isn't partially matched.
499
+ for col in sorted({str(c) for c in columns}, key=len, reverse=True):
500
+ # whole-word col, not preceded by '.', a word char, or a quote (already qualified/quoted).
501
+ pattern = re.compile(r'(?<![.\w"\'])' + re.escape(col) + r'\b', re.I)
502
+ expr = pattern.sub(lambda m: "target." + m.group(0), expr)
503
+ return expr
504
+
469
505
  @classmethod
470
- def _merge_predicates(cls, cfg: dict):
506
+ def _merge_predicates(cls, cfg: dict, columns=None):
471
507
  """dbt ``incremental_predicates`` (or ``predicates``), with dbt's standard merge
472
- aliases rewritten to the ones delta_rs uses here."""
508
+ aliases rewritten to the ones delta_rs uses here and bare column refs qualified to
509
+ ``target.`` (see ``_qualify_predicate``)."""
473
510
  preds = cfg.get("incremental_predicates") or cfg.get("predicates")
474
511
  if not preds:
475
512
  return None
476
513
  if isinstance(preds, str):
477
514
  preds = [preds]
478
- return [cls._rewrite_merge_aliases(p) for p in preds]
515
+ return [cls._qualify_predicate(cls._rewrite_merge_aliases(p), columns) for p in preds]
479
516
 
480
517
  @staticmethod
481
518
  def _resolve_schema_change(on_schema_change, path, data, storage_options) -> bool:
@@ -196,6 +196,133 @@ def _effective_mem_limit_source() -> str:
196
196
  return "physical RAM"
197
197
 
198
198
 
199
+ # --------------------------------------------------------------- memory profiling (opt-in)
200
+ # A merge that OOMs has three suspects sharing one process: DuckDB (producing the source), the
201
+ # Arrow buffers delta_rs collects when streamed_exec=False, and delta_rs's own merge pool. RSS
202
+ # alone can't tell them apart. With DUCKRUN_MEM_PROFILE set, mem_profile() samples this process's
203
+ # RSS *and* DuckDB's own allocation through a write/merge and logs the split, so "who's the slob"
204
+ # is measured, not inferred. Off by default: no thread, no samples, no overhead in production.
205
+
206
+ def _proc_rss_bytes() -> Optional[int]:
207
+ """Resident set size of THIS process in bytes — the number the OOM-killer actually watches;
208
+ None if it can't be read. Linux: VmRSS from /proc/self/status. Windows: WorkingSetSize."""
209
+ try:
210
+ with open("/proc/self/status") as fh:
211
+ for line in fh:
212
+ if line.startswith("VmRSS:"):
213
+ return int(line.split()[1]) * 1024 # value is in kB
214
+ except (OSError, ValueError, IndexError):
215
+ pass
216
+ try: # Windows: GetProcessMemoryInfo -> WorkingSetSize
217
+ from ctypes import wintypes
218
+
219
+ class _PMC(ctypes.Structure):
220
+ _fields_ = [("cb", ctypes.c_ulong), ("PageFaultCount", ctypes.c_ulong)] + [
221
+ (n, ctypes.c_size_t) for n in (
222
+ "PeakWorkingSetSize", "WorkingSetSize", "QuotaPeakPagedPoolUsage",
223
+ "QuotaPagedPoolUsage", "QuotaPeakNonPagedPoolUsage", "QuotaNonPagedPoolUsage",
224
+ "PagefileUsage", "PeakPagefileUsage")
225
+ ]
226
+ # argtypes are required: GetCurrentProcess returns the pseudo-handle (-1), which overflows
227
+ # ctypes' default int marshalling unless the parameter is typed as a HANDLE.
228
+ k32 = ctypes.windll.kernel32
229
+ k32.GetCurrentProcess.restype = wintypes.HANDLE
230
+ psapi = ctypes.windll.psapi
231
+ psapi.GetProcessMemoryInfo.argtypes = [wintypes.HANDLE, ctypes.POINTER(_PMC), ctypes.c_ulong]
232
+ psapi.GetProcessMemoryInfo.restype = wintypes.BOOL
233
+ p = _PMC()
234
+ p.cb = ctypes.sizeof(_PMC)
235
+ if psapi.GetProcessMemoryInfo(k32.GetCurrentProcess(), ctypes.byref(p), p.cb):
236
+ return int(p.WorkingSetSize)
237
+ except Exception:
238
+ pass
239
+ return None
240
+
241
+
242
+ def _duckdb_mem_bytes(con):
243
+ """(allocated_bytes, temp_spill_bytes) DuckDB currently holds, via duckdb_memory(); None on any
244
+ error. Runs on a *separate* cursor so it's safe to call while another query streams on `con` —
245
+ and this is a diagnostic-only path, so it must never raise into the real write/merge."""
246
+ if con is None:
247
+ return None
248
+ try:
249
+ cur = con.cursor() # duckdb's cursor() is a new connection on the same instance
250
+ row = cur.execute(
251
+ "SELECT coalesce(sum(memory_usage_bytes), 0), "
252
+ "coalesce(sum(temporary_storage_bytes), 0) FROM duckdb_memory()"
253
+ ).fetchone()
254
+ return (int(row[0]), int(row[1]))
255
+ except Exception:
256
+ return None
257
+
258
+
259
+ class _MemSampler:
260
+ """Background RSS / DuckDB-memory sampler for one write or merge. See mem_profile()."""
261
+
262
+ def __init__(self, label: str, con=None, interval: float = 0.1):
263
+ self.label = label
264
+ self.con = con
265
+ self.interval = interval
266
+ self._thread = None
267
+ self._stop = None
268
+ self.samples = 0
269
+ self.peak_rss = 0
270
+ self.duckdb_at_rss_peak = None # DuckDB alloc at the instant RSS peaked
271
+ self.duckdb_spill_at_rss_peak = None
272
+ self.peak_duckdb = 0 # DuckDB's own high-water, independently
273
+
274
+ def __enter__(self):
275
+ if not os.environ.get("DUCKRUN_MEM_PROFILE"):
276
+ return self # disabled: no thread, no overhead
277
+ import threading
278
+ self._stop = threading.Event()
279
+ self._thread = threading.Thread(
280
+ target=self._run, name=f"duckrun-mem-{self.label}", daemon=True)
281
+ self._thread.start()
282
+ return self
283
+
284
+ def _run(self):
285
+ while not self._stop.is_set():
286
+ rss = _proc_rss_bytes()
287
+ dd = _duckdb_mem_bytes(self.con)
288
+ self.samples += 1
289
+ if dd is not None and dd[0] > self.peak_duckdb:
290
+ self.peak_duckdb = dd[0]
291
+ if rss is not None and rss > self.peak_rss:
292
+ self.peak_rss = rss
293
+ if dd is not None:
294
+ self.duckdb_at_rss_peak, self.duckdb_spill_at_rss_peak = dd
295
+ self._stop.wait(self.interval)
296
+
297
+ def __exit__(self, *exc):
298
+ if self._thread is None:
299
+ return False
300
+ self._stop.set()
301
+ self._thread.join(timeout=2.0)
302
+
303
+ def mb(n):
304
+ return "n/a" if n is None else f"{n / 2 ** 20:,.0f} MB"
305
+
306
+ non_duck = None
307
+ if self.peak_rss and self.duckdb_at_rss_peak is not None:
308
+ non_duck = max(0, self.peak_rss - self.duckdb_at_rss_peak)
309
+ logger.info(
310
+ f"mem[{self.label}]: peak RSS={mb(self.peak_rss)} | "
311
+ f"DuckDB peak={mb(self.peak_duckdb)} "
312
+ f"(at RSS-peak {mb(self.duckdb_at_rss_peak)}, spill {mb(self.duckdb_spill_at_rss_peak)}) | "
313
+ f"non-DuckDB~={mb(non_duck)} (delta_rs + Arrow) | samples={self.samples}"
314
+ )
315
+ return False
316
+
317
+
318
+ def mem_profile(label: str, con=None, interval: float = 0.1):
319
+ """Context manager that profiles a write/merge's memory when DUCKRUN_MEM_PROFILE is set, else a
320
+ no-op. Wraps an engine call so RSS, DuckDB's allocation, and the delta_rs/Arrow remainder are
321
+ measured for that phase and logged once on exit. `con` (the DuckDB connection) enables the
322
+ DuckDB-vs-delta_rs split; omit it to log RSS only. Diagnostic only — never affects the write."""
323
+ return _MemSampler(label, con=con, interval=interval)
324
+
325
+
199
326
  # How the effective memory limit is split between the two big consumers that can peak at the
200
327
  # same time during a merge — DuckDB (producing the source relation) and delta_rs (the merge
201
328
  # pool). They share one cap, so the shares must sum *under* 1.0 or we've just moved the OOM; each
@@ -409,6 +536,37 @@ def table_exists(path: str, storage_options: Optional[Dict[str, str]] = None) ->
409
536
  return False
410
537
 
411
538
 
539
+ def delta_stats(cur, path: str, storage_options: Optional[Dict[str, str]] = None):
540
+ """Cheap table statistics for ``dbt docs generate``, read from the Delta **log** (no data scan).
541
+
542
+ ``DeltaTable.get_add_actions()`` carries per-file ``num_records`` / ``size_bytes`` /
543
+ ``modification_time``; summing rows+bytes and taking the latest mtime gives the whole table's
544
+ stats without opening any data file. Aggregation goes through the DuckDB cursor (``cur``) via a
545
+ replacement scan over the arro3 table — no pyarrow dependency.
546
+
547
+ Returns ``{"num_rows", "bytes", "last_modified"}`` (last_modified = epoch milliseconds), or
548
+ ``None`` on ANY failure (a drop-tombstone, a missing table, an unreachable/credential-less remote
549
+ store). Best-effort by design: a catalog without stats is fine, but a docs build must never break.
550
+ """
551
+ try:
552
+ add_actions = _delta_table(path, storage_options).get_add_actions() # noqa: F841 (replacement scan)
553
+ row = cur.sql(
554
+ "select coalesce(sum(num_records), 0)::bigint, "
555
+ "coalesce(sum(size_bytes), 0)::bigint, "
556
+ "max(modification_time)::bigint from add_actions"
557
+ ).fetchone()
558
+ except Exception as exc: # best-effort: docs stats must never fail catalog generation
559
+ logger.debug(f"duckrun: no Delta stats for {path!r}: {exc}")
560
+ return None
561
+ if row is None:
562
+ return None
563
+ return {
564
+ "num_rows": int(row[0]),
565
+ "bytes": int(row[1]),
566
+ "last_modified": int(row[2]) if row[2] is not None else None,
567
+ }
568
+
569
+
412
570
  # Delta column-metadata key under which we stash a dbt column description, and the dollar-quote
413
571
  # label used to embed arbitrary comment text (newlines, quotes, dollar signs) in COMMENT ON SQL.
414
572
  _DELTA_COMMENT_KEY = "comment"