duckrun 0.3.17.dev1__tar.gz → 0.3.17.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.3.17.dev1/duckrun.egg-info → duckrun-0.3.17.dev2}/PKG-INFO +2 -1
- duckrun-0.3.17.dev2/dbt/adapters/duckrun/__version__.py +1 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/delta_plugin.py +43 -39
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/engine.py +127 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2/duckrun.egg-info}/PKG-INFO +2 -1
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/requires.txt +1 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/pyproject.toml +8 -1
- duckrun-0.3.17.dev1/dbt/adapters/duckrun/__version__.py +0 -1
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/LICENSE +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/MANIFEST.in +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/README.md +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/credentials.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/environment.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/impl.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/remote.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/secret.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/dbt_project.yml +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/catalog.sql +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/__init__.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/auth.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/delta_table.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/session.py +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.3.17.dev1
|
|
3
|
+
Version: 0.3.17.dev2
|
|
4
4
|
Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
|
|
|
11
11
|
Description-Content-Type: text/markdown
|
|
12
12
|
License-File: LICENSE
|
|
13
13
|
Requires-Dist: dbt-duckdb>=1.8
|
|
14
|
+
Requires-Dist: dbt-core<2.0,>=1.8
|
|
14
15
|
Requires-Dist: duckdb==1.5.4.dev18
|
|
15
16
|
Requires-Dist: deltalake<1.5.1,>=1.5.0
|
|
16
17
|
Requires-Dist: requests
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version = "0.3.17.dev2"
|
|
@@ -155,13 +155,14 @@ class Plugin(BasePlugin):
|
|
|
155
155
|
# Table-like (non-incremental) models always overwrite. Incremental models
|
|
156
156
|
# overwrite on first run / full-refresh, then apply the incremental strategy.
|
|
157
157
|
if not incremental or full_refresh or not exists:
|
|
158
|
-
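engine.write_delta(
|
|
159
|
-
path, data, "overwrite",
|
|
160
|
-
partition_by=partition_by,
|
|
161
|
-
merge_schema=merge_schema,
|
|
162
|
-
storage_options=storage_options,
|
|
163
|
-
compaction_threshold=self._compaction_threshold,
|
|
164
|
-
)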
|
|
158
|
+
with engine.mem_profile("overwrite", con=cur):
|
|
159
|
+
engine.write_delta(
|
|
160
|
+
path, data, "overwrite",
|
|
161
|
+
partition_by=partition_by,
|
|
162
|
+
merge_schema=merge_schema,
|
|
163
|
+
storage_options=storage_options,
|
|
164
|
+
compaction_threshold=self._compaction_threshold,
|
|
165
|
+
)
|
|
165
166
|
return
|
|
166
167
|
|
|
167
168
|
# Resolve the incremental strategy: default to merge when a unique_key is
|
|
@@ -194,31 +195,33 @@ class Plugin(BasePlugin):
|
|
|
194
195
|
# prune the target (right for small incremental deltas into a large table). A model
|
|
195
196
|
# whose source is itself huge can set merge_streamed_exec=true to stream it instead.
|
|
196
197
|
sx = cfg.get("merge_streamed_exec")
|
|
197
|
-
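engine.merge_delta(
|
|
198
|
-
path, data, unique_key,
|
|
199
|
-
insert_only=(strategy == "insert"),
|
|
200
|
-
update_columns=cfg.get("merge_update_columns"),
|
|
201
|
-
exclude_columns=cfg.get("merge_exclude_columns"),
|
|
202
|
-
predicates=self._merge_predicates(cfg),
|
|
203
|
-
update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
|
|
204
|
-
insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
|
|
205
|
-
merge_schema=evolve_schema,
|
|
206
|
-
max_spill_size=cfg.get("merge_max_spill_size"),
|
|
207
|
-
streamed_exec=(False if sx is None else bool(sx)),
|
|
208
|
-
# Pin the merge target to the version the model read (vB, captured before it read
|
|
209
|
-
# {{ this }}), so OCC validates (vB, HEAD] — read and commit are one snapshot.
|
|
210
|
-
read_version=cfg.get("read_version"),
|
|
211
|
-
storage_options=storage_options,
|
|
212
|
-
compaction_threshold=self._compaction_threshold,
|
|
213
|
-
)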
|
|
198
|
+
with engine.mem_profile("merge", con=cur):
|
|
199
|
+
engine.merge_delta(
|
|
200
|
+
path, data, unique_key,
|
|
201
|
+
insert_only=(strategy == "insert"),
|
|
202
|
+
update_columns=cfg.get("merge_update_columns"),
|
|
203
|
+
exclude_columns=cfg.get("merge_exclude_columns"),
|
|
204
|
+
predicates=self._merge_predicates(cfg),
|
|
205
|
+
update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
|
|
206
|
+
insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
|
|
207
|
+
merge_schema=evolve_schema,
|
|
208
|
+
max_spill_size=cfg.get("merge_max_spill_size"),
|
|
209
|
+
streamed_exec=(False if sx is None else bool(sx)),
|
|
210
|
+
# Pin the merge target to the version the model read (vB, captured before it read
|
|
211
|
+
# {{ this }}), so OCC validates (vB, HEAD] — read and commit are one snapshot.
|
|
212
|
+
read_version=cfg.get("read_version"),
|
|
213
|
+
storage_options=storage_options,
|
|
214
|
+
compaction_threshold=self._compaction_threshold,
|
|
215
|
+
)
|
|
214
216
|
elif strategy == "append":
|
|
215
|
-
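engine.write_delta(
|
|
216
|
-
path, data, "append",
|
|
217
|
-
partition_by=partition_by,
|
|
218
|
-
merge_schema=merge_schema,
|
|
219
|
-
storage_options=storage_options,
|
|
220
|
-
compaction_threshold=self._compaction_threshold,
|
|
221
|
-
)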
|
|
217
|
+
with engine.mem_profile("append", con=cur):
|
|
218
|
+
engine.write_delta(
|
|
219
|
+
path, data, "append",
|
|
220
|
+
partition_by=partition_by,
|
|
221
|
+
merge_schema=merge_schema,
|
|
222
|
+
storage_options=storage_options,
|
|
223
|
+
compaction_threshold=self._compaction_threshold,
|
|
224
|
+
)
|
|
222
225
|
elif strategy == "safeappend":
|
|
223
226
|
# Optimistic append: commit only if the table version has not moved since the model
|
|
224
227
|
# *started* (read_version, captured before it read {{ this }}), else fail so dbt errors
|
|
@@ -226,14 +229,15 @@ class Plugin(BasePlugin):
|
|
|
226
229
|
# is what closes the read→write gap: a writer that commits any time during the build
|
|
227
230
|
# makes this fail instead of appending a duplicate. No dedup — that's the SQL's job.
|
|
228
231
|
# Compare-and-swap via delta_rs max_commit_retries=0 (see engine).
|
|
229
|
-
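engine.append_if_unchanged(
|
|
230
|
-
path, data,
|
|
231
|
-
read_version=cfg.get("read_version"),
|
|
232
|
-
partition_by=partition_by,
|
|
233
|
-
merge_schema=merge_schema,
|
|
234
|
-
storage_options=storage_options,
|
|
235
|
-
compaction_threshold=self._compaction_threshold,
|
|
236
|
-
)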
|
|
232
|
+
with engine.mem_profile("safeappend", con=cur):
|
|
233
|
+
engine.append_if_unchanged(
|
|
234
|
+
path, data,
|
|
235
|
+
read_version=cfg.get("read_version"),
|
|
236
|
+
partition_by=partition_by,
|
|
237
|
+
merge_schema=merge_schema,
|
|
238
|
+
storage_options=storage_options,
|
|
239
|
+
compaction_threshold=self._compaction_threshold,
|
|
240
|
+
)
|
|
237
241
|
else:
|
|
238
242
|
raise ValueError(
|
|
239
243
|
f"Unknown incremental_strategy '{strategy}'. "
|
|
@@ -196,6 +196,133 @@ def _effective_mem_limit_source() -> str:
|
|
|
196
196
|
return "physical RAM"
|
|
197
197
|
|
|
198
198
|
|
|
199
|
+
# --------------------------------------------------------------- memory profiling (opt-in)
|
|
200
|
+
# A merge that OOMs has three suspects sharing one process: DuckDB (producing the source), the
|
|
201
|
+
# Arrow buffers delta_rs collects when streamed_exec=False, and delta_rs's own merge pool. RSS
|
|
202
|
+
# alone can't tell them apart. With DUCKRUN_MEM_PROFILE set, mem_profile() samples this process's
|
|
203
|
+
# RSS *and* DuckDB's own allocation through a write/merge and logs the split, so "who's the slob"
|
|
204
|
+
# is measured, not inferred. Off by default: no thread, no samples, no overhead in production.
|
|
205
|
+
|
|
206
|
+
def _proc_rss_bytes() -> Optional[int]:
|
|
207
|
+
"""Resident set size of THIS process in bytes — the number the OOM-killer actually watches;
|
|
208
|
+
None if it can't be read. Linux: VmRSS from /proc/self/status. Windows: WorkingSetSize."""
|
|
209
|
+
try:
|
|
210
|
+
with open("/proc/self/status") as fh:
|
|
211
|
+
for line in fh:
|
|
212
|
+
if line.startswith("VmRSS:"):
|
|
213
|
+
return int(line.split()[1]) * 1024 # value is in kB
|
|
214
|
+
except (OSError, ValueError, IndexError):
|
|
215
|
+
pass
|
|
216
|
+
try: # Windows: GetProcessMemoryInfo -> WorkingSetSize
|
|
217
|
+
from ctypes import wintypes
|
|
218
|
+
|
|
219
|
+
class _PMC(ctypes.Structure):
|
|
220
|
+
_fields_ = [("cb", ctypes.c_ulong), ("PageFaultCount", ctypes.c_ulong)] + [
|
|
221
|
+
(n, ctypes.c_size_t) for n in (
|
|
222
|
+
"PeakWorkingSetSize", "WorkingSetSize", "QuotaPeakPagedPoolUsage",
|
|
223
|
+
"QuotaPagedPoolUsage", "QuotaPeakNonPagedPoolUsage", "QuotaNonPagedPoolUsage",
|
|
224
|
+
"PagefileUsage", "PeakPagefileUsage")
|
|
225
|
+
]
|
|
226
|
+
# argtypes are required: GetCurrentProcess returns the pseudo-handle (-1), which overflows
|
|
227
|
+
# ctypes' default int marshalling unless the parameter is typed as a HANDLE.
|
|
228
|
+
k32 = ctypes.windll.kernel32
|
|
229
|
+
k32.GetCurrentProcess.restype = wintypes.HANDLE
|
|
230
|
+
psapi = ctypes.windll.psapi
|
|
231
|
+
psapi.GetProcessMemoryInfo.argtypes = [wintypes.HANDLE, ctypes.POINTER(_PMC), ctypes.c_ulong]
|
|
232
|
+
psapi.GetProcessMemoryInfo.restype = wintypes.BOOL
|
|
233
|
+
p = _PMC()
|
|
234
|
+
p.cb = ctypes.sizeof(_PMC)
|
|
235
|
+
if psapi.GetProcessMemoryInfo(k32.GetCurrentProcess(), ctypes.byref(p), p.cb):
|
|
236
|
+
return int(p.WorkingSetSize)
|
|
237
|
+
except Exception:
|
|
238
|
+
pass
|
|
239
|
+
return None
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _duckdb_mem_bytes(con):
|
|
243
|
+
"""(allocated_bytes, temp_spill_bytes) DuckDB currently holds, via duckdb_memory(); None on any
|
|
244
|
+
error. Runs on a *separate* cursor so it's safe to call while another query streams on `con` —
|
|
245
|
+
and this is a diagnostic-only path, so it must never raise into the real write/merge."""
|
|
246
|
+
if con is None:
|
|
247
|
+
return None
|
|
248
|
+
try:
|
|
249
|
+
cur = con.cursor() # duckdb's cursor() is a new connection on the same instance
|
|
250
|
+
row = cur.execute(
|
|
251
|
+
"SELECT coalesce(sum(memory_usage_bytes), 0), "
|
|
252
|
+
"coalesce(sum(temporary_storage_bytes), 0) FROM duckdb_memory()"
|
|
253
|
+
).fetchone()
|
|
254
|
+
return (int(row[0]), int(row[1]))
|
|
255
|
+
except Exception:
|
|
256
|
+
return None
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class _MemSampler:
|
|
260
|
+
"""Background RSS / DuckDB-memory sampler for one write or merge. See mem_profile()."""
|
|
261
|
+
|
|
262
|
+
def __init__(self, label: str, con=None, interval: float = 0.1):
|
|
263
|
+
self.label = label
|
|
264
|
+
self.con = con
|
|
265
|
+
self.interval = interval
|
|
266
|
+
self._thread = None
|
|
267
|
+
self._stop = None
|
|
268
|
+
self.samples = 0
|
|
269
|
+
self.peak_rss = 0
|
|
270
|
+
self.duckdb_at_rss_peak = None # DuckDB alloc at the instant RSS peaked
|
|
271
|
+
self.duckdb_spill_at_rss_peak = None
|
|
272
|
+
self.peak_duckdb = 0 # DuckDB's own high-water, independently
|
|
273
|
+
|
|
274
|
+
def __enter__(self):
|
|
275
|
+
if not os.environ.get("DUCKRUN_MEM_PROFILE"):
|
|
276
|
+
return self # disabled: no thread, no overhead
|
|
277
|
+
import threading
|
|
278
|
+
self._stop = threading.Event()
|
|
279
|
+
self._thread = threading.Thread(
|
|
280
|
+
target=self._run, name=f"duckrun-mem-{self.label}", daemon=True)
|
|
281
|
+
self._thread.start()
|
|
282
|
+
return self
|
|
283
|
+
|
|
284
|
+
def _run(self):
|
|
285
|
+
while not self._stop.is_set():
|
|
286
|
+
rss = _proc_rss_bytes()
|
|
287
|
+
dd = _duckdb_mem_bytes(self.con)
|
|
288
|
+
self.samples += 1
|
|
289
|
+
if dd is not None and dd[0] > self.peak_duckdb:
|
|
290
|
+
self.peak_duckdb = dd[0]
|
|
291
|
+
if rss is not None and rss > self.peak_rss:
|
|
292
|
+
self.peak_rss = rss
|
|
293
|
+
if dd is not None:
|
|
294
|
+
self.duckdb_at_rss_peak, self.duckdb_spill_at_rss_peak = dd
|
|
295
|
+
self._stop.wait(self.interval)
|
|
296
|
+
|
|
297
|
+
def __exit__(self, *exc):
|
|
298
|
+
if self._thread is None:
|
|
299
|
+
return False
|
|
300
|
+
self._stop.set()
|
|
301
|
+
self._thread.join(timeout=2.0)
|
|
302
|
+
|
|
303
|
+
def mb(n):
|
|
304
|
+
return "n/a" if n is None else f"{n / 2 ** 20:,.0f} MB"
|
|
305
|
+
|
|
306
|
+
non_duck = None
|
|
307
|
+
if self.peak_rss and self.duckdb_at_rss_peak is not None:
|
|
308
|
+
non_duck = max(0, self.peak_rss - self.duckdb_at_rss_peak)
|
|
309
|
+
logger.info(
|
|
310
|
+
f"mem[{self.label}]: peak RSS={mb(self.peak_rss)} | "
|
|
311
|
+
f"DuckDB peak={mb(self.peak_duckdb)} "
|
|
312
|
+
f"(at RSS-peak {mb(self.duckdb_at_rss_peak)}, spill {mb(self.duckdb_spill_at_rss_peak)}) | "
|
|
313
|
+
f"non-DuckDB~={mb(non_duck)} (delta_rs + Arrow) | samples={self.samples}"
|
|
314
|
+
)
|
|
315
|
+
return False
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def mem_profile(label: str, con=None, interval: float = 0.1):
|
|
319
|
+
"""Context manager that profiles a write/merge's memory when DUCKRUN_MEM_PROFILE is set, else a
|
|
320
|
+
no-op. Wraps an engine call so RSS, DuckDB's allocation, and the delta_rs/Arrow remainder are
|
|
321
|
+
measured for that phase and logged once on exit. `con` (the DuckDB connection) enables the
|
|
322
|
+
DuckDB-vs-delta_rs split; omit it to log RSS only. Diagnostic only — never affects the write."""
|
|
323
|
+
return _MemSampler(label, con=con, interval=interval)
|
|
324
|
+
|
|
325
|
+
|
|
199
326
|
# How the effective memory limit is split between the two big consumers that can peak at the
|
|
200
327
|
# same time during a merge — DuckDB (producing the source relation) and delta_rs (the merge
|
|
201
328
|
# pool). They share one cap, so the shares must sum *under* 1.0 or we've just moved the OOM; each
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: duckrun
|
|
3
|
-
Version: 0.3.17.dev1
|
|
3
|
+
Version: 0.3.17.dev2
|
|
4
4
|
Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
|
|
5
5
|
Author: mim
|
|
6
6
|
License: MIT
|
|
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
|
|
|
11
11
|
Description-Content-Type: text/markdown
|
|
12
12
|
License-File: LICENSE
|
|
13
13
|
Requires-Dist: dbt-duckdb>=1.8
|
|
14
|
+
Requires-Dist: dbt-core<2.0,>=1.8
|
|
14
15
|
Requires-Dist: duckdb==1.5.4.dev18
|
|
15
16
|
Requires-Dist: deltalake<1.5.1,>=1.5.0
|
|
16
17
|
Requires-Dist: requests
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "duckrun"
|
|
7
|
-
version = "0.3.17.dev1"
|
|
7
|
+
version = "0.3.17.dev2"
|
|
8
8
|
description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -14,6 +14,13 @@ authors = [
|
|
|
14
14
|
requires-python = ">=3.9"
|
|
15
15
|
dependencies = [
|
|
16
16
|
"dbt-duckdb>=1.8",
|
|
17
|
+
# Cap dbt-core below 2.0: it's a breaking change (restructured packaging — `dbt.cli` moves,
|
|
18
|
+
# the dbtRunner import path changes) that the adapter has not been ported to. Without this
|
|
19
|
+
# direct pin, `pip install --pre` (or any pre-release resolution) pulls dbt-core 2.0 alphas
|
|
20
|
+
# via dbt-duckdb's uncapped requirement, leaving a 1.x-adapters / 2.0-core Frankenstein that
|
|
21
|
+
# fails at `from dbt.cli.main import dbtRunner`. Declared directly because we only depend on
|
|
22
|
+
# dbt-core transitively, so the ceiling has to live here to bite.
|
|
23
|
+
"dbt-core>=1.8,<2.0",
|
|
17
24
|
# TEMPORARY pin to a duckdb dev build — TODO: move to "==1.5.4" (or a range) once stable
|
|
18
25
|
# 1.5.4 ships (~end of month). 1.5.4.dev18 is the first build whose bundled duckdb-delta
|
|
19
26
|
# extension supports `delta_scan('...', version => N)` (duckdb-delta #312) — the version-pinned
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
version = "0.3.17.dev1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/delta.sql
RENAMED
|
File without changes
|
|
File without changes
|
{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/table.sql
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|