PyPI - duckrun - Versions diffs - 0.3.17.dev1__tar.gz → 0.3.17.dev2__tar.gz - Mend

duckrun 0.3.17.dev1.tar.gz → 0.3.17.dev2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

{duckrun-0.3.17.dev1/duckrun.egg-info → duckrun-0.3.17.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.3.17.dev1
+Version: 0.3.17.dev2
 Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
 Author: mim
 License: MIT
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: dbt-duckdb>=1.8
+Requires-Dist: dbt-core<2.0,>=1.8
 Requires-Dist: duckdb==1.5.4.dev18
 Requires-Dist: deltalake<1.5.1,>=1.5.0
 Requires-Dist: requests

duckrun-0.3.17.dev2/dbt/adapters/duckrun/__version__.py ADDED Viewed

@@ -0,0 +1 @@
+version = "0.3.17.dev2"

{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/delta_plugin.py RENAMED Viewed

@@ -155,13 +155,14 @@ class Plugin(BasePlugin):
         # Table-like (non-incremental) models always overwrite. Incremental models
         # overwrite on first run / full-refresh, then apply the incremental strategy.
         if not incremental or full_refresh or not exists:
-            engine.write_delta(
-                path, data, "overwrite",
-                partition_by=partition_by,
-                merge_schema=merge_schema,
-                storage_options=storage_options,
-                compaction_threshold=self._compaction_threshold,
-            )
+            with engine.mem_profile("overwrite", con=cur):
+                engine.write_delta(
+                    path, data, "overwrite",
+                    partition_by=partition_by,
+                    merge_schema=merge_schema,
+                    storage_options=storage_options,
+                    compaction_threshold=self._compaction_threshold,
+                )
             return
         # Resolve the incremental strategy: default to merge when a unique_key is
@@ -194,31 +195,33 @@ class Plugin(BasePlugin):
             # prune the target (right for small incremental deltas into a large table). A model
             # whose source is itself huge can set merge_streamed_exec=true to stream it instead.
             sx = cfg.get("merge_streamed_exec")
-            engine.merge_delta(
-                path, data, unique_key,
-                insert_only=(strategy == "insert"),
-                update_columns=cfg.get("merge_update_columns"),
-                exclude_columns=cfg.get("merge_exclude_columns"),
-                predicates=self._merge_predicates(cfg),
-                update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
-                insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
-                merge_schema=evolve_schema,
-                max_spill_size=cfg.get("merge_max_spill_size"),
-                streamed_exec=(False if sx is None else bool(sx)),
-                # Pin the merge target to the version the model read (vB, captured before it read
-                # {{ this }}), so OCC validates (vB, HEAD] — the read and the commit are one snapshot.
-                read_version=cfg.get("read_version"),
-                storage_options=storage_options,
-                compaction_threshold=self._compaction_threshold,
-            )
+            with engine.mem_profile("merge", con=cur):
+                engine.merge_delta(
+                    path, data, unique_key,
+                    insert_only=(strategy == "insert"),
+                    update_columns=cfg.get("merge_update_columns"),
+                    exclude_columns=cfg.get("merge_exclude_columns"),
+                    predicates=self._merge_predicates(cfg),
+                    update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
+                    insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
+                    merge_schema=evolve_schema,
+                    max_spill_size=cfg.get("merge_max_spill_size"),
+                    streamed_exec=(False if sx is None else bool(sx)),
+                    # Pin the merge target to the version the model read (vB, captured before it read
+                    # {{ this }}), so OCC validates (vB, HEAD] — read and commit are one snapshot.
+                    read_version=cfg.get("read_version"),
+                    storage_options=storage_options,
+                    compaction_threshold=self._compaction_threshold,
+                )
         elif strategy == "append":
-            engine.write_delta(
-                path, data, "append",
-                partition_by=partition_by,
-                merge_schema=merge_schema,
-                storage_options=storage_options,
-                compaction_threshold=self._compaction_threshold,
-            )
+            with engine.mem_profile("append", con=cur):
+                engine.write_delta(
+                    path, data, "append",
+                    partition_by=partition_by,
+                    merge_schema=merge_schema,
+                    storage_options=storage_options,
+                    compaction_threshold=self._compaction_threshold,
+                )
         elif strategy == "safeappend":
             # Optimistic append: commit only if the table version has not moved since the model
             # *started* (read_version, captured before it read {{ this }}), else fail so dbt errors
@@ -226,14 +229,15 @@ class Plugin(BasePlugin):
             # is what closes the read→write gap: a writer that commits any time during the build
             # makes this fail instead of appending a duplicate. No dedup — that's the SQL's job.
             # Compare-and-swap via delta_rs max_commit_retries=0 (see engine).
-            engine.append_if_unchanged(
-                path, data,
-                read_version=cfg.get("read_version"),
-                partition_by=partition_by,
-                merge_schema=merge_schema,
-                storage_options=storage_options,
-                compaction_threshold=self._compaction_threshold,
-            )
+            with engine.mem_profile("safeappend", con=cur):
+                engine.append_if_unchanged(
+                    path, data,
+                    read_version=cfg.get("read_version"),
+                    partition_by=partition_by,
+                    merge_schema=merge_schema,
+                    storage_options=storage_options,
+                    compaction_threshold=self._compaction_threshold,
+                )
         else:
             raise ValueError(
                 f"Unknown incremental_strategy '{strategy}'. "

{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/engine.py RENAMED Viewed

@@ -196,6 +196,133 @@ def _effective_mem_limit_source() -> str:
     return "physical RAM"
+# --------------------------------------------------------------- memory profiling (opt-in)
+# A merge that OOMs has three suspects sharing one process: DuckDB (producing the source), the
+# Arrow buffers delta_rs collects when streamed_exec=False, and delta_rs's own merge pool. RSS
+# alone can't tell them apart. With DUCKRUN_MEM_PROFILE set, mem_profile() samples this process's
+# RSS *and* DuckDB's own allocation through a write/merge and logs the split, so "who's the slob"
+# is measured, not inferred. Off by default: no thread, no samples, no overhead in production.
+def _proc_rss_bytes() -> Optional[int]:
+    """Resident set size of THIS process in bytes — the number the OOM-killer actually watches;
+    None if it can't be read. Linux: VmRSS from /proc/self/status. Windows: WorkingSetSize."""
+    try:
+        with open("/proc/self/status") as fh:
+            for line in fh:
+                if line.startswith("VmRSS:"):
+                    return int(line.split()[1]) * 1024  # value is in kB
+    except (OSError, ValueError, IndexError):
+        pass
+    try:  # Windows: GetProcessMemoryInfo -> WorkingSetSize
+        from ctypes import wintypes
+        class _PMC(ctypes.Structure):
+            _fields_ = [("cb", ctypes.c_ulong), ("PageFaultCount", ctypes.c_ulong)] + [
+                (n, ctypes.c_size_t) for n in (
+                    "PeakWorkingSetSize", "WorkingSetSize", "QuotaPeakPagedPoolUsage",
+                    "QuotaPagedPoolUsage", "QuotaPeakNonPagedPoolUsage", "QuotaNonPagedPoolUsage",
+                    "PagefileUsage", "PeakPagefileUsage")
+            ]
+        # argtypes are required: GetCurrentProcess returns the pseudo-handle (-1), which overflows
+        # ctypes' default int marshalling unless the parameter is typed as a HANDLE.
+        k32 = ctypes.windll.kernel32
+        k32.GetCurrentProcess.restype = wintypes.HANDLE
+        psapi = ctypes.windll.psapi
+        psapi.GetProcessMemoryInfo.argtypes = [wintypes.HANDLE, ctypes.POINTER(_PMC), ctypes.c_ulong]
+        psapi.GetProcessMemoryInfo.restype = wintypes.BOOL
+        p = _PMC()
+        p.cb = ctypes.sizeof(_PMC)
+        if psapi.GetProcessMemoryInfo(k32.GetCurrentProcess(), ctypes.byref(p), p.cb):
+            return int(p.WorkingSetSize)
+    except Exception:
+        pass
+    return None
+def _duckdb_mem_bytes(con):
+    """(allocated_bytes, temp_spill_bytes) DuckDB currently holds, via duckdb_memory(); None on any
+    error. Runs on a *separate* cursor so it's safe to call while another query streams on `con` —
+    and this is a diagnostic-only path, so it must never raise into the real write/merge."""
+    if con is None:
+        return None
+    try:
+        cur = con.cursor()  # duckdb's cursor() is a new connection on the same instance
+        row = cur.execute(
+            "SELECT coalesce(sum(memory_usage_bytes), 0), "
+            "coalesce(sum(temporary_storage_bytes), 0) FROM duckdb_memory()"
+        ).fetchone()
+        return (int(row[0]), int(row[1]))
+    except Exception:
+        return None
+class _MemSampler:
+    """Background RSS / DuckDB-memory sampler for one write or merge. See mem_profile()."""
+    def __init__(self, label: str, con=None, interval: float = 0.1):
+        self.label = label
+        self.con = con
+        self.interval = interval
+        self._thread = None
+        self._stop = None
+        self.samples = 0
+        self.peak_rss = 0
+        self.duckdb_at_rss_peak = None        # DuckDB alloc at the instant RSS peaked
+        self.duckdb_spill_at_rss_peak = None
+        self.peak_duckdb = 0                  # DuckDB's own high-water, independently
+    def __enter__(self):
+        if not os.environ.get("DUCKRUN_MEM_PROFILE"):
+            return self  # disabled: no thread, no overhead
+        import threading
+        self._stop = threading.Event()
+        self._thread = threading.Thread(
+            target=self._run, name=f"duckrun-mem-{self.label}", daemon=True)
+        self._thread.start()
+        return self
+    def _run(self):
+        while not self._stop.is_set():
+            rss = _proc_rss_bytes()
+            dd = _duckdb_mem_bytes(self.con)
+            self.samples += 1
+            if dd is not None and dd[0] > self.peak_duckdb:
+                self.peak_duckdb = dd[0]
+            if rss is not None and rss > self.peak_rss:
+                self.peak_rss = rss
+                if dd is not None:
+                    self.duckdb_at_rss_peak, self.duckdb_spill_at_rss_peak = dd
+            self._stop.wait(self.interval)
+    def __exit__(self, *exc):
+        if self._thread is None:
+            return False
+        self._stop.set()
+        self._thread.join(timeout=2.0)
+        def mb(n):
+            return "n/a" if n is None else f"{n / 2 ** 20:,.0f} MB"
+        non_duck = None
+        if self.peak_rss and self.duckdb_at_rss_peak is not None:
+            non_duck = max(0, self.peak_rss - self.duckdb_at_rss_peak)
+        logger.info(
+            f"mem[{self.label}]: peak RSS={mb(self.peak_rss)} | "
+            f"DuckDB peak={mb(self.peak_duckdb)} "
+            f"(at RSS-peak {mb(self.duckdb_at_rss_peak)}, spill {mb(self.duckdb_spill_at_rss_peak)}) | "
+            f"non-DuckDB~={mb(non_duck)} (delta_rs + Arrow) | samples={self.samples}"
+        )
+        return False
+def mem_profile(label: str, con=None, interval: float = 0.1):
+    """Context manager that profiles a write/merge's memory when DUCKRUN_MEM_PROFILE is set, else a
+    no-op. Wraps an engine call so RSS, DuckDB's allocation, and the delta_rs/Arrow remainder are
+    measured for that phase and logged once on exit. `con` (the DuckDB connection) enables the
+    DuckDB-vs-delta_rs split; omit it to log RSS only. Diagnostic only — never affects the write."""
+    return _MemSampler(label, con=con, interval=interval)
 # How the effective memory limit is split between the two big consumers that can peak at the
 # same time during a merge — DuckDB (producing the source relation) and delta_rs (the merge
 # pool). They share one cap, so the shares must sum *under* 1.0 or we've just moved the OOM; each

{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2/duckrun.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.3.17.dev1
+Version: 0.3.17.dev2
 Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
 Author: mim
 License: MIT
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: dbt-duckdb>=1.8
+Requires-Dist: dbt-core<2.0,>=1.8
 Requires-Dist: duckdb==1.5.4.dev18
 Requires-Dist: deltalake<1.5.1,>=1.5.0
 Requires-Dist: requests

{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,5 @@
 dbt-duckdb>=1.8
+dbt-core<2.0,>=1.8
 duckdb==1.5.4.dev18
 deltalake<1.5.1,>=1.5.0
 requests

{duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "duckrun"
-version = "0.3.17.dev1"
+version = "0.3.17.dev2"
 description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
 readme = "README.md"
 license = {text = "MIT"}
@@ -14,6 +14,13 @@ authors = [
 requires-python = ">=3.9"
 dependencies = [
     "dbt-duckdb>=1.8",
+    # Cap dbt-core below 2.0: it's a breaking change (restructured packaging — `dbt.cli` moves,
+    # the dbtRunner import path changes) that the adapter has not been ported to. Without this
+    # direct pin, `pip install --pre` (or any pre-release resolution) pulls dbt-core 2.0 alphas
+    # via dbt-duckdb's uncapped requirement, leaving a 1.x-adapters / 2.0-core Frankenstein that
+    # fails at `from dbt.cli.main import dbtRunner`. Declared directly because we only depend on
+    # dbt-core transitively, so the ceiling has to live here to bite.
+    "dbt-core>=1.8,<2.0",
     # TEMPORARY pin to a duckdb dev build — TODO: move to "==1.5.4" (or a range) once stable
     # 1.5.4 ships (~end of month). 1.5.4.dev18 is the first build whose bundled duckdb-delta
     # extension supports `delta_scan('...', version => N)` (duckdb-delta #312) — the version-pinned