duckrun 0.3.17.dev1__tar.gz → 0.3.17.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {duckrun-0.3.17.dev1/duckrun.egg-info → duckrun-0.3.17.dev2}/PKG-INFO +2 -1
  2. duckrun-0.3.17.dev2/dbt/adapters/duckrun/__version__.py +1 -0
  3. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/delta_plugin.py +43 -39
  4. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/engine.py +127 -0
  5. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2/duckrun.egg-info}/PKG-INFO +2 -1
  6. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/requires.txt +1 -0
  7. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/pyproject.toml +8 -1
  8. duckrun-0.3.17.dev1/dbt/adapters/duckrun/__version__.py +0 -1
  9. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/LICENSE +0 -0
  10. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/MANIFEST.in +0 -0
  11. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/README.md +0 -0
  12. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/__init__.py +0 -0
  13. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/credentials.py +0 -0
  14. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/environment.py +0 -0
  15. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/impl.py +0 -0
  16. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/remote.py +0 -0
  17. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/adapters/duckrun/secret.py +0 -0
  18. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/__init__.py +0 -0
  19. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/dbt_project.yml +0 -0
  20. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/catalog.sql +0 -0
  21. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/_delta_core.sql +0 -0
  22. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/delta.sql +0 -0
  23. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/incremental.sql +0 -0
  24. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/dbt/include/duckrun/macros/materializations/table.sql +0 -0
  25. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/__init__.py +0 -0
  26. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/auth.py +0 -0
  27. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/delta_table.py +0 -0
  28. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun/session.py +0 -0
  29. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/SOURCES.txt +0 -0
  30. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/dependency_links.txt +0 -0
  31. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/duckrun.egg-info/top_level.txt +0 -0
  32. {duckrun-0.3.17.dev1 → duckrun-0.3.17.dev2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev1
3
+ Version: 0.3.17.dev2
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
13
  Requires-Dist: dbt-duckdb>=1.8
14
+ Requires-Dist: dbt-core<2.0,>=1.8
14
15
  Requires-Dist: duckdb==1.5.4.dev18
15
16
  Requires-Dist: deltalake<1.5.1,>=1.5.0
16
17
  Requires-Dist: requests
@@ -0,0 +1 @@
1
+ version = "0.3.17.dev2"
@@ -155,13 +155,14 @@ class Plugin(BasePlugin):
155
155
  # Table-like (non-incremental) models always overwrite. Incremental models
156
156
  # overwrite on first run / full-refresh, then apply the incremental strategy.
157
157
  if not incremental or full_refresh or not exists:
158
- engine.write_delta(
159
- path, data, "overwrite",
160
- partition_by=partition_by,
161
- merge_schema=merge_schema,
162
- storage_options=storage_options,
163
- compaction_threshold=self._compaction_threshold,
164
- )
158
+ with engine.mem_profile("overwrite", con=cur):
159
+ engine.write_delta(
160
+ path, data, "overwrite",
161
+ partition_by=partition_by,
162
+ merge_schema=merge_schema,
163
+ storage_options=storage_options,
164
+ compaction_threshold=self._compaction_threshold,
165
+ )
165
166
  return
166
167
 
167
168
  # Resolve the incremental strategy: default to merge when a unique_key is
@@ -194,31 +195,33 @@ class Plugin(BasePlugin):
194
195
  # prune the target (right for small incremental deltas into a large table). A model
195
196
  # whose source is itself huge can set merge_streamed_exec=true to stream it instead.
196
197
  sx = cfg.get("merge_streamed_exec")
197
- engine.merge_delta(
198
- path, data, unique_key,
199
- insert_only=(strategy == "insert"),
200
- update_columns=cfg.get("merge_update_columns"),
201
- exclude_columns=cfg.get("merge_exclude_columns"),
202
- predicates=self._merge_predicates(cfg),
203
- update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
204
- insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
205
- merge_schema=evolve_schema,
206
- max_spill_size=cfg.get("merge_max_spill_size"),
207
- streamed_exec=(False if sx is None else bool(sx)),
208
- # Pin the merge target to the version the model read (vB, captured before it read
209
- # {{ this }}), so OCC validates (vB, HEAD] the read and the commit are one snapshot.
210
- read_version=cfg.get("read_version"),
211
- storage_options=storage_options,
212
- compaction_threshold=self._compaction_threshold,
213
- )
198
+ with engine.mem_profile("merge", con=cur):
199
+ engine.merge_delta(
200
+ path, data, unique_key,
201
+ insert_only=(strategy == "insert"),
202
+ update_columns=cfg.get("merge_update_columns"),
203
+ exclude_columns=cfg.get("merge_exclude_columns"),
204
+ predicates=self._merge_predicates(cfg),
205
+ update_condition=self._rewrite_merge_aliases(cfg.get("merge_update_condition")),
206
+ insert_condition=self._rewrite_merge_aliases(cfg.get("merge_insert_condition")),
207
+ merge_schema=evolve_schema,
208
+ max_spill_size=cfg.get("merge_max_spill_size"),
209
+ streamed_exec=(False if sx is None else bool(sx)),
210
+ # Pin the merge target to the version the model read (vB, captured before it read
211
+ # {{ this }}), so OCC validates (vB, HEAD] — read and commit are one snapshot.
212
+ read_version=cfg.get("read_version"),
213
+ storage_options=storage_options,
214
+ compaction_threshold=self._compaction_threshold,
215
+ )
214
216
  elif strategy == "append":
215
- engine.write_delta(
216
- path, data, "append",
217
- partition_by=partition_by,
218
- merge_schema=merge_schema,
219
- storage_options=storage_options,
220
- compaction_threshold=self._compaction_threshold,
221
- )
217
+ with engine.mem_profile("append", con=cur):
218
+ engine.write_delta(
219
+ path, data, "append",
220
+ partition_by=partition_by,
221
+ merge_schema=merge_schema,
222
+ storage_options=storage_options,
223
+ compaction_threshold=self._compaction_threshold,
224
+ )
222
225
  elif strategy == "safeappend":
223
226
  # Optimistic append: commit only if the table version has not moved since the model
224
227
  # *started* (read_version, captured before it read {{ this }}), else fail so dbt errors
@@ -226,14 +229,15 @@ class Plugin(BasePlugin):
226
229
  # is what closes the read→write gap: a writer that commits any time during the build
227
230
  # makes this fail instead of appending a duplicate. No dedup — that's the SQL's job.
228
231
  # Compare-and-swap via delta_rs max_commit_retries=0 (see engine).
229
- engine.append_if_unchanged(
230
- path, data,
231
- read_version=cfg.get("read_version"),
232
- partition_by=partition_by,
233
- merge_schema=merge_schema,
234
- storage_options=storage_options,
235
- compaction_threshold=self._compaction_threshold,
236
- )
232
+ with engine.mem_profile("safeappend", con=cur):
233
+ engine.append_if_unchanged(
234
+ path, data,
235
+ read_version=cfg.get("read_version"),
236
+ partition_by=partition_by,
237
+ merge_schema=merge_schema,
238
+ storage_options=storage_options,
239
+ compaction_threshold=self._compaction_threshold,
240
+ )
237
241
  else:
238
242
  raise ValueError(
239
243
  f"Unknown incremental_strategy '{strategy}'. "
@@ -196,6 +196,133 @@ def _effective_mem_limit_source() -> str:
196
196
  return "physical RAM"
197
197
 
198
198
 
199
+ # --------------------------------------------------------------- memory profiling (opt-in)
200
+ # A merge that OOMs has three suspects sharing one process: DuckDB (producing the source), the
201
+ # Arrow buffers delta_rs collects when streamed_exec=False, and delta_rs's own merge pool. RSS
202
+ # alone can't tell them apart. With DUCKRUN_MEM_PROFILE set, mem_profile() samples this process's
203
+ # RSS *and* DuckDB's own allocation through a write/merge and logs the split, so "who's the slob"
204
+ # is measured, not inferred. Off by default: no thread, no samples, no overhead in production.
205
+
206
+ def _proc_rss_bytes() -> Optional[int]:
207
+ """Resident set size of THIS process in bytes — the number the OOM-killer actually watches;
208
+ None if it can't be read. Linux: VmRSS from /proc/self/status. Windows: WorkingSetSize."""
209
+ try:
210
+ with open("/proc/self/status") as fh:
211
+ for line in fh:
212
+ if line.startswith("VmRSS:"):
213
+ return int(line.split()[1]) * 1024 # value is in kB
214
+ except (OSError, ValueError, IndexError):
215
+ pass
216
+ try: # Windows: GetProcessMemoryInfo -> WorkingSetSize
217
+ from ctypes import wintypes
218
+
219
+ class _PMC(ctypes.Structure):
220
+ _fields_ = [("cb", ctypes.c_ulong), ("PageFaultCount", ctypes.c_ulong)] + [
221
+ (n, ctypes.c_size_t) for n in (
222
+ "PeakWorkingSetSize", "WorkingSetSize", "QuotaPeakPagedPoolUsage",
223
+ "QuotaPagedPoolUsage", "QuotaPeakNonPagedPoolUsage", "QuotaNonPagedPoolUsage",
224
+ "PagefileUsage", "PeakPagefileUsage")
225
+ ]
226
+ # argtypes are required: GetCurrentProcess returns the pseudo-handle (-1), which overflows
227
+ # ctypes' default int marshalling unless the parameter is typed as a HANDLE.
228
+ k32 = ctypes.windll.kernel32
229
+ k32.GetCurrentProcess.restype = wintypes.HANDLE
230
+ psapi = ctypes.windll.psapi
231
+ psapi.GetProcessMemoryInfo.argtypes = [wintypes.HANDLE, ctypes.POINTER(_PMC), ctypes.c_ulong]
232
+ psapi.GetProcessMemoryInfo.restype = wintypes.BOOL
233
+ p = _PMC()
234
+ p.cb = ctypes.sizeof(_PMC)
235
+ if psapi.GetProcessMemoryInfo(k32.GetCurrentProcess(), ctypes.byref(p), p.cb):
236
+ return int(p.WorkingSetSize)
237
+ except Exception:
238
+ pass
239
+ return None
240
+
241
+
242
+ def _duckdb_mem_bytes(con):
243
+ """(allocated_bytes, temp_spill_bytes) DuckDB currently holds, via duckdb_memory(); None on any
244
+ error. Runs on a *separate* cursor so it's safe to call while another query streams on `con` —
245
+ and this is a diagnostic-only path, so it must never raise into the real write/merge."""
246
+ if con is None:
247
+ return None
248
+ try:
249
+ cur = con.cursor() # duckdb's cursor() is a new connection on the same instance
250
+ row = cur.execute(
251
+ "SELECT coalesce(sum(memory_usage_bytes), 0), "
252
+ "coalesce(sum(temporary_storage_bytes), 0) FROM duckdb_memory()"
253
+ ).fetchone()
254
+ return (int(row[0]), int(row[1]))
255
+ except Exception:
256
+ return None
257
+
258
+
259
+ class _MemSampler:
260
+ """Background RSS / DuckDB-memory sampler for one write or merge. See mem_profile()."""
261
+
262
+ def __init__(self, label: str, con=None, interval: float = 0.1):
263
+ self.label = label
264
+ self.con = con
265
+ self.interval = interval
266
+ self._thread = None
267
+ self._stop = None
268
+ self.samples = 0
269
+ self.peak_rss = 0
270
+ self.duckdb_at_rss_peak = None # DuckDB alloc at the instant RSS peaked
271
+ self.duckdb_spill_at_rss_peak = None
272
+ self.peak_duckdb = 0 # DuckDB's own high-water, independently
273
+
274
+ def __enter__(self):
275
+ if not os.environ.get("DUCKRUN_MEM_PROFILE"):
276
+ return self # disabled: no thread, no overhead
277
+ import threading
278
+ self._stop = threading.Event()
279
+ self._thread = threading.Thread(
280
+ target=self._run, name=f"duckrun-mem-{self.label}", daemon=True)
281
+ self._thread.start()
282
+ return self
283
+
284
+ def _run(self):
285
+ while not self._stop.is_set():
286
+ rss = _proc_rss_bytes()
287
+ dd = _duckdb_mem_bytes(self.con)
288
+ self.samples += 1
289
+ if dd is not None and dd[0] > self.peak_duckdb:
290
+ self.peak_duckdb = dd[0]
291
+ if rss is not None and rss > self.peak_rss:
292
+ self.peak_rss = rss
293
+ if dd is not None:
294
+ self.duckdb_at_rss_peak, self.duckdb_spill_at_rss_peak = dd
295
+ self._stop.wait(self.interval)
296
+
297
+ def __exit__(self, *exc):
298
+ if self._thread is None:
299
+ return False
300
+ self._stop.set()
301
+ self._thread.join(timeout=2.0)
302
+
303
+ def mb(n):
304
+ return "n/a" if n is None else f"{n / 2 ** 20:,.0f} MB"
305
+
306
+ non_duck = None
307
+ if self.peak_rss and self.duckdb_at_rss_peak is not None:
308
+ non_duck = max(0, self.peak_rss - self.duckdb_at_rss_peak)
309
+ logger.info(
310
+ f"mem[{self.label}]: peak RSS={mb(self.peak_rss)} | "
311
+ f"DuckDB peak={mb(self.peak_duckdb)} "
312
+ f"(at RSS-peak {mb(self.duckdb_at_rss_peak)}, spill {mb(self.duckdb_spill_at_rss_peak)}) | "
313
+ f"non-DuckDB~={mb(non_duck)} (delta_rs + Arrow) | samples={self.samples}"
314
+ )
315
+ return False
316
+
317
+
318
+ def mem_profile(label: str, con=None, interval: float = 0.1):
319
+ """Context manager that profiles a write/merge's memory when DUCKRUN_MEM_PROFILE is set, else a
320
+ no-op. Wraps an engine call so RSS, DuckDB's allocation, and the delta_rs/Arrow remainder are
321
+ measured for that phase and logged once on exit. `con` (the DuckDB connection) enables the
322
+ DuckDB-vs-delta_rs split; omit it to log RSS only. Diagnostic only — never affects the write."""
323
+ return _MemSampler(label, con=con, interval=interval)
324
+
325
+
199
326
  # How the effective memory limit is split between the two big consumers that can peak at the
200
327
  # same time during a merge — DuckDB (producing the source relation) and delta_rs (the merge
201
328
  # pool). They share one cap, so the shares must sum *under* 1.0 or we've just moved the OOM; each
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.3.17.dev1
3
+ Version: 0.3.17.dev2
4
4
  Summary: A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs).
5
5
  Author: mim
6
6
  License: MIT
@@ -11,6 +11,7 @@ Requires-Python: >=3.9
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
13
  Requires-Dist: dbt-duckdb>=1.8
14
+ Requires-Dist: dbt-core<2.0,>=1.8
14
15
  Requires-Dist: duckdb==1.5.4.dev18
15
16
  Requires-Dist: deltalake<1.5.1,>=1.5.0
16
17
  Requires-Dist: requests
@@ -1,4 +1,5 @@
1
1
  dbt-duckdb>=1.8
2
+ dbt-core<2.0,>=1.8
2
3
  duckdb==1.5.4.dev18
3
4
  deltalake<1.5.1,>=1.5.0
4
5
  requests
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.3.17.dev1"
7
+ version = "0.3.17.dev2"
8
8
  description = "A dbt adapter that runs SQL in DuckDB and materializes to Delta Lake (delta_rs)."
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -14,6 +14,13 @@ authors = [
14
14
  requires-python = ">=3.9"
15
15
  dependencies = [
16
16
  "dbt-duckdb>=1.8",
17
+ # Cap dbt-core below 2.0: it's a breaking change (restructured packaging — `dbt.cli` moves,
18
+ # the dbtRunner import path changes) that the adapter has not been ported to. Without this
19
+ # direct pin, `pip install --pre` (or any pre-release resolution) pulls dbt-core 2.0 alphas
20
+ # via dbt-duckdb's uncapped requirement, leaving a 1.x-adapters / 2.0-core Frankenstein that
21
+ # fails at `from dbt.cli.main import dbtRunner`. Declared directly because we only depend on
22
+ # dbt-core transitively, so the ceiling has to live here to bite.
23
+ "dbt-core>=1.8,<2.0",
17
24
  # TEMPORARY pin to a duckdb dev build — TODO: move to "==1.5.4" (or a range) once stable
18
25
  # 1.5.4 ships (~end of month). 1.5.4.dev18 is the first build whose bundled duckdb-delta
19
26
  # extension supports `delta_scan('...', version => N)` (duckdb-delta #312) — the version-pinned
@@ -1 +0,0 @@
1
- version = "0.3.17.dev1"
File without changes
File without changes
File without changes
File without changes