PyPI - ygg - Versions diffs - 0.1.45__tar.gz → 0.1.46__tar.gz - Mend

ygg 0.1.45 (tar.gz) → 0.1.46 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

{ygg-0.1.45 → ygg-0.1.46}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ygg
-Version: 0.1.45
+Version: 0.1.46
 Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
 Author: Yggdrasil contributors
 License:                                  Apache License

{ygg-0.1.45 → ygg-0.1.46}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ygg"
-version = "0.1.45"
+version = "0.1.46"
 description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }

{ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ygg
-Version: 0.1.45
+Version: 0.1.46
 Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
 Author: Yggdrasil contributors
 License:                                  Apache License

{ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/execution_context.py RENAMED Viewed

@@ -12,6 +12,7 @@ import re
 import sys
 import threading
 import zipfile
+from threading import Thread
 from types import ModuleType
 from typing import TYPE_CHECKING, Optional, Any, Callable, List, Dict, Union, Iterable, Tuple
@@ -114,7 +115,11 @@ class ExecutionContext:
     def __del__(self):
         """Best-effort cleanup for the remote execution context."""
-        self.close()
+        if self.context_id:
+            try:
+                Thread(target=self.close).start()
+            except BaseException:
+                pass
     @property
     def remote_metadata(self) -> RemoteMetadata:
@@ -380,7 +385,11 @@ print(json.dumps(meta))"""
         )
         try:
-            result = serialized.parse_command_result(raw_result, result_tag=result_tag)
+            result = serialized.parse_command_result(
+                raw_result,
+                result_tag=result_tag,
+                workspace=self.cluster.workspace
+            )
         except ModuleNotFoundError as remote_module_error:
             _MOD_NOT_FOUND_RE = re.compile(r"No module named ['\"]([^'\"]+)['\"]")
             module_name = _MOD_NOT_FOUND_RE.search(str(remote_module_error))
@@ -634,16 +643,4 @@ with zipfile.ZipFile(buf, "r") as zf:
         else:
             output = ""
-        # result_tag slicing
-        if result_tag:
-            start = output.find(result_tag)
-            if start != -1:
-                content_start = start + len(result_tag)
-                end = output.find(result_tag, content_start)
-                if end != -1:
-                    before = output[:start].strip()
-                    if before and print_stdout:
-                        print(before)
-                    return output[content_start:end]
         return output

{ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/workspace.py RENAMED Viewed

@@ -220,7 +220,6 @@ class Workspace:
         instance = self.clone_instance() if clone else self
         require_databricks_sdk()
-        logger.debug("Connecting %s", self)
         # Build Config from config_dict if available, else from fields.
         kwargs = {
@@ -291,8 +290,6 @@ class Workspace:
                 if v is not None:
                     setattr(instance, key, v)
-        logger.info("Connected %s", instance)
         return instance
     # ------------------------------------------------------------------ #

{ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/callable_serde.py RENAMED Viewed

@@ -1,4 +1,23 @@
-"""Callable serialization helpers for cross-process execution."""
+"""Callable serialization helpers for cross-process execution.
+Design goals:
+- Prefer import-by-reference when possible (module + qualname), fallback to dill.
+- Optional environment payload: selected globals and/or closure values.
+- Cross-process bridge: generate a self-contained Python command string that:
+    1) materializes the callable
+    2) decodes args/kwargs payload
+    3) executes
+    4) emits a single tagged base64 line with a compressed result blob
+Compression/framing:
+- CS2 framing only (no CS1 logic).
+- Frame header: MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data
+- Codecs:
+    0 raw (rarely used; mostly means "no frame")
+    1 zlib
+    2 lzma
+    3 zstd (optional dependency)
+"""
 from __future__ import annotations
@@ -7,30 +26,33 @@ import binascii
 import dis
 import importlib
 import inspect
+import io
 import lzma
 import os
+import secrets
 import struct
 import sys
 import zlib
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Set, Tuple, TypeVar, Union, Iterable
+from typing import Any, Callable, Dict, Iterable, Optional, Set, Tuple, TypeVar, Union, TYPE_CHECKING
 import dill
+if TYPE_CHECKING:
+    from ..databricks.workspaces import Workspace
 __all__ = ["CallableSerde"]
 T = TypeVar("T", bound="CallableSerde")
-# ---------- internal helpers ----------
-_MAGIC_V1 = b"CS1"  # legacy framing v1: zlib only (FLAG_COMPRESSED)
-_MAGIC_V2 = b"CS2"  # new framing v2: codec-aware
+# ---------------------------
+# Framing / compression (CS2)
+# ---------------------------
-_FLAG_COMPRESSED = 1  # legacy CS1 meaning
+_MAGIC = b"CS2"
-# CS2 codecs (u8)
-_CODEC_RAW  = 0
+_CODEC_RAW = 0
 _CODEC_ZLIB = 1
 _CODEC_LZMA = 2
 _CODEC_ZSTD = 3
@@ -44,6 +66,102 @@ def _try_import_zstd():
         return None
+def _pick_zlib_level(n: int, limit: int) -> int:
+    """Ramp compression level 1..9 based on how far we exceed the byte_limit."""
+    ratio = n / max(1, limit)
+    x = min(1.0, max(0.0, (ratio - 1.0) / 3.0))
+    return max(1, min(9, int(round(1 + 8 * x))))
+def _frame(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
+    return _MAGIC + struct.pack(">BIB", int(codec) & 0xFF, int(orig_len), int(param) & 0xFF) + payload
+def _encode_with_candidates(raw: bytes, *, byte_limit: int, allow_zstd: bool) -> bytes:
+    """Choose the smallest among available codecs; fall back to raw if not beneficial."""
+    if len(raw) <= byte_limit:
+        return raw
+    candidates: list[bytes] = []
+    if allow_zstd:
+        zstd = _try_import_zstd()
+        if zstd is not None:
+            for lvl in (6, 10, 15):
+                try:
+                    c = zstd.ZstdCompressor(level=lvl).compress(raw)
+                    candidates.append(_frame(_CODEC_ZSTD, len(raw), lvl, c))
+                except Exception:
+                    pass
+    for preset in (6, 9):
+        try:
+            c = lzma.compress(raw, preset=preset)
+            candidates.append(_frame(_CODEC_LZMA, len(raw), preset, c))
+        except Exception:
+            pass
+    lvl = _pick_zlib_level(len(raw), byte_limit)
+    try:
+        c = zlib.compress(raw, lvl)
+        candidates.append(_frame(_CODEC_ZLIB, len(raw), lvl, c))
+    except Exception:
+        pass
+    if not candidates:
+        return raw
+    best = min(candidates, key=len)
+    return best if len(best) < len(raw) else raw
+def _encode_result_blob(raw: bytes, byte_limit: int) -> bytes:
+    """Result payload: zstd (if available) -> lzma -> zlib."""
+    return _encode_with_candidates(raw, byte_limit=byte_limit, allow_zstd=True)
+def _encode_wire_blob_stdlib(raw: bytes, byte_limit: int) -> bytes:
+    """Wire payload (args/kwargs): stdlib-only (lzma -> zlib)."""
+    return _encode_with_candidates(raw, byte_limit=byte_limit, allow_zstd=False)
+def _decode_result_blob(blob: bytes) -> bytes:
+    """Decode raw or CS2 framed data (no CS1 support)."""
+    if not isinstance(blob, (bytes, bytearray)) or len(blob) < 3:
+        return blob  # type: ignore[return-value]
+    if not blob.startswith(_MAGIC):
+        return blob
+    if len(blob) < 3 + 6:
+        raise ValueError("CS2 framed blob too short / truncated.")
+    codec, orig_len, _param = struct.unpack(">BIB", blob[3 : 3 + 6])
+    data = blob[3 + 6 :]
+    if codec == _CODEC_RAW:
+        raw = data
+    elif codec == _CODEC_ZLIB:
+        raw = zlib.decompress(data)
+    elif codec == _CODEC_LZMA:
+        raw = lzma.decompress(data)
+    elif codec == _CODEC_ZSTD:
+        zstd = _try_import_zstd()
+        if zstd is None:
+            raise RuntimeError("CS2 uses zstd but 'zstandard' is not installed.")
+        raw = zstd.ZstdDecompressor().decompress(data, max_output_size=int(orig_len) if orig_len else 0)
+    else:
+        raise ValueError(f"Unknown CS2 codec: {codec}")
+    if orig_len and len(raw) != orig_len:
+        raise ValueError(f"Decoded length mismatch: got {len(raw)}, expected {orig_len}")
+    return raw
+# ---------------------------
+# Callable reference helpers
+# ---------------------------
 def _resolve_attr_chain(mod: Any, qualname: str) -> Any:
     obj = mod
     for part in qualname.split("."):
@@ -109,147 +227,9 @@ def _is_importable_reference(fn: Callable[..., Any]) -> bool:
         return False
-def _pick_zlib_level(n: int, limit: int) -> int:
-    ratio = n / max(1, limit)
-    x = min(1.0, max(0.0, (ratio - 1.0) / 3.0))
-    return max(1, min(9, int(round(1 + 8 * x))))
-def _frame_v2(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
-    # Frame: MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data
-    return _MAGIC_V2 + struct.pack(">BIB", int(codec) & 0xFF, int(orig_len), int(param) & 0xFF) + payload
-def _encode_result_blob(raw: bytes, byte_limit: int) -> bytes:
-    """
-    Result payload (remote -> host):
-      - If small: return raw dill bytes (no framing)
-      - Else: try strongest available codecs and pick smallest:
-          zstd (if installed) -> lzma -> zlib
-      - Frame as CS2(codec, orig_len, param) + payload
-    Back-compat: decoder also supports legacy CS1 frames.
-    """
-    if len(raw) <= byte_limit:
-        return raw
-    candidates: list[bytes] = []
-    # zstd (best tradeoff, optional dependency)
-    zstd = _try_import_zstd()
-    if zstd is not None:
-        for lvl in (6, 10, 15):
-            try:
-                c = zstd.ZstdCompressor(level=lvl).compress(raw)
-                candidates.append(_frame_v2(_CODEC_ZSTD, len(raw), lvl, c))
-            except Exception:
-                pass
-    # lzma (stdlib, strong, slower)
-    for preset in (6, 9):
-        try:
-            c = lzma.compress(raw, preset=preset)
-            candidates.append(_frame_v2(_CODEC_LZMA, len(raw), preset, c))
-        except Exception:
-            pass
-    # zlib (stdlib, weaker)
-    lvl = _pick_zlib_level(len(raw), byte_limit)
-    try:
-        c = zlib.compress(raw, lvl)
-        candidates.append(_frame_v2(_CODEC_ZLIB, len(raw), lvl, c))
-    except Exception:
-        pass
-    best = min(candidates, key=len, default=b"")
-    if not best or len(best) >= len(raw):
-        return raw
-    return best
-def _encode_wire_blob_stdlib(raw: bytes, byte_limit: int) -> bytes:
-    """
-    Input payload (host -> remote):
-    MUST be decodable on a vanilla Python. So: lzma (if available) -> zlib.
-    Same CS2 framing.
-    """
-    if len(raw) <= byte_limit:
-        return raw
-    candidates: list[bytes] = []
-    # lzma may be absent in some minimal builds; guard it
-    for preset in (6, 9):
-        try:
-            c = lzma.compress(raw, preset=preset)
-            candidates.append(_frame_v2(_CODEC_LZMA, len(raw), preset, c))
-        except Exception:
-            pass
-    lvl = _pick_zlib_level(len(raw), byte_limit)
-    try:
-        c = zlib.compress(raw, lvl)
-        candidates.append(_frame_v2(_CODEC_ZLIB, len(raw), lvl, c))
-    except Exception:
-        pass
-    best = min(candidates, key=len, default=b"")
-    if not best or len(best) >= len(raw):
-        return raw
-    return best
-def _decode_result_blob(blob: bytes) -> bytes:
-    """
-    Decode:
-      - raw (no MAGIC) => blob
-      - CS1 legacy => zlib if flagged
-      - CS2 => decode by codec
-    """
-    # raw
-    if not isinstance(blob, (bytes, bytearray)) or len(blob) < 3:
-        return blob  # type: ignore[return-value]
-    # ---- legacy CS1 ----
-    if blob.startswith(_MAGIC_V1):
-        if len(blob) < 3 + 1 + 4 + 1:
-            raise ValueError("Framed result too short / corrupted (CS1).")
-        flags, orig_len, _level = struct.unpack(">BIB", blob[3 : 3 + 6])
-        data = blob[3 + 6 :]
-        if flags & _FLAG_COMPRESSED:
-            raw = zlib.decompress(data)
-            if orig_len and len(raw) != orig_len:
-                raise ValueError(f"Decompressed length mismatch: got {len(raw)}, expected {orig_len}")
-            return raw
-        return data
-    # ---- new CS2 ----
-    if blob.startswith(_MAGIC_V2):
-        if len(blob) < 3 + 1 + 4 + 1:
-            raise ValueError("Framed result too short / corrupted (CS2).")
-        codec, orig_len, param = struct.unpack(">BIB", blob[3 : 3 + 6])
-        data = blob[3 + 6 :]
-        if codec == _CODEC_RAW:
-            raw = data
-        elif codec == _CODEC_ZLIB:
-            raw = zlib.decompress(data)
-        elif codec == _CODEC_LZMA:
-            raw = lzma.decompress(data)
-        elif codec == _CODEC_ZSTD:
-            zstd = _try_import_zstd()
-            if zstd is None:
-                raise RuntimeError("CS2 payload uses zstd, but 'zstandard' is not installed.")
-            raw = zstd.ZstdDecompressor().decompress(data, max_output_size=int(orig_len) if orig_len else 0)
-        else:
-            raise ValueError(f"Unknown CS2 codec: {codec}")
-        if orig_len and len(raw) != orig_len:
-            raise ValueError(f"Decoded length mismatch: got {len(raw)}, expected {orig_len}")
-        return raw
-    # not framed
-    return blob
+# ---------------------------
+# Environment snapshot
+# ---------------------------
 def _dump_env(
     fn: Callable[..., Any],
@@ -304,10 +284,24 @@ def _dump_env(
     return env, meta
-# ---------- main class ----------
+# ----------
+# Main class
+# ----------
 @dataclass
 class CallableSerde:
+    """
+    Core field: `fn`
+    kind:
+      - "auto": resolve import if possible else dill
+      - "import": module + qualname
+      - "dill": dill_b64
+    Optional env payload:
+      - env_b64: dill(base64) of {"globals": {...}, "closure": {...}}
+    """
     fn: Optional[Callable[..., Any]] = None
     _kind: str = "auto"  # "auto" | "import" | "dill"
@@ -319,12 +313,15 @@ class CallableSerde:
     _env_b64: Optional[str] = None
     _env_meta: Optional[Dict[str, Any]] = None
+    # ----- construction -----
     @classmethod
     def from_callable(cls: type[T], x: Union[Callable[..., Any], T]) -> T:
         if isinstance(x, cls):
             return x
-        obj = cls(fn=x)  # type: ignore[return-value]
-        return obj
+        return cls(fn=x)  # type: ignore[return-value]
+    # ----- properties -----
     @property
     def module(self) -> Optional[str]:
@@ -372,11 +369,13 @@ class CallableSerde:
             return bool(self.module and self.qualname and "<locals>" not in (self.qualname or ""))
         return _is_importable_reference(self.fn)
+    # ----- serde API -----
     def dump(
         self,
         *,
-        prefer: str = "import",
-        dump_env: str = "none",
+        prefer: str = "import",           # "import" | "dill"
+        dump_env: str = "none",           # "none" | "globals" | "closure" | "both"
         filter_used_globals: bool = True,
         env_keys: Optional[Iterable[str]] = None,
         env_variables: Optional[Dict[str, str]] = None,
@@ -418,6 +417,7 @@ class CallableSerde:
                 raise ValueError("dump_env requested but fn is not present.")
             include_globals = dump_env in ("globals", "both")
             include_closure = dump_env in ("closure", "both")
             env, meta = _dump_env(
                 self.fn,
                 include_globals=include_globals,
@@ -487,7 +487,9 @@ class CallableSerde:
         fn = self.materialize()
         return fn(*args, **kwargs)
-    # ----- command execution bridge -----
+    # -------------------------
+    # Command execution bridge
+    # -------------------------
     def to_command(
         self,
@@ -496,16 +498,19 @@ class CallableSerde:
         *,
         result_tag: str = "__CALLABLE_SERDE_RESULT__",
         prefer: str = "dill",
-        byte_limit: int = 4 * 1024,
-        dump_env: str = "none",  # "none" | "globals" | "closure" | "both"
+        byte_limit: int = 64 * 1024,
+        dump_env: str = "none",           # "none" | "globals" | "closure" | "both"
         filter_used_globals: bool = True,
         env_keys: Optional[Iterable[str]] = None,
         env_variables: Optional[Dict[str, str]] = None,
+        file_dump_limit: int = 512 * 1024,
+        transaction_id: Optional[str] = None
     ) -> str:
         """
         Returns Python code string to execute in another interpreter.
-        Prints one line: "{result_tag}:{base64(blob)}"
-        where blob is raw dill bytes or framed (CS1/CS2).
+        Emits exactly one line to stdout:
+            "{result_tag}:{base64(blob)}\\n"
+        where blob is raw dill bytes or CS2 framed.
         """
         import json
@@ -521,29 +526,30 @@ class CallableSerde:
         )
         serde_json = json.dumps(serde_dict, ensure_ascii=False)
-        # Encode (args, kwargs) with stdlib-only strategy so remote can always decode.
+        # args/kwargs payload: stdlib-only compression (lzma/zlib)
         call_raw = dill.dumps((args, kwargs), recurse=True)
-        # Use your local encoder for wire payload (stdlib only)
         call_blob = _encode_wire_blob_stdlib(call_raw, int(byte_limit))
         call_payload_b64 = base64.b64encode(call_blob).decode("ascii")
+        transaction_id = transaction_id or secrets.token_urlsafe(16)
         template = r"""
 import base64, json, os, sys
 import dill
+import pandas
-# thin import from your real module
+from yggdrasil.databricks import Workspace
 from yggdrasil.pyutils.callable_serde import (
     CallableSerde,
-    _decode_result_blob,     # decodes raw/CS1/CS2
-    _encode_result_blob,     # encodes result with strongest available
+    _decode_result_blob,
+    _encode_result_blob,
 )
 RESULT_TAG = __RESULT_TAG__
 BYTE_LIMIT = __BYTE_LIMIT__
+FILE_DUMP_LIMIT = __FILE_DUMP_LIMIT__
+TRANSACTION_ID = __TRANSACTION_ID__
 def _needed_globals(fn) -> set[str]:
-    # keep this tiny + local; doesn’t need full module internals
     import dis
     names = set()
     try:
@@ -566,47 +572,63 @@ def _apply_env(fn, env: dict, filter_used: bool):
         return
     env_g = env.get("globals") or {}
-    if env_g:
-        if filter_used:
-            needed = _needed_globals(fn)
-            for name in needed:
-                if name in env_g:
-                    g.setdefault(name, env_g[name])
-        else:
-            for name, val in env_g.items():
-                g.setdefault(name, val)
+    if not env_g:
+        return
+    if filter_used:
+        needed = _needed_globals(fn)
+        for name in needed:
+            if name in env_g:
+                g.setdefault(name, env_g[name])
+    else:
+        for name, val in env_g.items():
+            g.setdefault(name, val)
 serde = json.loads(__SERDE_JSON__)
-# materialize callable
 cs = CallableSerde.load(serde, add_pkg_root_to_syspath=True)
 fn = cs.materialize(add_pkg_root_to_syspath=True)
-# apply os env vars (if present)
 osenv = serde.get("osenv")
 if osenv:
     for k, v in osenv.items():
         os.environ[k] = v
-# apply dill'd env payload (if present)
 env_b64 = serde.get("env_b64")
 if env_b64:
     env = dill.loads(base64.b64decode(env_b64))
     meta = serde.get("env_meta") or {}
     _apply_env(fn, env, bool(meta.get("filter_used_globals", True)))
-# decode call payload
 call_blob = base64.b64decode(__CALL_PAYLOAD_B64__)
 call_raw = _decode_result_blob(call_blob)
 args, kwargs = dill.loads(call_raw)
-# execute
 res = fn(*args, **kwargs)
-# encode + print result
-raw = dill.dumps(res)
-blob = _encode_result_blob(raw, BYTE_LIMIT)
-print(f"{RESULT_TAG}:{base64.b64encode(blob).decode('ascii')}")
+if isinstance(res, pandas.DataFrame):
+    dump_path = Workspace().shared_cache_path("/cmd/" + TRANSACTION_ID + ".parquet")
+    with dump_path.open(mode="wb") as f:
+        res.to_parquet(f)
+    blob = "DBXPATH:" + str(dump_path)
+else:
+    raw = dill.dumps(res)
+    blob = _encode_result_blob(raw, BYTE_LIMIT)
+    if len(blob) > FILE_DUMP_LIMIT:
+        dump_path = Workspace().shared_cache_path("/cmd/" + TRANSACTION_ID)
+        with dump_path.open(mode="wb") as f:
+            f.write_all_bytes(data=blob)
+        blob = "DBXPATH:" + str(dump_path)
+    else:
+        blob = base64.b64encode(blob).decode('ascii')
+sys.stdout.write(f"{RESULT_TAG}:{len(blob)}:{blob}\n")
+sys.stdout.flush()
 """
         return (
@@ -615,27 +637,91 @@ print(f"{RESULT_TAG}:{base64.b64encode(blob).decode('ascii')}")
             .replace("__BYTE_LIMIT__", str(int(byte_limit)))
             .replace("__SERDE_JSON__", repr(serde_json))
             .replace("__CALL_PAYLOAD_B64__", repr(call_payload_b64))
+            .replace("__FILE_DUMP_LIMIT__", str(int(file_dump_limit)))
+            .replace("__TRANSACTION_ID__", repr(str(transaction_id)))
         )
     @staticmethod
-    def parse_command_result(output: str, *, result_tag: str = "__CALLABLE_SERDE_RESULT__") -> Any:
+    def parse_command_result(
+        output: str,
+        *,
+        result_tag: str = "__CALLABLE_SERDE_RESULT__",
+        workspace: Optional["Workspace"] = None
+    ) -> Any:
+        """
+        Expect last tagged line:
+            "{result_tag}:{blob_nbytes}:{b64}"
+        We use blob_nbytes to compute expected base64 char length and detect truncation
+        before decoding/decompressing.
+        """
         prefix = f"{result_tag}:"
         if prefix not in output:
             raise ValueError(f"Result tag not found in output: {result_tag}")
-        # last tagged line, first line after it
-        _, b64 = output.rsplit(prefix, 1)
+        # Grab everything after the LAST occurrence of the tag
+        _, tail = output.rsplit(prefix, 1)
-        if not b64:
-            raise ValueError(f"Found result tag {result_tag} but payload is empty")
+        # Parse "{nbytes}:{b64}"
+        try:
+            nbytes_str, string_result = tail.split(":", 1)
+        except ValueError as e:
+            raise ValueError(
+                f"Malformed result line after tag {result_tag}. "
+                "Expected '{tag}:{nbytes}:{b64}'."
+            ) from e
         try:
-            blob = base64.b64decode(b64.encode("ascii"))
-        except (UnicodeEncodeError, binascii.Error) as e:
-            raise ValueError("Invalid base64 payload after result tag") from e
+            content_length = int(nbytes_str)
+        except ValueError as e:
+            raise ValueError(f"Malformed byte count '{nbytes_str}' after tag {result_tag}") from e
+        if content_length < 0:
+            raise ValueError(f"Negative byte count {content_length} after tag {result_tag}")
+        string_result = string_result[:content_length]
+        if len(string_result) != content_length:
+            raise ValueError(
+                "Got truncated result content from command, got %s bytes and expected %s bytes" % (
+                    len(string_result),
+                    content_length
+                )
+            )
+        if string_result.startswith("DBXPATH:"):
+            from ..databricks.workspaces import Workspace
+            workspace = Workspace() if workspace is None else workspace
+            path = workspace.dbfs_path(
+                string_result.replace("DBXPATH:", "")
+            )
+            if path.name.endswith(".parquet"):
+                import pandas
+                with path.open(mode="rb") as f:
+                    buf = io.BytesIO(f.read_all_bytes())
+                path.rmfile()
+                buf.seek(0)
+                return pandas.read_parquet(buf)
+            with path.open(mode="rb") as f:
+                blob = f.read_all_bytes()
+            path.rmfile()
+        else:
+            # Strict base64 decode (rejects junk chars)
+            try:
+                blob = base64.b64decode(string_result.encode("ascii"), validate=True)
+            except (UnicodeEncodeError, binascii.Error) as e:
+                raise ValueError("Invalid base64 payload after result tag (corrupted/contaminated).") from e
         raw = _decode_result_blob(blob)
         try:
-            return dill.loads(raw)
+            result = dill.loads(raw)
         except Exception as e:
             raise ValueError("Failed to dill.loads decoded payload") from e
+        return result