ygg 0.1.45__tar.gz → 0.1.46__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.45 → ygg-0.1.46}/PKG-INFO +1 -1
- {ygg-0.1.45 → ygg-0.1.46}/pyproject.toml +1 -1
- {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/PKG-INFO +1 -1
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/execution_context.py +11 -14
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/workspace.py +0 -3
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/callable_serde.py +280 -194
- ygg-0.1.46/src/yggdrasil/version.py +1 -0
- ygg-0.1.45/src/yggdrasil/version.py +0 -1
- {ygg-0.1.45 → ygg-0.1.46}/LICENSE +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/README.md +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/setup.cfg +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/SOURCES.txt +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/dependency_links.txt +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/entry_points.txt +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/requires.txt +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/top_level.txt +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/cluster.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/remote.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/jobs/config.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/engine.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/exceptions.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/statement_result.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/types.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/warehouse.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/io.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/path.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/path_kind.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/dataclasses/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/dataclasses/dataclass.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/databrickslib.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/extensions/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/pandaslib.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/polarslib.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/sparklib.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/equality.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/exceptions.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/modules.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/parallel.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/python_env.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/retry.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/requests/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/requests/msal.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/requests/session.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/__init__.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/cast_options.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/polars_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/registry.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/spark_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/libs.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/python_arrow.py +0 -0
- {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/python_defaults.py +0 -0
{ygg-0.1.45 → ygg-0.1.46}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ygg"
-version = "0.1.45"
+version = "0.1.46"
 description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }
{ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/execution_context.py
@@ -12,6 +12,7 @@ import re
 import sys
 import threading
 import zipfile
+from threading import Thread
 from types import ModuleType
 from typing import TYPE_CHECKING, Optional, Any, Callable, List, Dict, Union, Iterable, Tuple
 
@@ -114,7 +115,11 @@ class ExecutionContext:
 
     def __del__(self):
         """Best-effort cleanup for the remote execution context."""
-        self.close()
+        if self.context_id:
+            try:
+                Thread(target=self.close).start()
+            except BaseException:
+                pass
 
     @property
     def remote_metadata(self) -> RemoteMetadata:
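The `__del__` change is worth a note: `close()` is now fired on a background thread, and only when a `context_id` exists, so garbage collection never blocks on (or raises through) a remote teardown call. A minimal sketch of the same pattern, using a hypothetical `Resource` class rather than the package's `ExecutionContext`:

from threading import Thread

class Resource:
    """Hypothetical stand-in for an object owning a remote handle."""

    def __init__(self, context_id: str):
        self.context_id = context_id

    def close(self) -> None:
        # The real class would call a remote API here; we just log.
        print(f"closed {self.context_id}")

    def __del__(self):
        # Best-effort cleanup: skip if never initialized, never block the GC,
        # and swallow errors raised during interpreter shutdown.
        if getattr(self, "context_id", None):
            try:
                Thread(target=self.close).start()
            except BaseException:
                pass

Resource("ctx-123")  # dropped immediately; __del__ schedules close()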
@@ -380,7 +385,11 @@ print(json.dumps(meta))"""
         )
 
         try:
-            result = serialized.parse_command_result(raw_result, result_tag=result_tag)
+            result = serialized.parse_command_result(
+                raw_result,
+                result_tag=result_tag,
+                workspace=self.cluster.workspace
+            )
         except ModuleNotFoundError as remote_module_error:
             _MOD_NOT_FOUND_RE = re.compile(r"No module named ['\"]([^'\"]+)['\"]")
             module_name = _MOD_NOT_FOUND_RE.search(str(remote_module_error))
@@ -634,16 +643,4 @@ with zipfile.ZipFile(buf, "r") as zf:
         else:
             output = ""
 
-        # result_tag slicing
-        if result_tag:
-            start = output.find(result_tag)
-            if start != -1:
-                content_start = start + len(result_tag)
-                end = output.find(result_tag, content_start)
-                if end != -1:
-                    before = output[:start].strip()
-                    if before and print_stdout:
-                        print(before)
-                    return output[content_start:end]
-
         return output
{ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/workspace.py
@@ -220,7 +220,6 @@ class Workspace:
         instance = self.clone_instance() if clone else self
 
         require_databricks_sdk()
-        logger.debug("Connecting %s", self)
 
         # Build Config from config_dict if available, else from fields.
         kwargs = {
@@ -291,8 +290,6 @@ class Workspace:
             if v is not None:
                 setattr(instance, key, v)
 
-        logger.info("Connected %s", instance)
-
         return instance
 
     # ------------------------------------------------------------------ #
{ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/callable_serde.py
@@ -1,4 +1,23 @@
-"""Callable serialization helpers for cross-process execution."""
+"""Callable serialization helpers for cross-process execution.
+
+Design goals:
+- Prefer import-by-reference when possible (module + qualname), fallback to dill.
+- Optional environment payload: selected globals and/or closure values.
+- Cross-process bridge: generate a self-contained Python command string that:
+    1) materializes the callable
+    2) decodes args/kwargs payload
+    3) executes
+    4) emits a single tagged base64 line with a compressed result blob
+
+Compression/framing:
+- CS2 framing only (no CS1 logic).
+- Frame header: MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data
+- Codecs:
+    0 raw (rarely used; mostly means "no frame")
+    1 zlib
+    2 lzma
+    3 zstd (optional dependency)
+"""
 
 from __future__ import annotations
 
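The frame layout in the new docstring is easy to pin down with the stdlib alone. A minimal sketch of the CS2 header described above (MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data), using zlib as the codec; the constant names mirror the module, but this is an illustration, not the package code:

import struct
import zlib

MAGIC = b"CS2"
CODEC_ZLIB = 1

def frame(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
    # ">BIB" = big-endian: codec (u8), original length (u32), codec param (u8)
    return MAGIC + struct.pack(">BIB", codec, orig_len, param) + payload

def unframe(blob: bytes) -> bytes:
    assert blob[:3] == MAGIC
    codec, orig_len, _param = struct.unpack(">BIB", blob[3:9])
    raw = zlib.decompress(blob[9:]) if codec == CODEC_ZLIB else blob[9:]
    assert len(raw) == orig_len  # the header doubles as an integrity check
    return raw

data = b"hello world" * 1000
blob = frame(CODEC_ZLIB, len(data), 6, zlib.compress(data, 6))
assert unframe(blob) == data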
@@ -7,30 +26,33 @@ import binascii
 import dis
 import importlib
 import inspect
+import io
 import lzma
 import os
+import secrets
 import struct
 import sys
 import zlib
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Set, Tuple, TypeVar, Union, Iterable
+from typing import Any, Callable, Dict, Iterable, Optional, Set, Tuple, TypeVar, Union, TYPE_CHECKING
 
 import dill
 
+if TYPE_CHECKING:
+    from ..databricks.workspaces import Workspace
+
 __all__ = ["CallableSerde"]
 
 T = TypeVar("T", bound="CallableSerde")
 
-#
-
-
-_MAGIC_V2 = b"CS2"  # new framing v2: codec-aware
+# ---------------------------
+# Framing / compression (CS2)
+# ---------------------------
 
-
+_MAGIC = b"CS2"
 
-
-_CODEC_RAW = 0
+_CODEC_RAW = 0
 _CODEC_ZLIB = 1
 _CODEC_LZMA = 2
 _CODEC_ZSTD = 3
@@ -44,6 +66,102 @@ def _try_import_zstd():
     return None
 
 
+def _pick_zlib_level(n: int, limit: int) -> int:
+    """Ramp compression level 1..9 based on how far we exceed the byte_limit."""
+    ratio = n / max(1, limit)
+    x = min(1.0, max(0.0, (ratio - 1.0) / 3.0))
+    return max(1, min(9, int(round(1 + 8 * x))))
+
+
+def _frame(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
+    return _MAGIC + struct.pack(">BIB", int(codec) & 0xFF, int(orig_len), int(param) & 0xFF) + payload
+
+
+def _encode_with_candidates(raw: bytes, *, byte_limit: int, allow_zstd: bool) -> bytes:
+    """Choose the smallest among available codecs; fall back to raw if not beneficial."""
+    if len(raw) <= byte_limit:
+        return raw
+
+    candidates: list[bytes] = []
+
+    if allow_zstd:
+        zstd = _try_import_zstd()
+        if zstd is not None:
+            for lvl in (6, 10, 15):
+                try:
+                    c = zstd.ZstdCompressor(level=lvl).compress(raw)
+                    candidates.append(_frame(_CODEC_ZSTD, len(raw), lvl, c))
+                except Exception:
+                    pass
+
+    for preset in (6, 9):
+        try:
+            c = lzma.compress(raw, preset=preset)
+            candidates.append(_frame(_CODEC_LZMA, len(raw), preset, c))
+        except Exception:
+            pass
+
+    lvl = _pick_zlib_level(len(raw), byte_limit)
+    try:
+        c = zlib.compress(raw, lvl)
+        candidates.append(_frame(_CODEC_ZLIB, len(raw), lvl, c))
+    except Exception:
+        pass
+
+    if not candidates:
+        return raw
+
+    best = min(candidates, key=len)
+    return best if len(best) < len(raw) else raw
+
+
+def _encode_result_blob(raw: bytes, byte_limit: int) -> bytes:
+    """Result payload: zstd (if available) -> lzma -> zlib."""
+    return _encode_with_candidates(raw, byte_limit=byte_limit, allow_zstd=True)
+
+
+def _encode_wire_blob_stdlib(raw: bytes, byte_limit: int) -> bytes:
+    """Wire payload (args/kwargs): stdlib-only (lzma -> zlib)."""
+    return _encode_with_candidates(raw, byte_limit=byte_limit, allow_zstd=False)
+
+
+def _decode_result_blob(blob: bytes) -> bytes:
+    """Decode raw or CS2 framed data (no CS1 support)."""
+    if not isinstance(blob, (bytes, bytearray)) or len(blob) < 3:
+        return blob  # type: ignore[return-value]
+
+    if not blob.startswith(_MAGIC):
+        return blob
+
+    if len(blob) < 3 + 6:
+        raise ValueError("CS2 framed blob too short / truncated.")
+
+    codec, orig_len, _param = struct.unpack(">BIB", blob[3 : 3 + 6])
+    data = blob[3 + 6 :]
+
+    if codec == _CODEC_RAW:
+        raw = data
+    elif codec == _CODEC_ZLIB:
+        raw = zlib.decompress(data)
+    elif codec == _CODEC_LZMA:
+        raw = lzma.decompress(data)
+    elif codec == _CODEC_ZSTD:
+        zstd = _try_import_zstd()
+        if zstd is None:
+            raise RuntimeError("CS2 uses zstd but 'zstandard' is not installed.")
+        raw = zstd.ZstdDecompressor().decompress(data, max_output_size=int(orig_len) if orig_len else 0)
+    else:
+        raise ValueError(f"Unknown CS2 codec: {codec}")
+
+    if orig_len and len(raw) != orig_len:
+        raise ValueError(f"Decoded length mismatch: got {len(raw)}, expected {orig_len}")
+    return raw
+
+
+# ---------------------------
+# Callable reference helpers
+# ---------------------------
+
 def _resolve_attr_chain(mod: Any, qualname: str) -> Any:
     obj = mod
     for part in qualname.split("."):
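Assuming these private helpers stay importable (the generated command template below imports them the same way), a plausible round-trip looks like this; the 1 KiB byte_limit is only there to force compression:

import dill

from yggdrasil.pyutils.callable_serde import (
    _decode_result_blob,
    _encode_wire_blob_stdlib,
)

raw = dill.dumps(list(range(10_000)))        # comfortably over 1 KiB
blob = _encode_wire_blob_stdlib(raw, 1024)   # stdlib codecs only (lzma/zlib)
assert blob.startswith(b"CS2")               # framed, since compression helped
assert _decode_result_blob(blob) == raw      # decoder is codec-agnostic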
@@ -109,147 +227,9 @@ def _is_importable_reference(fn: Callable[..., Any]) -> bool:
     return False
 
 
-
-
-
-    return max(1, min(9, int(round(1 + 8 * x))))
-
-
-def _frame_v2(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
-    # Frame: MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data
-    return _MAGIC_V2 + struct.pack(">BIB", int(codec) & 0xFF, int(orig_len), int(param) & 0xFF) + payload
-
-
-def _encode_result_blob(raw: bytes, byte_limit: int) -> bytes:
-    """
-    Result payload (remote -> host):
-      - If small: return raw dill bytes (no framing)
-      - Else: try strongest available codecs and pick smallest:
-          zstd (if installed) -> lzma -> zlib
-      - Frame as CS2(codec, orig_len, param) + payload
-    Back-compat: decoder also supports legacy CS1 frames.
-    """
-    if len(raw) <= byte_limit:
-        return raw
-
-    candidates: list[bytes] = []
-
-    # zstd (best tradeoff, optional dependency)
-    zstd = _try_import_zstd()
-    if zstd is not None:
-        for lvl in (6, 10, 15):
-            try:
-                c = zstd.ZstdCompressor(level=lvl).compress(raw)
-                candidates.append(_frame_v2(_CODEC_ZSTD, len(raw), lvl, c))
-            except Exception:
-                pass
-
-    # lzma (stdlib, strong, slower)
-    for preset in (6, 9):
-        try:
-            c = lzma.compress(raw, preset=preset)
-            candidates.append(_frame_v2(_CODEC_LZMA, len(raw), preset, c))
-        except Exception:
-            pass
-
-    # zlib (stdlib, weaker)
-    lvl = _pick_zlib_level(len(raw), byte_limit)
-    try:
-        c = zlib.compress(raw, lvl)
-        candidates.append(_frame_v2(_CODEC_ZLIB, len(raw), lvl, c))
-    except Exception:
-        pass
-
-    best = min(candidates, key=len, default=b"")
-    if not best or len(best) >= len(raw):
-        return raw
-    return best
-
-
-def _encode_wire_blob_stdlib(raw: bytes, byte_limit: int) -> bytes:
-    """
-    Input payload (host -> remote):
-    MUST be decodable on a vanilla Python. So: lzma (if available) -> zlib.
-    Same CS2 framing.
-    """
-    if len(raw) <= byte_limit:
-        return raw
-
-    candidates: list[bytes] = []
-
-    # lzma may be absent in some minimal builds; guard it
-    for preset in (6, 9):
-        try:
-            c = lzma.compress(raw, preset=preset)
-            candidates.append(_frame_v2(_CODEC_LZMA, len(raw), preset, c))
-        except Exception:
-            pass
-
-    lvl = _pick_zlib_level(len(raw), byte_limit)
-    try:
-        c = zlib.compress(raw, lvl)
-        candidates.append(_frame_v2(_CODEC_ZLIB, len(raw), lvl, c))
-    except Exception:
-        pass
-
-    best = min(candidates, key=len, default=b"")
-    if not best or len(best) >= len(raw):
-        return raw
-    return best
-
-
-def _decode_result_blob(blob: bytes) -> bytes:
-    """
-    Decode:
-      - raw (no MAGIC) => blob
-      - CS1 legacy => zlib if flagged
-      - CS2 => decode by codec
-    """
-    # raw
-    if not isinstance(blob, (bytes, bytearray)) or len(blob) < 3:
-        return blob  # type: ignore[return-value]
-
-    # ---- legacy CS1 ----
-    if blob.startswith(_MAGIC_V1):
-        if len(blob) < 3 + 1 + 4 + 1:
-            raise ValueError("Framed result too short / corrupted (CS1).")
-        flags, orig_len, _level = struct.unpack(">BIB", blob[3 : 3 + 6])
-        data = blob[3 + 6 :]
-        if flags & _FLAG_COMPRESSED:
-            raw = zlib.decompress(data)
-            if orig_len and len(raw) != orig_len:
-                raise ValueError(f"Decompressed length mismatch: got {len(raw)}, expected {orig_len}")
-            return raw
-        return data
-
-    # ---- new CS2 ----
-    if blob.startswith(_MAGIC_V2):
-        if len(blob) < 3 + 1 + 4 + 1:
-            raise ValueError("Framed result too short / corrupted (CS2).")
-        codec, orig_len, param = struct.unpack(">BIB", blob[3 : 3 + 6])
-        data = blob[3 + 6 :]
-
-        if codec == _CODEC_RAW:
-            raw = data
-        elif codec == _CODEC_ZLIB:
-            raw = zlib.decompress(data)
-        elif codec == _CODEC_LZMA:
-            raw = lzma.decompress(data)
-        elif codec == _CODEC_ZSTD:
-            zstd = _try_import_zstd()
-            if zstd is None:
-                raise RuntimeError("CS2 payload uses zstd, but 'zstandard' is not installed.")
-            raw = zstd.ZstdDecompressor().decompress(data, max_output_size=int(orig_len) if orig_len else 0)
-        else:
-            raise ValueError(f"Unknown CS2 codec: {codec}")
-
-        if orig_len and len(raw) != orig_len:
-            raise ValueError(f"Decoded length mismatch: got {len(raw)}, expected {orig_len}")
-        return raw
-
-    # not framed
-    return blob
-
-
+# ---------------------------
+# Environment snapshot
+# ---------------------------
 
 def _dump_env(
     fn: Callable[..., Any],
@@ -304,10 +284,24 @@ def _dump_env(
     return env, meta
 
 
-# ----------
+# ----------
+# Main class
+# ----------
 
 @dataclass
 class CallableSerde:
+    """
+    Core field: `fn`
+
+    kind:
+      - "auto": resolve import if possible else dill
+      - "import": module + qualname
+      - "dill": dill_b64
+
+    Optional env payload:
+      - env_b64: dill(base64) of {"globals": {...}, "closure": {...}}
+    """
+
     fn: Optional[Callable[..., Any]] = None
 
     _kind: str = "auto"  # "auto" | "import" | "dill"
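Inferred host-side usage, based only on the calls visible elsewhere in this diff (`from_callable`, `dump`, `load`, `materialize`); the exact return shape of `dump()` is not shown here, so treat this as a sketch:

from yggdrasil.pyutils.callable_serde import CallableSerde

def add(a: int, b: int) -> int:
    return a + b

cs = CallableSerde.from_callable(add)
payload = cs.dump(prefer="import")               # import-by-reference when possible
fn = CallableSerde.load(payload).materialize()   # rebuild in another process
assert fn(2, 3) == 5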
@@ -319,12 +313,15 @@ class CallableSerde:
     _env_b64: Optional[str] = None
     _env_meta: Optional[Dict[str, Any]] = None
 
+    # ----- construction -----
+
     @classmethod
     def from_callable(cls: type[T], x: Union[Callable[..., Any], T]) -> T:
         if isinstance(x, cls):
             return x
-
-
+        return cls(fn=x)  # type: ignore[return-value]
+
+    # ----- properties -----
 
     @property
     def module(self) -> Optional[str]:
@@ -372,11 +369,13 @@ class CallableSerde:
             return bool(self.module and self.qualname and "<locals>" not in (self.qualname or ""))
         return _is_importable_reference(self.fn)
 
+    # ----- serde API -----
+
     def dump(
         self,
         *,
-        prefer: str = "import",
-        dump_env: str = "none",
+        prefer: str = "import",  # "import" | "dill"
+        dump_env: str = "none",  # "none" | "globals" | "closure" | "both"
         filter_used_globals: bool = True,
         env_keys: Optional[Iterable[str]] = None,
         env_variables: Optional[Dict[str, str]] = None,
@@ -418,6 +417,7 @@ class CallableSerde:
             raise ValueError("dump_env requested but fn is not present.")
         include_globals = dump_env in ("globals", "both")
         include_closure = dump_env in ("closure", "both")
+
         env, meta = _dump_env(
             self.fn,
             include_globals=include_globals,
@@ -487,7 +487,9 @@ class CallableSerde:
         fn = self.materialize()
         return fn(*args, **kwargs)
 
-    #
+    # -------------------------
+    # Command execution bridge
+    # -------------------------
 
     def to_command(
         self,
@@ -496,16 +498,19 @@ class CallableSerde:
         *,
         result_tag: str = "__CALLABLE_SERDE_RESULT__",
         prefer: str = "dill",
-        byte_limit: int =
-        dump_env: str = "none",
+        byte_limit: int = 64 * 1024,
+        dump_env: str = "none",  # "none" | "globals" | "closure" | "both"
         filter_used_globals: bool = True,
         env_keys: Optional[Iterable[str]] = None,
         env_variables: Optional[Dict[str, str]] = None,
+        file_dump_limit: int = 512 * 1024,
+        transaction_id: Optional[str] = None
     ) -> str:
         """
         Returns Python code string to execute in another interpreter.
-
-
+        Emits exactly one line to stdout:
+            "{result_tag}:{base64(blob)}\\n"
+        where blob is raw dill bytes or CS2 framed.
         """
         import json
 
@@ -521,29 +526,30 @@ class CallableSerde:
         )
         serde_json = json.dumps(serde_dict, ensure_ascii=False)
 
-        #
+        # args/kwargs payload: stdlib-only compression (lzma/zlib)
         call_raw = dill.dumps((args, kwargs), recurse=True)
-
-        # Use your local encoder for wire payload (stdlib only)
         call_blob = _encode_wire_blob_stdlib(call_raw, int(byte_limit))
         call_payload_b64 = base64.b64encode(call_blob).decode("ascii")
+        transaction_id = transaction_id or secrets.token_urlsafe(16)
 
         template = r"""
 import base64, json, os, sys
 import dill
+import pandas
 
-
+from yggdrasil.databricks import Workspace
 from yggdrasil.pyutils.callable_serde import (
     CallableSerde,
-    _decode_result_blob,
-    _encode_result_blob,
+    _decode_result_blob,
+    _encode_result_blob,
 )
 
 RESULT_TAG = __RESULT_TAG__
 BYTE_LIMIT = __BYTE_LIMIT__
+FILE_DUMP_LIMIT = __FILE_DUMP_LIMIT__
+TRANSACTION_ID = __TRANSACTION_ID__
 
 def _needed_globals(fn) -> set[str]:
-    # keep this tiny + local; doesn’t need full module internals
     import dis
     names = set()
     try:
@@ -566,47 +572,63 @@ def _apply_env(fn, env: dict, filter_used: bool):
         return
 
     env_g = env.get("globals") or {}
-    if env_g:
-
-
-
-
-
-
-
-
+    if not env_g:
+        return
+
+    if filter_used:
+        needed = _needed_globals(fn)
+        for name in needed:
+            if name in env_g:
+                g.setdefault(name, env_g[name])
+    else:
+        for name, val in env_g.items():
+            g.setdefault(name, val)
 
 serde = json.loads(__SERDE_JSON__)
 
-# materialize callable
 cs = CallableSerde.load(serde, add_pkg_root_to_syspath=True)
 fn = cs.materialize(add_pkg_root_to_syspath=True)
 
-# apply os env vars (if present)
 osenv = serde.get("osenv")
 if osenv:
     for k, v in osenv.items():
         os.environ[k] = v
 
-# apply dill'd env payload (if present)
 env_b64 = serde.get("env_b64")
 if env_b64:
     env = dill.loads(base64.b64decode(env_b64))
     meta = serde.get("env_meta") or {}
     _apply_env(fn, env, bool(meta.get("filter_used_globals", True)))
 
-# decode call payload
 call_blob = base64.b64decode(__CALL_PAYLOAD_B64__)
 call_raw = _decode_result_blob(call_blob)
 args, kwargs = dill.loads(call_raw)
 
-# execute
 res = fn(*args, **kwargs)
 
-
-
-
-
+if isinstance(res, pandas.DataFrame):
+    dump_path = Workspace().shared_cache_path("/cmd/" + TRANSACTION_ID + ".parquet")
+
+    with dump_path.open(mode="wb") as f:
+        res.to_parquet(f)
+
+    blob = "DBXPATH:" + str(dump_path)
+else:
+    raw = dill.dumps(res)
+    blob = _encode_result_blob(raw, BYTE_LIMIT)
+
+    if len(blob) > FILE_DUMP_LIMIT:
+        dump_path = Workspace().shared_cache_path("/cmd/" + TRANSACTION_ID)
+
+        with dump_path.open(mode="wb") as f:
+            f.write_all_bytes(data=blob)
+
+        blob = "DBXPATH:" + str(dump_path)
+    else:
+        blob = base64.b64encode(blob).decode('ascii')
+
+sys.stdout.write(f"{RESULT_TAG}:{len(blob)}:{blob}\n")
+sys.stdout.flush()
 """
 
         return (
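The template's emission line and the parser in the next hunk form a tiny protocol: one line, "{tag}:{length}:{body}", where the explicit length guards against truncated or interleaved stdout. A self-contained sketch of both ends (not the package code):

import base64
import io

TAG = "__CALLABLE_SERDE_RESULT__"

def emit(payload: bytes, out) -> None:
    body = base64.b64encode(payload).decode("ascii")
    out.write(f"{TAG}:{len(body)}:{body}\n")

def parse(output: str) -> bytes:
    _, tail = output.rsplit(TAG + ":", 1)        # last occurrence wins
    nbytes, rest = tail.split(":", 1)
    body = rest[: int(nbytes)]
    if len(body) != int(nbytes):
        raise ValueError("truncated result content")
    return base64.b64decode(body, validate=True)

buf = io.StringIO()
print("ordinary log line", file=buf)             # noise before the tagged line is fine
emit(b"result-bytes", buf)
assert parse(buf.getvalue()) == b"result-bytes"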
@@ -615,27 +637,91 @@ print(f"{RESULT_TAG}:{base64.b64encode(blob).decode('ascii')}")
             .replace("__BYTE_LIMIT__", str(int(byte_limit)))
             .replace("__SERDE_JSON__", repr(serde_json))
             .replace("__CALL_PAYLOAD_B64__", repr(call_payload_b64))
+            .replace("__FILE_DUMP_LIMIT__", str(int(file_dump_limit)))
+            .replace("__TRANSACTION_ID__", repr(str(transaction_id)))
         )
 
     @staticmethod
-    def parse_command_result(output: str, *, result_tag: str = "__CALLABLE_SERDE_RESULT__") -> Any:
+    def parse_command_result(
+        output: str,
+        *,
+        result_tag: str = "__CALLABLE_SERDE_RESULT__",
+        workspace: Optional["Workspace"] = None
+    ) -> Any:
+        """
+        Expect last tagged line:
+            "{result_tag}:{blob_nbytes}:{b64}"
+
+        We use blob_nbytes to compute expected base64 char length and detect truncation
+        before decoding/decompressing.
+        """
         prefix = f"{result_tag}:"
         if prefix not in output:
            raise ValueError(f"Result tag not found in output: {result_tag}")
 
-        #
-        _,
+        # Grab everything after the LAST occurrence of the tag
+        _, tail = output.rsplit(prefix, 1)
 
-
-
+        # Parse "{nbytes}:{b64}"
+        try:
+            nbytes_str, string_result = tail.split(":", 1)
+        except ValueError as e:
+            raise ValueError(
+                f"Malformed result line after tag {result_tag}. "
+                "Expected '{tag}:{nbytes}:{b64}'."
+            ) from e
 
         try:
-
-        except
-            raise ValueError("
+            content_length = int(nbytes_str)
+        except ValueError as e:
+            raise ValueError(f"Malformed byte count '{nbytes_str}' after tag {result_tag}") from e
+
+        if content_length < 0:
+            raise ValueError(f"Negative byte count {content_length} after tag {result_tag}")
+
+        string_result = string_result[:content_length]
+
+        if len(string_result) != content_length:
+            raise ValueError(
+                "Got truncated result content from command, got %s bytes and expected %s bytes" % (
+                    len(string_result),
+                    content_length
+                )
+            )
+
+        if string_result.startswith("DBXPATH:"):
+            from ..databricks.workspaces import Workspace
+
+            workspace = Workspace() if workspace is None else workspace
+            path = workspace.dbfs_path(
+                string_result.replace("DBXPATH:", "")
+            )
+
+            if path.name.endswith(".parquet"):
+                import pandas
+
+                with path.open(mode="rb") as f:
+                    buf = io.BytesIO(f.read_all_bytes())
+
+                path.rmfile()
+                buf.seek(0)
+                return pandas.read_parquet(buf)
+
+            with path.open(mode="rb") as f:
+                blob = f.read_all_bytes()
+
+            path.rmfile()
+        else:
+            # Strict base64 decode (rejects junk chars)
+            try:
+                blob = base64.b64decode(string_result.encode("ascii"), validate=True)
+            except (UnicodeEncodeError, binascii.Error) as e:
+                raise ValueError("Invalid base64 payload after result tag (corrupted/contaminated).") from e
 
         raw = _decode_result_blob(blob)
         try:
-            return dill.loads(raw)
+            result = dill.loads(raw)
         except Exception as e:
             raise ValueError("Failed to dill.loads decoded payload") from e
+
+        return result
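One detail of the parser above: base64.b64decode(..., validate=True) raises binascii.Error on any character outside the base64 alphabet instead of silently skipping it, which is what lets contaminated command output fail loudly:

import base64
import binascii

assert base64.b64decode(b"aGVsbG8=", validate=True) == b"hello"

try:
    base64.b64decode(b"aGVs\nbG8=", validate=True)   # newline is not in the alphabet
except binascii.Error as e:
    print("rejected contaminated payload:", e)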
ygg-0.1.46/src/yggdrasil/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.46"

ygg-0.1.45/src/yggdrasil/version.py
@@ -1 +0,0 @@
-__version__ = "0.1.45"