ygg 0.1.45__tar.gz → 0.1.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {ygg-0.1.45 → ygg-0.1.46}/PKG-INFO +1 -1
  2. {ygg-0.1.45 → ygg-0.1.46}/pyproject.toml +1 -1
  3. {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/PKG-INFO +1 -1
  4. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/execution_context.py +11 -14
  5. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/workspace.py +0 -3
  6. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/callable_serde.py +280 -194
  7. ygg-0.1.46/src/yggdrasil/version.py +1 -0
  8. ygg-0.1.45/src/yggdrasil/version.py +0 -1
  9. {ygg-0.1.45 → ygg-0.1.46}/LICENSE +0 -0
  10. {ygg-0.1.45 → ygg-0.1.46}/README.md +0 -0
  11. {ygg-0.1.45 → ygg-0.1.46}/setup.cfg +0 -0
  12. {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/SOURCES.txt +0 -0
  13. {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/dependency_links.txt +0 -0
  14. {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/entry_points.txt +0 -0
  15. {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/requires.txt +0 -0
  16. {ygg-0.1.45 → ygg-0.1.46}/src/ygg.egg-info/top_level.txt +0 -0
  17. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/__init__.py +0 -0
  18. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/__init__.py +0 -0
  19. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/__init__.py +0 -0
  20. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/cluster.py +0 -0
  21. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/compute/remote.py +0 -0
  22. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
  23. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/jobs/config.py +0 -0
  24. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/__init__.py +0 -0
  25. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/engine.py +0 -0
  26. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/exceptions.py +0 -0
  27. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/statement_result.py +0 -0
  28. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/types.py +0 -0
  29. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/sql/warehouse.py +0 -0
  30. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
  31. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
  32. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/io.py +0 -0
  33. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/path.py +0 -0
  34. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/databricks/workspaces/path_kind.py +0 -0
  35. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/dataclasses/__init__.py +0 -0
  36. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/dataclasses/dataclass.py +0 -0
  37. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/__init__.py +0 -0
  38. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/databrickslib.py +0 -0
  39. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/extensions/__init__.py +0 -0
  40. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
  41. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
  42. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/pandaslib.py +0 -0
  43. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/polarslib.py +0 -0
  44. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/libs/sparklib.py +0 -0
  45. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/__init__.py +0 -0
  46. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/equality.py +0 -0
  47. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/exceptions.py +0 -0
  48. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
  49. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/modules.py +0 -0
  50. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/parallel.py +0 -0
  51. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/python_env.py +0 -0
  52. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/pyutils/retry.py +0 -0
  53. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/requests/__init__.py +0 -0
  54. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/requests/msal.py +0 -0
  55. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/requests/session.py +0 -0
  56. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/__init__.py +0 -0
  57. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/__init__.py +0 -0
  58. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
  59. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/cast_options.py +0 -0
  60. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
  61. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/polars_cast.py +0 -0
  62. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
  63. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/registry.py +0 -0
  64. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/spark_cast.py +0 -0
  65. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
  66. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
  67. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/libs.py +0 -0
  68. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/python_arrow.py +0 -0
  69. {ygg-0.1.45 → ygg-0.1.46}/src/yggdrasil/types/python_defaults.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ygg
3
- Version: 0.1.45
3
+ Version: 0.1.46
4
4
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
5
5
  Author: Yggdrasil contributors
6
6
  License: Apache License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ygg"
7
- version = "0.1.45"
7
+ version = "0.1.46"
8
8
  description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  license = { file = "LICENSE" }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ygg
3
- Version: 0.1.45
3
+ Version: 0.1.46
4
4
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
5
5
  Author: Yggdrasil contributors
6
6
  License: Apache License
@@ -12,6 +12,7 @@ import re
12
12
  import sys
13
13
  import threading
14
14
  import zipfile
15
+ from threading import Thread
15
16
  from types import ModuleType
16
17
  from typing import TYPE_CHECKING, Optional, Any, Callable, List, Dict, Union, Iterable, Tuple
17
18
 
@@ -114,7 +115,11 @@ class ExecutionContext:
114
115
 
115
116
  def __del__(self):
116
117
  """Best-effort cleanup for the remote execution context."""
117
- self.close()
118
+ if self.context_id:
119
+ try:
120
+ Thread(target=self.close).start()
121
+ except BaseException:
122
+ pass
118
123
 
119
124
  @property
120
125
  def remote_metadata(self) -> RemoteMetadata:
@@ -380,7 +385,11 @@ print(json.dumps(meta))"""
380
385
  )
381
386
 
382
387
  try:
383
- result = serialized.parse_command_result(raw_result, result_tag=result_tag)
388
+ result = serialized.parse_command_result(
389
+ raw_result,
390
+ result_tag=result_tag,
391
+ workspace=self.cluster.workspace
392
+ )
384
393
  except ModuleNotFoundError as remote_module_error:
385
394
  _MOD_NOT_FOUND_RE = re.compile(r"No module named ['\"]([^'\"]+)['\"]")
386
395
  module_name = _MOD_NOT_FOUND_RE.search(str(remote_module_error))
@@ -634,16 +643,4 @@ with zipfile.ZipFile(buf, "r") as zf:
634
643
  else:
635
644
  output = ""
636
645
 
637
- # result_tag slicing
638
- if result_tag:
639
- start = output.find(result_tag)
640
- if start != -1:
641
- content_start = start + len(result_tag)
642
- end = output.find(result_tag, content_start)
643
- if end != -1:
644
- before = output[:start].strip()
645
- if before and print_stdout:
646
- print(before)
647
- return output[content_start:end]
648
-
649
646
  return output
@@ -220,7 +220,6 @@ class Workspace:
220
220
  instance = self.clone_instance() if clone else self
221
221
 
222
222
  require_databricks_sdk()
223
- logger.debug("Connecting %s", self)
224
223
 
225
224
  # Build Config from config_dict if available, else from fields.
226
225
  kwargs = {
@@ -291,8 +290,6 @@ class Workspace:
291
290
  if v is not None:
292
291
  setattr(instance, key, v)
293
292
 
294
- logger.info("Connected %s", instance)
295
-
296
293
  return instance
297
294
 
298
295
  # ------------------------------------------------------------------ #
@@ -1,4 +1,23 @@
1
- """Callable serialization helpers for cross-process execution."""
1
+ """Callable serialization helpers for cross-process execution.
2
+
3
+ Design goals:
4
+ - Prefer import-by-reference when possible (module + qualname), fallback to dill.
5
+ - Optional environment payload: selected globals and/or closure values.
6
+ - Cross-process bridge: generate a self-contained Python command string that:
7
+ 1) materializes the callable
8
+ 2) decodes args/kwargs payload
9
+ 3) executes
10
+ 4) emits a single tagged base64 line with a compressed result blob
11
+
12
+ Compression/framing:
13
+ - CS2 framing only (no CS1 logic).
14
+ - Frame header: MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data
15
+ - Codecs:
16
+ 0 raw (rarely used; mostly means "no frame")
17
+ 1 zlib
18
+ 2 lzma
19
+ 3 zstd (optional dependency)
20
+ """
2
21
 
3
22
  from __future__ import annotations
4
23
 
@@ -7,30 +26,33 @@ import binascii
7
26
  import dis
8
27
  import importlib
9
28
  import inspect
29
+ import io
10
30
  import lzma
11
31
  import os
32
+ import secrets
12
33
  import struct
13
34
  import sys
14
35
  import zlib
15
36
  from dataclasses import dataclass
16
37
  from pathlib import Path
17
- from typing import Any, Callable, Dict, Optional, Set, Tuple, TypeVar, Union, Iterable
38
+ from typing import Any, Callable, Dict, Iterable, Optional, Set, Tuple, TypeVar, Union, TYPE_CHECKING
18
39
 
19
40
  import dill
20
41
 
42
+ if TYPE_CHECKING:
43
+ from ..databricks.workspaces import Workspace
44
+
21
45
  __all__ = ["CallableSerde"]
22
46
 
23
47
  T = TypeVar("T", bound="CallableSerde")
24
48
 
25
- # ---------- internal helpers ----------
26
-
27
- _MAGIC_V1 = b"CS1" # legacy framing v1: zlib only (FLAG_COMPRESSED)
28
- _MAGIC_V2 = b"CS2" # new framing v2: codec-aware
49
+ # ---------------------------
50
+ # Framing / compression (CS2)
51
+ # ---------------------------
29
52
 
30
- _FLAG_COMPRESSED = 1 # legacy CS1 meaning
53
+ _MAGIC = b"CS2"
31
54
 
32
- # CS2 codecs (u8)
33
- _CODEC_RAW = 0
55
+ _CODEC_RAW = 0
34
56
  _CODEC_ZLIB = 1
35
57
  _CODEC_LZMA = 2
36
58
  _CODEC_ZSTD = 3
@@ -44,6 +66,102 @@ def _try_import_zstd():
44
66
  return None
45
67
 
46
68
 
69
+ def _pick_zlib_level(n: int, limit: int) -> int:
70
+ """Ramp compression level 1..9 based on how far we exceed the byte_limit."""
71
+ ratio = n / max(1, limit)
72
+ x = min(1.0, max(0.0, (ratio - 1.0) / 3.0))
73
+ return max(1, min(9, int(round(1 + 8 * x))))
74
+
75
+
76
+ def _frame(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
77
+ return _MAGIC + struct.pack(">BIB", int(codec) & 0xFF, int(orig_len), int(param) & 0xFF) + payload
78
+
79
+
80
+ def _encode_with_candidates(raw: bytes, *, byte_limit: int, allow_zstd: bool) -> bytes:
81
+ """Choose the smallest among available codecs; fall back to raw if not beneficial."""
82
+ if len(raw) <= byte_limit:
83
+ return raw
84
+
85
+ candidates: list[bytes] = []
86
+
87
+ if allow_zstd:
88
+ zstd = _try_import_zstd()
89
+ if zstd is not None:
90
+ for lvl in (6, 10, 15):
91
+ try:
92
+ c = zstd.ZstdCompressor(level=lvl).compress(raw)
93
+ candidates.append(_frame(_CODEC_ZSTD, len(raw), lvl, c))
94
+ except Exception:
95
+ pass
96
+
97
+ for preset in (6, 9):
98
+ try:
99
+ c = lzma.compress(raw, preset=preset)
100
+ candidates.append(_frame(_CODEC_LZMA, len(raw), preset, c))
101
+ except Exception:
102
+ pass
103
+
104
+ lvl = _pick_zlib_level(len(raw), byte_limit)
105
+ try:
106
+ c = zlib.compress(raw, lvl)
107
+ candidates.append(_frame(_CODEC_ZLIB, len(raw), lvl, c))
108
+ except Exception:
109
+ pass
110
+
111
+ if not candidates:
112
+ return raw
113
+
114
+ best = min(candidates, key=len)
115
+ return best if len(best) < len(raw) else raw
116
+
117
+
118
+ def _encode_result_blob(raw: bytes, byte_limit: int) -> bytes:
119
+ """Result payload: zstd (if available) -> lzma -> zlib."""
120
+ return _encode_with_candidates(raw, byte_limit=byte_limit, allow_zstd=True)
121
+
122
+
123
+ def _encode_wire_blob_stdlib(raw: bytes, byte_limit: int) -> bytes:
124
+ """Wire payload (args/kwargs): stdlib-only (lzma -> zlib)."""
125
+ return _encode_with_candidates(raw, byte_limit=byte_limit, allow_zstd=False)
126
+
127
+
128
+ def _decode_result_blob(blob: bytes) -> bytes:
129
+ """Decode raw or CS2 framed data (no CS1 support)."""
130
+ if not isinstance(blob, (bytes, bytearray)) or len(blob) < 3:
131
+ return blob # type: ignore[return-value]
132
+
133
+ if not blob.startswith(_MAGIC):
134
+ return blob
135
+
136
+ if len(blob) < 3 + 6:
137
+ raise ValueError("CS2 framed blob too short / truncated.")
138
+
139
+ codec, orig_len, _param = struct.unpack(">BIB", blob[3 : 3 + 6])
140
+ data = blob[3 + 6 :]
141
+
142
+ if codec == _CODEC_RAW:
143
+ raw = data
144
+ elif codec == _CODEC_ZLIB:
145
+ raw = zlib.decompress(data)
146
+ elif codec == _CODEC_LZMA:
147
+ raw = lzma.decompress(data)
148
+ elif codec == _CODEC_ZSTD:
149
+ zstd = _try_import_zstd()
150
+ if zstd is None:
151
+ raise RuntimeError("CS2 uses zstd but 'zstandard' is not installed.")
152
+ raw = zstd.ZstdDecompressor().decompress(data, max_output_size=int(orig_len) if orig_len else 0)
153
+ else:
154
+ raise ValueError(f"Unknown CS2 codec: {codec}")
155
+
156
+ if orig_len and len(raw) != orig_len:
157
+ raise ValueError(f"Decoded length mismatch: got {len(raw)}, expected {orig_len}")
158
+ return raw
159
+
160
+
161
+ # ---------------------------
162
+ # Callable reference helpers
163
+ # ---------------------------
164
+
47
165
  def _resolve_attr_chain(mod: Any, qualname: str) -> Any:
48
166
  obj = mod
49
167
  for part in qualname.split("."):
@@ -109,147 +227,9 @@ def _is_importable_reference(fn: Callable[..., Any]) -> bool:
109
227
  return False
110
228
 
111
229
 
112
- def _pick_zlib_level(n: int, limit: int) -> int:
113
- ratio = n / max(1, limit)
114
- x = min(1.0, max(0.0, (ratio - 1.0) / 3.0))
115
- return max(1, min(9, int(round(1 + 8 * x))))
116
-
117
-
118
- def _frame_v2(codec: int, orig_len: int, param: int, payload: bytes) -> bytes:
119
- # Frame: MAGIC(3) + codec(u8) + orig_len(u32) + param(u8) + data
120
- return _MAGIC_V2 + struct.pack(">BIB", int(codec) & 0xFF, int(orig_len), int(param) & 0xFF) + payload
121
-
122
-
123
- def _encode_result_blob(raw: bytes, byte_limit: int) -> bytes:
124
- """
125
- Result payload (remote -> host):
126
- - If small: return raw dill bytes (no framing)
127
- - Else: try strongest available codecs and pick smallest:
128
- zstd (if installed) -> lzma -> zlib
129
- - Frame as CS2(codec, orig_len, param) + payload
130
- Back-compat: decoder also supports legacy CS1 frames.
131
- """
132
- if len(raw) <= byte_limit:
133
- return raw
134
-
135
- candidates: list[bytes] = []
136
-
137
- # zstd (best tradeoff, optional dependency)
138
- zstd = _try_import_zstd()
139
- if zstd is not None:
140
- for lvl in (6, 10, 15):
141
- try:
142
- c = zstd.ZstdCompressor(level=lvl).compress(raw)
143
- candidates.append(_frame_v2(_CODEC_ZSTD, len(raw), lvl, c))
144
- except Exception:
145
- pass
146
-
147
- # lzma (stdlib, strong, slower)
148
- for preset in (6, 9):
149
- try:
150
- c = lzma.compress(raw, preset=preset)
151
- candidates.append(_frame_v2(_CODEC_LZMA, len(raw), preset, c))
152
- except Exception:
153
- pass
154
-
155
- # zlib (stdlib, weaker)
156
- lvl = _pick_zlib_level(len(raw), byte_limit)
157
- try:
158
- c = zlib.compress(raw, lvl)
159
- candidates.append(_frame_v2(_CODEC_ZLIB, len(raw), lvl, c))
160
- except Exception:
161
- pass
162
-
163
- best = min(candidates, key=len, default=b"")
164
- if not best or len(best) >= len(raw):
165
- return raw
166
- return best
167
-
168
-
169
- def _encode_wire_blob_stdlib(raw: bytes, byte_limit: int) -> bytes:
170
- """
171
- Input payload (host -> remote):
172
- MUST be decodable on a vanilla Python. So: lzma (if available) -> zlib.
173
- Same CS2 framing.
174
- """
175
- if len(raw) <= byte_limit:
176
- return raw
177
-
178
- candidates: list[bytes] = []
179
-
180
- # lzma may be absent in some minimal builds; guard it
181
- for preset in (6, 9):
182
- try:
183
- c = lzma.compress(raw, preset=preset)
184
- candidates.append(_frame_v2(_CODEC_LZMA, len(raw), preset, c))
185
- except Exception:
186
- pass
187
-
188
- lvl = _pick_zlib_level(len(raw), byte_limit)
189
- try:
190
- c = zlib.compress(raw, lvl)
191
- candidates.append(_frame_v2(_CODEC_ZLIB, len(raw), lvl, c))
192
- except Exception:
193
- pass
194
-
195
- best = min(candidates, key=len, default=b"")
196
- if not best or len(best) >= len(raw):
197
- return raw
198
- return best
199
-
200
-
201
- def _decode_result_blob(blob: bytes) -> bytes:
202
- """
203
- Decode:
204
- - raw (no MAGIC) => blob
205
- - CS1 legacy => zlib if flagged
206
- - CS2 => decode by codec
207
- """
208
- # raw
209
- if not isinstance(blob, (bytes, bytearray)) or len(blob) < 3:
210
- return blob # type: ignore[return-value]
211
-
212
- # ---- legacy CS1 ----
213
- if blob.startswith(_MAGIC_V1):
214
- if len(blob) < 3 + 1 + 4 + 1:
215
- raise ValueError("Framed result too short / corrupted (CS1).")
216
- flags, orig_len, _level = struct.unpack(">BIB", blob[3 : 3 + 6])
217
- data = blob[3 + 6 :]
218
- if flags & _FLAG_COMPRESSED:
219
- raw = zlib.decompress(data)
220
- if orig_len and len(raw) != orig_len:
221
- raise ValueError(f"Decompressed length mismatch: got {len(raw)}, expected {orig_len}")
222
- return raw
223
- return data
224
-
225
- # ---- new CS2 ----
226
- if blob.startswith(_MAGIC_V2):
227
- if len(blob) < 3 + 1 + 4 + 1:
228
- raise ValueError("Framed result too short / corrupted (CS2).")
229
- codec, orig_len, param = struct.unpack(">BIB", blob[3 : 3 + 6])
230
- data = blob[3 + 6 :]
231
-
232
- if codec == _CODEC_RAW:
233
- raw = data
234
- elif codec == _CODEC_ZLIB:
235
- raw = zlib.decompress(data)
236
- elif codec == _CODEC_LZMA:
237
- raw = lzma.decompress(data)
238
- elif codec == _CODEC_ZSTD:
239
- zstd = _try_import_zstd()
240
- if zstd is None:
241
- raise RuntimeError("CS2 payload uses zstd, but 'zstandard' is not installed.")
242
- raw = zstd.ZstdDecompressor().decompress(data, max_output_size=int(orig_len) if orig_len else 0)
243
- else:
244
- raise ValueError(f"Unknown CS2 codec: {codec}")
245
-
246
- if orig_len and len(raw) != orig_len:
247
- raise ValueError(f"Decoded length mismatch: got {len(raw)}, expected {orig_len}")
248
- return raw
249
-
250
- # not framed
251
- return blob
252
-
230
+ # ---------------------------
231
+ # Environment snapshot
232
+ # ---------------------------
253
233
 
254
234
  def _dump_env(
255
235
  fn: Callable[..., Any],
@@ -304,10 +284,24 @@ def _dump_env(
304
284
  return env, meta
305
285
 
306
286
 
307
- # ---------- main class ----------
287
+ # ----------
288
+ # Main class
289
+ # ----------
308
290
 
309
291
  @dataclass
310
292
  class CallableSerde:
293
+ """
294
+ Core field: `fn`
295
+
296
+ kind:
297
+ - "auto": resolve import if possible else dill
298
+ - "import": module + qualname
299
+ - "dill": dill_b64
300
+
301
+ Optional env payload:
302
+ - env_b64: dill(base64) of {"globals": {...}, "closure": {...}}
303
+ """
304
+
311
305
  fn: Optional[Callable[..., Any]] = None
312
306
 
313
307
  _kind: str = "auto" # "auto" | "import" | "dill"
@@ -319,12 +313,15 @@ class CallableSerde:
319
313
  _env_b64: Optional[str] = None
320
314
  _env_meta: Optional[Dict[str, Any]] = None
321
315
 
316
+ # ----- construction -----
317
+
322
318
  @classmethod
323
319
  def from_callable(cls: type[T], x: Union[Callable[..., Any], T]) -> T:
324
320
  if isinstance(x, cls):
325
321
  return x
326
- obj = cls(fn=x) # type: ignore[return-value]
327
- return obj
322
+ return cls(fn=x) # type: ignore[return-value]
323
+
324
+ # ----- properties -----
328
325
 
329
326
  @property
330
327
  def module(self) -> Optional[str]:
@@ -372,11 +369,13 @@ class CallableSerde:
372
369
  return bool(self.module and self.qualname and "<locals>" not in (self.qualname or ""))
373
370
  return _is_importable_reference(self.fn)
374
371
 
372
+ # ----- serde API -----
373
+
375
374
  def dump(
376
375
  self,
377
376
  *,
378
- prefer: str = "import",
379
- dump_env: str = "none",
377
+ prefer: str = "import", # "import" | "dill"
378
+ dump_env: str = "none", # "none" | "globals" | "closure" | "both"
380
379
  filter_used_globals: bool = True,
381
380
  env_keys: Optional[Iterable[str]] = None,
382
381
  env_variables: Optional[Dict[str, str]] = None,
@@ -418,6 +417,7 @@ class CallableSerde:
418
417
  raise ValueError("dump_env requested but fn is not present.")
419
418
  include_globals = dump_env in ("globals", "both")
420
419
  include_closure = dump_env in ("closure", "both")
420
+
421
421
  env, meta = _dump_env(
422
422
  self.fn,
423
423
  include_globals=include_globals,
@@ -487,7 +487,9 @@ class CallableSerde:
487
487
  fn = self.materialize()
488
488
  return fn(*args, **kwargs)
489
489
 
490
- # ----- command execution bridge -----
490
+ # -------------------------
491
+ # Command execution bridge
492
+ # -------------------------
491
493
 
492
494
  def to_command(
493
495
  self,
@@ -496,16 +498,19 @@ class CallableSerde:
496
498
  *,
497
499
  result_tag: str = "__CALLABLE_SERDE_RESULT__",
498
500
  prefer: str = "dill",
499
- byte_limit: int = 4 * 1024,
500
- dump_env: str = "none", # "none" | "globals" | "closure" | "both"
501
+ byte_limit: int = 64 * 1024,
502
+ dump_env: str = "none", # "none" | "globals" | "closure" | "both"
501
503
  filter_used_globals: bool = True,
502
504
  env_keys: Optional[Iterable[str]] = None,
503
505
  env_variables: Optional[Dict[str, str]] = None,
506
+ file_dump_limit: int = 512 * 1024,
507
+ transaction_id: Optional[str] = None
504
508
  ) -> str:
505
509
  """
506
510
  Returns Python code string to execute in another interpreter.
507
- Prints one line: "{result_tag}:{base64(blob)}"
508
- where blob is raw dill bytes or framed (CS1/CS2).
511
+ Emits exactly one line to stdout:
512
+ "{result_tag}:{base64(blob)}\\n"
513
+ where blob is raw dill bytes or CS2 framed.
509
514
  """
510
515
  import json
511
516
 
@@ -521,29 +526,30 @@ class CallableSerde:
521
526
  )
522
527
  serde_json = json.dumps(serde_dict, ensure_ascii=False)
523
528
 
524
- # Encode (args, kwargs) with stdlib-only strategy so remote can always decode.
529
+ # args/kwargs payload: stdlib-only compression (lzma/zlib)
525
530
  call_raw = dill.dumps((args, kwargs), recurse=True)
526
-
527
- # Use your local encoder for wire payload (stdlib only)
528
531
  call_blob = _encode_wire_blob_stdlib(call_raw, int(byte_limit))
529
532
  call_payload_b64 = base64.b64encode(call_blob).decode("ascii")
533
+ transaction_id = transaction_id or secrets.token_urlsafe(16)
530
534
 
531
535
  template = r"""
532
536
  import base64, json, os, sys
533
537
  import dill
538
+ import pandas
534
539
 
535
- # thin import from your real module
540
+ from yggdrasil.databricks import Workspace
536
541
  from yggdrasil.pyutils.callable_serde import (
537
542
  CallableSerde,
538
- _decode_result_blob, # decodes raw/CS1/CS2
539
- _encode_result_blob, # encodes result with strongest available
543
+ _decode_result_blob,
544
+ _encode_result_blob,
540
545
  )
541
546
 
542
547
  RESULT_TAG = __RESULT_TAG__
543
548
  BYTE_LIMIT = __BYTE_LIMIT__
549
+ FILE_DUMP_LIMIT = __FILE_DUMP_LIMIT__
550
+ TRANSACTION_ID = __TRANSACTION_ID__
544
551
 
545
552
  def _needed_globals(fn) -> set[str]:
546
- # keep this tiny + local; doesn’t need full module internals
547
553
  import dis
548
554
  names = set()
549
555
  try:
@@ -566,47 +572,63 @@ def _apply_env(fn, env: dict, filter_used: bool):
566
572
  return
567
573
 
568
574
  env_g = env.get("globals") or {}
569
- if env_g:
570
- if filter_used:
571
- needed = _needed_globals(fn)
572
- for name in needed:
573
- if name in env_g:
574
- g.setdefault(name, env_g[name])
575
- else:
576
- for name, val in env_g.items():
577
- g.setdefault(name, val)
575
+ if not env_g:
576
+ return
577
+
578
+ if filter_used:
579
+ needed = _needed_globals(fn)
580
+ for name in needed:
581
+ if name in env_g:
582
+ g.setdefault(name, env_g[name])
583
+ else:
584
+ for name, val in env_g.items():
585
+ g.setdefault(name, val)
578
586
 
579
587
  serde = json.loads(__SERDE_JSON__)
580
588
 
581
- # materialize callable
582
589
  cs = CallableSerde.load(serde, add_pkg_root_to_syspath=True)
583
590
  fn = cs.materialize(add_pkg_root_to_syspath=True)
584
591
 
585
- # apply os env vars (if present)
586
592
  osenv = serde.get("osenv")
587
593
  if osenv:
588
594
  for k, v in osenv.items():
589
595
  os.environ[k] = v
590
596
 
591
- # apply dill'd env payload (if present)
592
597
  env_b64 = serde.get("env_b64")
593
598
  if env_b64:
594
599
  env = dill.loads(base64.b64decode(env_b64))
595
600
  meta = serde.get("env_meta") or {}
596
601
  _apply_env(fn, env, bool(meta.get("filter_used_globals", True)))
597
602
 
598
- # decode call payload
599
603
  call_blob = base64.b64decode(__CALL_PAYLOAD_B64__)
600
604
  call_raw = _decode_result_blob(call_blob)
601
605
  args, kwargs = dill.loads(call_raw)
602
606
 
603
- # execute
604
607
  res = fn(*args, **kwargs)
605
608
 
606
- # encode + print result
607
- raw = dill.dumps(res)
608
- blob = _encode_result_blob(raw, BYTE_LIMIT)
609
- print(f"{RESULT_TAG}:{base64.b64encode(blob).decode('ascii')}")
609
+ if isinstance(res, pandas.DataFrame):
610
+ dump_path = Workspace().shared_cache_path("/cmd/" + TRANSACTION_ID + ".parquet")
611
+
612
+ with dump_path.open(mode="wb") as f:
613
+ res.to_parquet(f)
614
+
615
+ blob = "DBXPATH:" + str(dump_path)
616
+ else:
617
+ raw = dill.dumps(res)
618
+ blob = _encode_result_blob(raw, BYTE_LIMIT)
619
+
620
+ if len(blob) > FILE_DUMP_LIMIT:
621
+ dump_path = Workspace().shared_cache_path("/cmd/" + TRANSACTION_ID)
622
+
623
+ with dump_path.open(mode="wb") as f:
624
+ f.write_all_bytes(data=blob)
625
+
626
+ blob = "DBXPATH:" + str(dump_path)
627
+ else:
628
+ blob = base64.b64encode(blob).decode('ascii')
629
+
630
+ sys.stdout.write(f"{RESULT_TAG}:{len(blob)}:{blob}\n")
631
+ sys.stdout.flush()
610
632
  """
611
633
 
612
634
  return (
@@ -615,27 +637,91 @@ print(f"{RESULT_TAG}:{base64.b64encode(blob).decode('ascii')}")
615
637
  .replace("__BYTE_LIMIT__", str(int(byte_limit)))
616
638
  .replace("__SERDE_JSON__", repr(serde_json))
617
639
  .replace("__CALL_PAYLOAD_B64__", repr(call_payload_b64))
640
+ .replace("__FILE_DUMP_LIMIT__", str(int(file_dump_limit)))
641
+ .replace("__TRANSACTION_ID__", repr(str(transaction_id)))
618
642
  )
619
643
 
620
644
  @staticmethod
621
- def parse_command_result(output: str, *, result_tag: str = "__CALLABLE_SERDE_RESULT__") -> Any:
645
+ def parse_command_result(
646
+ output: str,
647
+ *,
648
+ result_tag: str = "__CALLABLE_SERDE_RESULT__",
649
+ workspace: Optional["Workspace"] = None
650
+ ) -> Any:
651
+ """
652
+ Expect last tagged line:
653
+ "{result_tag}:{blob_nbytes}:{b64}"
654
+
655
+ We use blob_nbytes to compute expected base64 char length and detect truncation
656
+ before decoding/decompressing.
657
+ """
622
658
  prefix = f"{result_tag}:"
623
659
  if prefix not in output:
624
660
  raise ValueError(f"Result tag not found in output: {result_tag}")
625
661
 
626
- # last tagged line, first line after it
627
- _, b64 = output.rsplit(prefix, 1)
662
+ # Grab everything after the LAST occurrence of the tag
663
+ _, tail = output.rsplit(prefix, 1)
628
664
 
629
- if not b64:
630
- raise ValueError(f"Found result tag {result_tag} but payload is empty")
665
+ # Parse "{nbytes}:{b64}"
666
+ try:
667
+ nbytes_str, string_result = tail.split(":", 1)
668
+ except ValueError as e:
669
+ raise ValueError(
670
+ f"Malformed result line after tag {result_tag}. "
671
+ "Expected '{tag}:{nbytes}:{b64}'."
672
+ ) from e
631
673
 
632
674
  try:
633
- blob = base64.b64decode(b64.encode("ascii"))
634
- except (UnicodeEncodeError, binascii.Error) as e:
635
- raise ValueError("Invalid base64 payload after result tag") from e
675
+ content_length = int(nbytes_str)
676
+ except ValueError as e:
677
+ raise ValueError(f"Malformed byte count '{nbytes_str}' after tag {result_tag}") from e
678
+
679
+ if content_length < 0:
680
+ raise ValueError(f"Negative byte count {content_length} after tag {result_tag}")
681
+
682
+ string_result = string_result[:content_length]
683
+
684
+ if len(string_result) != content_length:
685
+ raise ValueError(
686
+ "Got truncated result content from command, got %s bytes and expected %s bytes" % (
687
+ len(string_result),
688
+ content_length
689
+ )
690
+ )
691
+
692
+ if string_result.startswith("DBXPATH:"):
693
+ from ..databricks.workspaces import Workspace
694
+
695
+ workspace = Workspace() if workspace is None else workspace
696
+ path = workspace.dbfs_path(
697
+ string_result.replace("DBXPATH:", "")
698
+ )
699
+
700
+ if path.name.endswith(".parquet"):
701
+ import pandas
702
+
703
+ with path.open(mode="rb") as f:
704
+ buf = io.BytesIO(f.read_all_bytes())
705
+
706
+ path.rmfile()
707
+ buf.seek(0)
708
+ return pandas.read_parquet(buf)
709
+
710
+ with path.open(mode="rb") as f:
711
+ blob = f.read_all_bytes()
712
+
713
+ path.rmfile()
714
+ else:
715
+ # Strict base64 decode (rejects junk chars)
716
+ try:
717
+ blob = base64.b64decode(string_result.encode("ascii"), validate=True)
718
+ except (UnicodeEncodeError, binascii.Error) as e:
719
+ raise ValueError("Invalid base64 payload after result tag (corrupted/contaminated).") from e
636
720
 
637
721
  raw = _decode_result_blob(blob)
638
722
  try:
639
- return dill.loads(raw)
723
+ result = dill.loads(raw)
640
724
  except Exception as e:
641
725
  raise ValueError("Failed to dill.loads decoded payload") from e
726
+
727
+ return result
@@ -0,0 +1 @@
1
+ __version__ = "0.1.46"
@@ -1 +0,0 @@
1
- __version__ = "0.1.45"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes