ygg 0.1.31__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +29 -4
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +147 -0
- yggdrasil/databricks/sql/types.py +33 -1
- yggdrasil/databricks/workspaces/__init__.py +2 -1
- yggdrasil/databricks/workspaces/filesytem.py +183 -0
- yggdrasil/databricks/workspaces/io.py +387 -9
- yggdrasil/databricks/workspaces/path.py +297 -2
- yggdrasil/databricks/workspaces/path_kind.py +3 -0
- yggdrasil/databricks/workspaces/workspace.py +202 -5
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +123 -1
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.31.dist-info/RECORD +0 -59
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/compute/execution_context.py

@@ -1,3 +1,5 @@
+"""Remote execution helpers for Databricks command contexts."""
+
 import base64
 import dataclasses as dc
 import datetime as dt
@@ -33,6 +35,7 @@ logger = logging.getLogger(__name__)
 
 @dc.dataclass
 class RemoteMetadata:
+    """Metadata describing the remote cluster execution environment."""
     site_packages_path: Optional[str] = dc.field(default=None)
     os_env: Dict[str, str] = dc.field(default_factory=dict)
     requirements: Optional[str] = dc.field(default=None)
@@ -42,6 +45,7 @@ class RemoteMetadata:
         self,
         current: Optional[Dict] = None
     ):
+        """Return environment variables present locally but missing remotely."""
         if current is None:
             current = os.environ
 
@@ -81,6 +85,7 @@ class ExecutionContext:
 
     # --- Pickle / cloudpickle support (don’t serialize locks or cached remote metadata) ---
     def __getstate__(self):
+        """Serialize context state, excluding locks and remote metadata."""
         state = self.__dict__.copy()
 
         # name-mangled field for _lock in instance dict:
@@ -89,25 +94,30 @@ class ExecutionContext:
         return state
 
     def __setstate__(self, state):
+        """Restore context state, rehydrating locks if needed."""
         state["_lock"] = state.get("_lock", threading.RLock())
 
         self.__dict__.update(state)
 
     def __enter__(self) -> "ExecutionContext":
+        """Enter the context manager, opening a remote execution context."""
         self.cluster.__enter__()
         self._was_connected = self.context_id is not None
         return self.connect()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager and close the remote context if created."""
         if not self._was_connected:
             self.close()
         self.cluster.__exit__(exc_type, exc_val=exc_val, exc_tb=exc_tb)
 
     def __del__(self):
+        """Best-effort cleanup for the remote execution context."""
         self.close()
 
     @property
     def remote_metadata(self) -> RemoteMetadata:
+        """Fetch and cache remote environment metadata for the cluster."""
         # fast path (no lock)
         rm = self._remote_metadata
         if rm is not None:
@@ -134,7 +144,7 @@ os_env = meta["os_env"] = {}
 for k, v in os.environ.items():
     os_env[k] = v
 
-meta["requirements"] = current_env.
+meta["requirements"] = current_env.requirements()
 meta["version_info"] = current_env.version_info
 
 print(json.dumps(meta))"""
@@ -151,17 +161,24 @@ print(json.dumps(meta))"""
 
     # ------------ internal helpers ------------
     def _workspace_client(self):
+        """Return the Databricks SDK client for command execution.
+
+        Returns:
+            The underlying WorkspaceClient instance.
+        """
         return self.cluster.workspace.sdk()
 
     def _create_command(
         self,
         language: "Language",
     ) -> any:
-        """
-
-
-
-
+        """Create a command execution context, retrying if needed.
+
+        Args:
+            language: The Databricks command language to use.
+
+        Returns:
+            The created command execution context response.
         """
         self.cluster.ensure_running()
 
@@ -182,7 +199,14 @@ print(json.dumps(meta))"""
         self,
         language: Optional["Language"] = None
     ) -> "ExecutionContext":
-        """Create a remote command execution context if not already open.
+        """Create a remote command execution context if not already open.
+
+        Args:
+            language: Optional language override for the context.
+
+        Returns:
+            The connected ExecutionContext instance.
+        """
         if self.context_id is not None:
             logger.debug(
                 "Execution context already open for %s",
@@ -209,7 +233,11 @@ print(json.dumps(meta))"""
         return self
 
     def close(self) -> None:
-        """Destroy the remote command execution context if it exists.
+        """Destroy the remote command execution context if it exists.
+
+        Returns:
+            None.
+        """
         if self.context_id is None:
             return
 
@@ -241,6 +269,21 @@ print(json.dumps(meta))"""
         result_tag: Optional[str] = None,
         **options
     ):
+        """Execute a string command or a callable in the remote context.
+
+        Args:
+            obj: Command string or callable to execute.
+            args: Optional positional arguments for callables.
+            kwargs: Optional keyword arguments for callables.
+            env_keys: Environment variable names to forward.
+            env_variables: Environment variables to inject remotely.
+            timeout: Optional timeout for execution.
+            result_tag: Optional result tag for parsing output.
+            **options: Additional execution options.
+
+        Returns:
+            The decoded execution result.
+        """
         if isinstance(obj, str):
             return self.execute_command(
                 command=obj,
@@ -261,6 +304,7 @@ print(json.dumps(meta))"""
         raise ValueError(f"Cannot execute {type(obj)}")
 
     def is_in_databricks_environment(self):
+        """Return True when running on a Databricks runtime."""
         return self.cluster.is_in_databricks_environment()
 
     def execute_callable(
@@ -274,12 +318,26 @@ print(json.dumps(meta))"""
         timeout: Optional[dt.timedelta] = None,
         command: Optional[str] = None,
     ) -> Any:
+        """Execute a Python callable remotely and return the decoded result.
+
+        Args:
+            func: Callable or serialized callable to run remotely.
+            args: Positional arguments for the callable.
+            kwargs: Keyword arguments for the callable.
+            env_keys: Environment variable names to forward.
+            env_variables: Environment variables to inject remotely.
+            print_stdout: Whether to print stdout from the command output.
+            timeout: Optional timeout for execution.
+            command: Optional prebuilt command string override.
+
+        Returns:
+            The decoded return value from the remote execution.
+        """
         if self.is_in_databricks_environment():
             args = args or []
             kwargs = kwargs or {}
             return func(*args, **kwargs)
 
-        """Execute a command in this context and return decoded output."""
         self.connect(language=Language.PYTHON)
 
         logger.debug(
@@ -354,7 +412,17 @@ print(json.dumps(meta))"""
         result_tag: Optional[str] = None,
         print_stdout: Optional[bool] = True,
     ) -> str:
-        """Execute a command in this context and return decoded output.
+        """Execute a command in this context and return decoded output.
+
+        Args:
+            command: The command string to execute.
+            timeout: Optional timeout for execution.
+            result_tag: Optional tag to extract a specific result segment.
+            print_stdout: Whether to print stdout for tagged output.
+
+        Returns:
+            The decoded command output string.
+        """
         self.connect()
 
         client = self._workspace_client()
@@ -402,6 +470,12 @@ print(json.dumps(meta))"""
         - If local_path is a directory:
             remote_path is the *directory root* on remote; the directory
             contents are mirrored under it.
+        Args:
+            local_path: Local file or directory to upload.
+            remote_path: Target path on the remote cluster.
+
+        Returns:
+            None.
         """
         local_path = os.path.abspath(local_path)
         if not os.path.exists(local_path):
@@ -490,6 +564,12 @@ with zipfile.ZipFile(buf, "r") as zf:
         - path to a file (e.g. "./ygg/__init__.py")
         - module name (e.g. "ygg")
         - module object (e.g. import ygg; workspace.upload_local_lib(ygg))
+        Args:
+            libraries: Library path, name, module, or iterable of these.
+            with_dependencies: Whether to include dependencies (unused).
+
+        Returns:
+            The resolved library or list of libraries uploaded.
         """
         if isinstance(libraries, (list, tuple, set)):
             return [
@@ -517,7 +597,16 @@ with zipfile.ZipFile(buf, "r") as zf:
         result_tag: Optional[str],
         print_stdout: Optional[bool] = True
     ) -> str:
-        """Mirror the old Cluster.execute_command result handling.
+        """Mirror the old Cluster.execute_command result handling.
+
+        Args:
+            result: Raw command execution response.
+            result_tag: Optional tag to extract a segment from output.
+            print_stdout: Whether to print stdout when using tags.
+
+        Returns:
+            The decoded output string.
+        """
         if not getattr(result, "results", None):
             raise RuntimeError("Command execution returned no results")
 
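For orientation, here is a minimal usage sketch of the execution-context API visible in the hunks above. The `execute()` dispatch (string → `execute_command`, callable → `execute_callable`) and the context-manager behavior come from the diff; the `Cluster` constructor arguments and the assumption that `ExecutionContext` is constructed directly from a cluster are illustrative guesses, not confirmed API.

```python
# Sketch only: constructor arguments below are assumptions; the execute()
# dispatch and context-manager semantics are taken from the diff above.
import datetime as dt

from yggdrasil.databricks.compute.cluster import Cluster  # assumed class name
from yggdrasil.databricks.compute.execution_context import ExecutionContext

cluster = Cluster(cluster_id="0123-456789-abcdefgh")  # hypothetical cluster id

with ExecutionContext(cluster=cluster) as ctx:  # assumed dataclass constructor
    # Strings are routed through execute_command() and return decoded stdout.
    out = ctx.execute("print('hello from the cluster')")

    # Callables are routed through execute_callable(); when this code is
    # already running on a Databricks runtime, the function is called
    # in-process instead of being shipped to the cluster.
    total = ctx.execute(
        lambda x, y: x + y,
        args=[1, 2],
        timeout=dt.timedelta(minutes=5),
    )
# On exit, close() destroys the remote command context only if this block
# created it (the _was_connected guard in __exit__).
```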
yggdrasil/databricks/compute/remote.py

@@ -1,3 +1,5 @@
+"""Convenience decorator for running functions on Databricks clusters."""
+
 import datetime as dt
 import logging
 from typing import (
@@ -26,6 +28,20 @@ def databricks_remote_compute(
     env_keys: Optional[List[str]] = None,
     **options
 ) -> Callable[[Callable[..., ReturnType]], Callable[..., ReturnType]]:
+    """Return a decorator that executes functions on a remote cluster.
+
+    Args:
+        cluster_id: Optional cluster id to target.
+        cluster_name: Optional cluster name to target.
+        workspace: Workspace instance or host string for lookup.
+        cluster: Pre-configured Cluster instance to reuse.
+        timeout: Optional execution timeout for remote calls.
+        env_keys: Optional environment variable names to forward.
+        **options: Extra options forwarded to the execution decorator.
+
+    Returns:
+        A decorator that runs functions on the resolved Databricks cluster.
+    """
     if isinstance(workspace, str):
         workspace = Workspace(host=workspace)
 
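A hedged sketch of how the decorator above might be applied. The keyword names follow the signature in the hunk; the host URL, cluster name, and the decorated function body are placeholders. Passing a string as `workspace` is grounded in the diff, which wraps it via `Workspace(host=workspace)`.

```python
# Hypothetical usage; host and cluster name are placeholders.
import datetime as dt

from yggdrasil.databricks.compute.remote import databricks_remote_compute

@databricks_remote_compute(
    cluster_name="shared-etl",                        # placeholder name
    workspace="https://adb-123.azuredatabricks.net",  # str -> Workspace(host=...)
    timeout=dt.timedelta(minutes=10),
    env_keys=["DATABRICKS_ENV"],                      # env var names to forward
)
def row_count(table: str) -> int:
    # Executes on the resolved cluster; per execute_callable(), it runs
    # in-process when already on a Databricks runtime.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    return spark.table(table).count()
```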
yggdrasil/databricks/jobs/config.py

@@ -1,3 +1,5 @@
+"""Databricks widget-backed configuration helpers."""
+
 import builtins
 import dataclasses
 import datetime as dt
@@ -21,6 +23,15 @@ logger = logging.getLogger(__name__)
 
 
 def type_is_iterable(tpe: type, origin=None):
+    """Return True when the type annotation represents a list/set-like container.
+
+    Args:
+        tpe: The type annotation to inspect.
+        origin: Optional origin to reuse when recursing.
+
+    Returns:
+        True when the type is list-like, otherwise False.
+    """
     if (
         tpe is list or tpe is set
     ):
@@ -40,7 +51,7 @@ ALL_VALUES_TAG = "**all**"
 
 
 class WidgetType(Enum):
-    """Enum defining supported widget types
+    """Enum defining supported Databricks widget types."""
     TEXT = "text"
     DROPDOWN = "dropdown"
     COMBOBOX = "combobox"
@@ -50,8 +61,15 @@ class WidgetType(Enum):
 
 @dataclass
 class NotebookConfig:
+    """Base class for widget-driven notebook configuration dataclasses."""
+
     @classmethod
     def get_dbutils(cls):
+        """Locate a ``dbutils`` instance from known Databricks injection points.
+
+        Returns:
+            The ``dbutils`` instance if found, otherwise None.
+        """
         # 1) explicit builtin injection (Databricks sometimes does this)
         if hasattr(builtins, "dbutils"):
             return builtins.dbutils
@@ -85,8 +103,7 @@ class NotebookConfig:
 
     @classmethod
     def from_environment(cls):
-        """
-        Build data class from environment variables or databricks dbutils if available.
+        """Build a config instance from Databricks widgets or environment variables.
 
         This method looks for values in the following order:
         1. Databricks widgets (if running in Databricks notebook)
@@ -94,7 +111,7 @@ class NotebookConfig:
         3. Environment variables
 
         Returns:
-            An instance of the dataclass populated with values from the environment
+            An instance of the dataclass populated with values from the environment.
         """
         dbutils = cls.get_dbutils()
         key_values: Dict[str, Any] = {}
@@ -229,6 +246,9 @@ class NotebookConfig:
 
         This method creates appropriate widgets for each field in the dataclass,
         with optional default values and customization options.
+
+        Returns:
+            None. Widgets are created in the notebook environment.
         """
         dbutils = cls.get_dbutils()
         if dbutils is None or not hasattr(dbutils, "widgets"):
@@ -296,6 +316,11 @@ class NotebookConfig:
 
     @classmethod
     def init_job(cls):
+        """Initialize widgets, tweak Spark session defaults, and return config.
+
+        Returns:
+            An instance of the dataclass populated from widgets or environment.
+        """
        cls.init_widgets()
 
        if SparkSession is not None: