ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/compute/execution_context.py

@@ -1,3 +1,5 @@
+"""Remote execution helpers for Databricks command contexts."""
+
 import base64
 import dataclasses as dc
 import datetime as dt
@@ -33,6 +35,7 @@ logger = logging.getLogger(__name__)
 
 @dc.dataclass
 class RemoteMetadata:
+    """Metadata describing the remote cluster execution environment."""
     site_packages_path: Optional[str] = dc.field(default=None)
     os_env: Dict[str, str] = dc.field(default_factory=dict)
     requirements: Optional[str] = dc.field(default=None)
@@ -42,6 +45,7 @@ class RemoteMetadata:
         self,
         current: Optional[Dict] = None
     ):
+        """Return environment variables present locally but missing remotely."""
         if current is None:
             current = os.environ
 
@@ -81,6 +85,7 @@ class ExecutionContext:
 
     # --- Pickle / cloudpickle support (don’t serialize locks or cached remote metadata) ---
     def __getstate__(self):
+        """Serialize context state, excluding locks and remote metadata."""
         state = self.__dict__.copy()
 
         # name-mangled field for _lock in instance dict:
@@ -89,25 +94,30 @@
         return state
 
     def __setstate__(self, state):
+        """Restore context state, rehydrating locks if needed."""
         state["_lock"] = state.get("_lock", threading.RLock())
 
         self.__dict__.update(state)
 
     def __enter__(self) -> "ExecutionContext":
+        """Enter a context manager, opening a remote execution context."""
         self.cluster.__enter__()
         self._was_connected = self.context_id is not None
         return self.connect()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager and close the remote context if created."""
         if not self._was_connected:
             self.close()
         self.cluster.__exit__(exc_type, exc_val=exc_val, exc_tb=exc_tb)
 
     def __del__(self):
+        """Best-effort cleanup for the remote execution context."""
        self.close()
 
     @property
     def remote_metadata(self) -> RemoteMetadata:
+        """Fetch and cache remote environment metadata for the cluster."""
         # fast path (no lock)
         rm = self._remote_metadata
         if rm is not None:
@@ -134,7 +144,7 @@ os_env = meta["os_env"] = {}
 for k, v in os.environ.items():
     os_env[k] = v
 
-meta["requirements"] = current_env.export_requirements_matrix()
+meta["requirements"] = current_env.requirements()
 meta["version_info"] = current_env.version_info
 
 print(json.dumps(meta))"""
@@ -151,17 +161,24 @@ print(json.dumps(meta))"""
 
     # ------------ internal helpers ------------
     def _workspace_client(self):
+        """Return the Databricks SDK client for command execution.
+
+        Returns:
+            The underlying WorkspaceClient instance.
+        """
         return self.cluster.workspace.sdk()
 
     def _create_command(
         self,
         language: "Language",
     ) -> any:
-        """
-        Wrap `client.command_execution.create` in a 10s timeout.
-        On timeout:
-        - ensure the cluster is running
-        - retry once with the same timeout
+        """Create a command execution context, retrying if needed.
+
+        Args:
+            language: The Databricks command language to use.
+
+        Returns:
+            The created command execution context response.
         """
         self.cluster.ensure_running()
 
@@ -182,7 +199,14 @@ print(json.dumps(meta))"""
         self,
         language: Optional["Language"] = None
     ) -> "ExecutionContext":
-        """Create a remote command execution context if not already open."""
+        """Create a remote command execution context if not already open.
+
+        Args:
+            language: Optional language override for the context.
+
+        Returns:
+            The connected ExecutionContext instance.
+        """
         if self.context_id is not None:
             logger.debug(
                 "Execution context already open for %s",
@@ -209,7 +233,11 @@ print(json.dumps(meta))"""
         return self
 
     def close(self) -> None:
-        """Destroy the remote command execution context if it exists."""
+        """Destroy the remote command execution context if it exists.
+
+        Returns:
+            None.
+        """
         if self.context_id is None:
             return
 
@@ -241,6 +269,21 @@ print(json.dumps(meta))"""
         result_tag: Optional[str] = None,
         **options
     ):
+        """Execute a string command or a callable in the remote context.
+
+        Args:
+            obj: Command string or callable to execute.
+            args: Optional positional arguments for callables.
+            kwargs: Optional keyword arguments for callables.
+            env_keys: Environment variable names to forward.
+            env_variables: Environment variables to inject remotely.
+            timeout: Optional timeout for execution.
+            result_tag: Optional result tag for parsing output.
+            **options: Additional execution options.
+
+        Returns:
+            The decoded execution result.
+        """
         if isinstance(obj, str):
             return self.execute_command(
                 command=obj,
@@ -261,6 +304,7 @@ print(json.dumps(meta))"""
         raise ValueError(f"Cannot execute {type(obj)}")
 
     def is_in_databricks_environment(self):
+        """Return True when running on a Databricks runtime."""
         return self.cluster.is_in_databricks_environment()
 
     def execute_callable(
@@ -274,12 +318,26 @@ print(json.dumps(meta))"""
         timeout: Optional[dt.timedelta] = None,
         command: Optional[str] = None,
     ) -> Any:
+        """Execute a Python callable remotely and return the decoded result.
+
+        Args:
+            func: Callable or serialized callable to run remotely.
+            args: Positional arguments for the callable.
+            kwargs: Keyword arguments for the callable.
+            env_keys: Environment variable names to forward.
+            env_variables: Environment variables to inject remotely.
+            print_stdout: Whether to print stdout from the command output.
+            timeout: Optional timeout for execution.
+            command: Optional prebuilt command string override.
+
+        Returns:
+            The decoded return value from the remote execution.
+        """
         if self.is_in_databricks_environment():
             args = args or []
             kwargs = kwargs or {}
             return func(*args, **kwargs)
 
-        """Execute a command in this context and return decoded output."""
         self.connect(language=Language.PYTHON)
 
         logger.debug(
@@ -354,7 +412,17 @@ print(json.dumps(meta))"""
         result_tag: Optional[str] = None,
         print_stdout: Optional[bool] = True,
     ) -> str:
-        """Execute a command in this context and return decoded output."""
+        """Execute a command in this context and return decoded output.
+
+        Args:
+            command: The command string to execute.
+            timeout: Optional timeout for execution.
+            result_tag: Optional tag to extract a specific result segment.
+            print_stdout: Whether to print stdout for tagged output.
+
+        Returns:
+            The decoded command output string.
+        """
         self.connect()
 
         client = self._workspace_client()
@@ -402,6 +470,12 @@ print(json.dumps(meta))"""
         - If local_path is a directory:
           remote_path is the *directory root* on remote; the directory
           contents are mirrored under it.
+        Args:
+            local_path: Local file or directory to upload.
+            remote_path: Target path on the remote cluster.
+
+        Returns:
+            None.
         """
         local_path = os.path.abspath(local_path)
         if not os.path.exists(local_path):
@@ -490,6 +564,12 @@ with zipfile.ZipFile(buf, "r") as zf:
         - path to a file (e.g. "./ygg/__init__.py")
         - module name (e.g. "ygg")
         - module object (e.g. import ygg; workspace.upload_local_lib(ygg))
+        Args:
+            libraries: Library path, name, module, or iterable of these.
+            with_dependencies: Whether to include dependencies (unused).
+
+        Returns:
+            The resolved library or list of libraries uploaded.
         """
         if isinstance(libraries, (list, tuple, set)):
             return [
@@ -517,7 +597,16 @@ with zipfile.ZipFile(buf, "r") as zf:
         result_tag: Optional[str],
         print_stdout: Optional[bool] = True
     ) -> str:
-        """Mirror the old Cluster.execute_command result handling."""
+        """Mirror the old Cluster.execute_command result handling.
+
+        Args:
+            result: Raw command execution response.
+            result_tag: Optional tag to extract a segment from output.
+            print_stdout: Whether to print stdout when using tags.
+
+        Returns:
+            The decoded output string.
+        """
         if not getattr(result, "results", None):
             raise RuntimeError("Command execution returned no results")
 
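The hunks above document a context-manager workflow for ExecutionContext: entering opens (or reuses) a remote command context, execute dispatches either a command string or a callable, and exiting tears the context down only if this instance opened it. A minimal usage sketch follows; the constructor wiring (ExecutionContext(cluster=...)), the import paths, and the cluster id are assumptions, since none of them appear in this diff.

import datetime as dt

# Module paths inferred from the files-changed list above; hypothetical wiring.
from yggdrasil.databricks.compute.cluster import Cluster
from yggdrasil.databricks.compute.execution_context import ExecutionContext

def add(a: int, b: int) -> int:
    return a + b

cluster = Cluster(cluster_id="0123-456789-abcdefgh")  # hypothetical cluster id

with ExecutionContext(cluster=cluster) as ctx:
    # Callables are serialized, run on the cluster, and the result decoded
    # locally; on a Databricks runtime the function is called in-process.
    result = ctx.execute(add, args=[1, 2], timeout=dt.timedelta(minutes=5))
    print(result)  # 3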
yggdrasil/databricks/compute/remote.py

@@ -1,3 +1,5 @@
+"""Convenience decorator for running functions on Databricks clusters."""
+
 import datetime as dt
 import logging
 from typing import (
@@ -26,6 +28,20 @@ def databricks_remote_compute(
     env_keys: Optional[List[str]] = None,
     **options
 ) -> Callable[[Callable[..., ReturnType]], Callable[..., ReturnType]]:
+    """Return a decorator that executes functions on a remote cluster.
+
+    Args:
+        cluster_id: Optional cluster id to target.
+        cluster_name: Optional cluster name to target.
+        workspace: Workspace instance or host string for lookup.
+        cluster: Pre-configured Cluster instance to reuse.
+        timeout: Optional execution timeout for remote calls.
+        env_keys: Optional environment variable names to forward.
+        **options: Extra options forwarded to the execution decorator.
+
+    Returns:
+        A decorator that runs functions on the resolved Databricks cluster.
+    """
     if isinstance(workspace, str):
         workspace = Workspace(host=workspace)
 
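Based on the signature documented above, and the hunk confirming that a string workspace is coerced via Workspace(host=...), a decorated function might look like the sketch below. The cluster name, host URL, environment key, and table name are placeholders, not values from this diff.

import datetime as dt

from yggdrasil.databricks.compute.remote import databricks_remote_compute

@databricks_remote_compute(
    cluster_name="shared-etl",                      # hypothetical cluster name
    workspace="https://adb-123.4.azuredatabricks.net",  # hypothetical host
    timeout=dt.timedelta(minutes=10),
    env_keys=["MY_FEATURE_FLAG"],                   # local env vars to forward
)
def row_count(table: str) -> int:
    # The body runs on the cluster, so imports resolve remotely.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    return spark.table(table).count()

n = row_count("main.sales.orders")  # executes remotely, result returns locally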
yggdrasil/databricks/jobs/__init__.py

@@ -0,0 +1,5 @@
+"""Helpers for running Databricks jobs and notebooks."""
+
+from .config import NotebookConfig, WidgetType
+
+__all__ = ["NotebookConfig", "WidgetType"]
yggdrasil/databricks/jobs/config.py

@@ -1,12 +1,14 @@
+"""Databricks widget-backed configuration helpers."""
+
 import builtins
 import dataclasses
 import datetime as dt
 import inspect
+import logging
 from dataclasses import dataclass, fields
 from enum import Enum
 from inspect import isclass
-from typing import Any, Dict, List, get_type_hints, Optional, get_origin
-import logging
+from typing import Any, Dict, List, get_type_hints, get_origin
 
 from ...libs.sparklib import SparkSession
 from ...types.cast.registry import convert
@@ -21,6 +23,15 @@ logger = logging.getLogger(__name__)
 
 
 def type_is_iterable(tpe: type, origin=None):
+    """Return True when the type annotation represents a list/set-like container.
+
+    Args:
+        tpe: The type annotation to inspect.
+        origin: Optional origin to reuse when recursing.
+
+    Returns:
+        True when the type is list-like, otherwise False.
+    """
     if (
         tpe is list or tpe is set
     ):
@@ -40,7 +51,7 @@ ALL_VALUES_TAG = "**all**"
 
 
 class WidgetType(Enum):
-    """Enum defining supported widget types in Databricks"""
+    """Enum defining supported Databricks widget types."""
     TEXT = "text"
     DROPDOWN = "dropdown"
     COMBOBOX = "combobox"
@@ -50,8 +61,15 @@ class WidgetType(Enum):
 
 @dataclass
 class NotebookConfig:
+    """Base class for widget-driven notebook configuration dataclasses."""
+
     @classmethod
     def get_dbutils(cls):
+        """Locate a ``dbutils`` instance from known Databricks injection points.
+
+        Returns:
+            The ``dbutils`` instance if found, otherwise None.
+        """
         # 1) explicit builtin injection (Databricks sometimes does this)
         if hasattr(builtins, "dbutils"):
             return builtins.dbutils
@@ -85,8 +103,7 @@ class NotebookConfig:
 
     @classmethod
     def from_environment(cls):
-        """
-        Build data class from environment variables or databricks dbutils if available.
+        """Build a config instance from Databricks widgets or environment variables.
 
         This method looks for values in the following order:
         1. Databricks widgets (if running in Databricks notebook)
@@ -94,7 +111,7 @@
         3. Environment variables
 
         Returns:
-            An instance of the dataclass populated with values from the environment
+            An instance of the dataclass populated with values from the environment.
         """
         dbutils = cls.get_dbutils()
         key_values: Dict[str, Any] = {}
@@ -229,6 +246,9 @@
 
         This method creates appropriate widgets for each field in the dataclass,
         with optional default values and customization options.
+
+        Returns:
+            None. Widgets are created in the notebook environment.
         """
         dbutils = cls.get_dbutils()
         if dbutils is None or not hasattr(dbutils, "widgets"):
@@ -296,6 +316,11 @@
 
     @classmethod
     def init_job(cls):
+        """Initialize widgets, tweak Spark session defaults, and return config.
+
+        Returns:
+            An instance of the dataclass populated from widgets or environment.
+        """
         cls.init_widgets()
 
         if SparkSession is not None:
@@ -308,31 +333,3 @@
         spark_session.conf.set("spark.sql.session.timeZone", "UTC")
 
         return cls.from_environment()
-
-
-class ExampleEnum(Enum):
-    """Example enum for widget demonstration"""
-    OPTION1 = "option1"
-    OPTION2 = "option2"
-    OPTION3 = "option3"
-
-
-@dataclass
-class CompleteNotebookConfig(NotebookConfig):
-    """Example JobConfig with various field types to demonstrate widget handling"""
-    # Basic types
-    text_field: str
-    integer_field: int = 42
-    float_field: float = 3.14
-    boolean_field: bool = True
-
-    # Special types
-    date_field: dt.date = dt.date(2023, 1, 1)
-    datetime_field: dt.datetime = dt.datetime(2023, 1, 1, 12, 0, 0)
-    enum_field: ExampleEnum = ExampleEnum.OPTION1
-
-    # Collection types
-    list_of_strings: List[str] = None  # Will be displayed as multiselect
-
-    # Optional fields
-    optional_text: Optional[str] = None
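With the bundled CompleteNotebookConfig demo removed, a widget-driven config now lives in application code. The sketch below is adapted from the deleted example and the documented init_job flow; the subclass and field names are illustrative only.

import datetime as dt
from dataclasses import dataclass
from typing import List

from yggdrasil.databricks.jobs import NotebookConfig

@dataclass
class MyJobConfig(NotebookConfig):
    # Values resolve from Databricks widgets when available,
    # falling back to environment variables.
    table_name: str = "main.default.events"
    batch_size: int = 500
    run_date: dt.date = dt.date(2024, 1, 1)
    countries: List[str] = None  # rendered as a multiselect widget

# In a notebook: creates the widgets, pins spark.sql.session.timeZone
# to UTC, and returns a populated config instance.
config = MyJobConfig.init_job()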
yggdrasil/databricks/sql/__init__.py

@@ -1,3 +1,5 @@
+"""Databricks SQL helpers and engine wrappers."""
+
 from .engine import SQLEngine, StatementResult
 
 # Backwards compatibility