ygg 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/METADATA +1 -1
- ygg-0.1.33.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +244 -3
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +24 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +29 -4
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +147 -0
- yggdrasil/databricks/sql/types.py +33 -1
- yggdrasil/databricks/workspaces/__init__.py +2 -1
- yggdrasil/databricks/workspaces/filesytem.py +183 -0
- yggdrasil/databricks/workspaces/io.py +387 -9
- yggdrasil/databricks/workspaces/path.py +297 -2
- yggdrasil/databricks/workspaces/path_kind.py +3 -0
- yggdrasil/databricks/workspaces/workspace.py +202 -5
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +123 -1
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.31.dist-info/RECORD +0 -59
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/WHEEL +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/workspace.py
CHANGED

@@ -1,3 +1,5 @@
+"""Workspace configuration and Databricks SDK helpers."""
+
 import dataclasses
 import logging
 import os
@@ -17,13 +19,13 @@ if TYPE_CHECKING:
     from ..compute.cluster import Cluster

 from .path import DatabricksPath, DatabricksPathKind
+from ...version import __version__ as YGGDRASIL_VERSION
 from ...libs.databrickslib import require_databricks_sdk, databricks_sdk

 if databricks_sdk is not None:
     from databricks.sdk import WorkspaceClient
-    from databricks.sdk.errors import ResourceDoesNotExist
+    from databricks.sdk.errors import ResourceDoesNotExist
     from databricks.sdk.service.workspace import ExportFormat, ObjectInfo
-    from databricks.sdk.service import catalog as catalog_svc
     from databricks.sdk.dbutils import FileInfo
     from databricks.sdk.service.files import DirectoryEntry

@@ -45,7 +47,7 @@ def _get_env_product():
     v = os.getenv("DATABRICKS_PRODUCT")

     if not v:
-        return
+        return "yggdrasil"
     return v.strip().lower()


@@ -53,7 +55,7 @@ def _get_env_product_version():
     v = os.getenv("DATABRICKS_PRODUCT_VERSION")

     if not v:
-        return
+        return YGGDRASIL_VERSION
     return v.strip().lower()

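These two hunks change the environment helpers so they no longer return `None` when the variables are unset; they now fall back to "yggdrasil" and the package version. A minimal standalone sketch of that fallback logic (the function names and the version literal are illustrative, not the packaged module):

```python
import os

YGGDRASIL_VERSION = "0.1.33"  # illustrative; the real value is imported from yggdrasil.version


def get_product() -> str:
    # Fall back to the library name when DATABRICKS_PRODUCT is unset.
    v = os.getenv("DATABRICKS_PRODUCT")
    if not v:
        return "yggdrasil"
    return v.strip().lower()


def get_product_version() -> str:
    # Fall back to the installed package version when unset.
    v = os.getenv("DATABRICKS_PRODUCT_VERSION")
    if not v:
        return YGGDRASIL_VERSION
    return v.strip().lower()


print(get_product(), get_product_version())
```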
@@ -67,6 +69,7 @@ def _get_env_product_tag():

 @dataclass
 class Workspace:
+    """Configuration wrapper for connecting to a Databricks workspace."""
     # Databricks / generic
     host: Optional[str] = None
     account_id: Optional[str] = None
@@ -113,6 +116,11 @@ class Workspace:
     # Pickle support
     # -------------------------
     def __getstate__(self):
+        """Serialize the workspace state for pickling.
+
+        Returns:
+            A pickle-ready state dictionary.
+        """
         state = self.__dict__.copy()
         state.pop("_sdk", None)

@@ -122,6 +130,11 @@ class Workspace:
         return state

     def __setstate__(self, state):
+        """Restore workspace state after unpickling.
+
+        Args:
+            state: Serialized state dictionary.
+        """
         self.__dict__.update(state)
         self._sdk = None

@@ -132,10 +145,25 @@ class Workspace:
         self.connect(reset=True)

     def __enter__(self) -> "Workspace":
+        """Enter a context manager and connect to the workspace.
+
+        Returns:
+            The connected Workspace instance.
+        """
         self._was_connected = self._sdk is not None
         return self.connect()

     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager and close if newly connected.
+
+        Args:
+            exc_type: Exception type, if raised.
+            exc_val: Exception value, if raised.
+            exc_tb: Exception traceback, if raised.
+
+        Returns:
+            None.
+        """
         if not self._was_connected:
             self.close()

@@ -149,6 +177,14 @@ class Workspace:
         self,
         **kwargs
     ) -> "Workspace":
+        """Clone the workspace config with overrides.
+
+        Args:
+            **kwargs: Field overrides for the clone.
+
+        Returns:
+            A new Workspace instance with updated fields.
+        """
         state = self.__getstate__()
         state.update(kwargs)
         return Workspace().__setstate__(state)
@@ -158,9 +194,23 @@ class Workspace:
     # -------------------------
     @property
     def connected(self):
+        """Return True when a WorkspaceClient is cached.
+
+        Returns:
+            True if connected, otherwise False.
+        """
         return self._sdk is not None

     def connect(self, reset: bool = False, clone: bool = False) -> "Workspace":
+        """Connect to the workspace and cache the SDK client.
+
+        Args:
+            reset: Whether to reset the cached client before connecting.
+            clone: Whether to connect a cloned instance.
+
+        Returns:
+            The connected Workspace instance.
+        """
         if reset:
             self._sdk = None

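Taken together, the pickling, context-manager, and connect hunks above describe the Workspace lifecycle: `__enter__` connects and caches a `WorkspaceClient`, and `__exit__` closes it only if this context created the connection. A hedged usage sketch; the import path, host, and token are placeholders rather than values from the diff:

```python
from yggdrasil.databricks.workspaces import Workspace  # import path assumed from the package layout

# Placeholder credentials; any auth supported by the Databricks SDK should work here.
ws = Workspace(host="https://example.cloud.databricks.com", token="dapi-example")

with ws as connected:
    print(connected.connected)      # True while the WorkspaceClient is cached
    user = connected.current_user   # property backed by sdk().current_user.me()
    client = connected.sdk()        # the underlying databricks.sdk.WorkspaceClient
# Exiting closes the connection only if it was opened by this context.
```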
@@ -270,6 +320,11 @@ class Workspace:
         return str(files[0]) if files else None

     def reset_local_cache(self):
+        """Remove cached browser OAuth tokens.
+
+        Returns:
+            None.
+        """
         local_cache = self._local_cache_token_path()

         if local_cache:
@@ -277,6 +332,11 @@ class Workspace:

     @property
     def current_user(self):
+        """Return the current Databricks user.
+
+        Returns:
+            The current user object from the SDK.
+        """
         try:
             return self.sdk().current_user.me()
         except:
@@ -285,6 +345,11 @@ class Workspace:
             raise

     def current_token(self) -> str:
+        """Return the active API token for this workspace.
+
+        Returns:
+            The bearer token string.
+        """
         if self.token:
             return self.token

@@ -301,6 +366,14 @@ class Workspace:
         self,
         workspace: Optional["Workspace"] = None,
     ):
+        """Return a PyArrow filesystem for Databricks paths.
+
+        Args:
+            workspace: Optional workspace override.
+
+        Returns:
+            A DatabricksFileSystem instance.
+        """
         from .filesytem import DatabricksFileSystem, DatabricksFileSystemHandler

         handler = DatabricksFileSystemHandler(
@@ -317,6 +390,16 @@ class Workspace:
         kind: Optional[DatabricksPathKind] = None,
         workspace: Optional["Workspace"] = None
     ):
+        """Create a DatabricksPath in this workspace.
+
+        Args:
+            parts: Path parts or string to parse.
+            kind: Optional path kind override.
+            workspace: Optional workspace override.
+
+        Returns:
+            A DatabricksPath instance.
+        """
         workspace = self if workspace is None else workspace

         if kind is None or isinstance(parts, str):
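The `dbfs_path` factory above (its name is visible where `WorkspaceService.dbfs_path` delegates to it later in the diff) turns a string or parts sequence into a `DatabricksPath`. A hedged sketch; the import path and the example paths are illustrative:

```python
from yggdrasil.databricks.workspaces import Workspace  # import path assumed

ws = Workspace(host="https://example.cloud.databricks.com", token="dapi-example").connect()

# A string is parsed and the path kind (DBFS / Volumes / Workspace) inferred;
# kind can also be passed explicitly via DatabricksPathKind.
vol_file = ws.dbfs_path(parts="/Volumes/main/default/raw/events.json")
shared_dir = ws.dbfs_path(parts="/Workspace/Shared/.ygg/cache")
print(type(vol_file).__name__)  # DatabricksPath
```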
@@ -337,6 +420,12 @@ class Workspace:
     ) -> DatabricksPath:
         """
         Shared cache base under Volumes for the current user.
+
+        Args:
+            suffix: Optional path suffix to append.
+
+        Returns:
+            A DatabricksPath pointing at the shared cache location.
         """
         base = "/Workspace/Shared/.ygg/cache"

@@ -351,6 +440,11 @@ class Workspace:
     # ------------------------------------------------------------------ #

     def sdk(self) -> "WorkspaceClient":
+        """Return the connected WorkspaceClient.
+
+        Returns:
+            The WorkspaceClient instance.
+        """
         return self.connect()._sdk

     # ------------------------------------------------------------------ #
@@ -370,6 +464,13 @@ class Workspace:
         - other paths -> Workspace paths (sdk.workspace.list)

         If recursive=True, yield all nested files/directories.
+
+        Args:
+            path: Path string to list.
+            recursive: Whether to list recursively.
+
+        Returns:
+            An iterator of workspace/DBFS/volume entries.
         """
         sdk = self.sdk()

@@ -422,6 +523,13 @@ class Workspace:
         via workspace.download(...).

         Returned object is a BinaryIO context manager.
+
+        Args:
+            path: Path to open.
+            workspace_format: Optional export format for workspace paths.
+
+        Returns:
+            A BinaryIO stream for reading.
         """
         sdk = self.sdk()

@@ -437,9 +545,19 @@ class Workspace:

     @staticmethod
     def is_in_databricks_environment():
+        """Return True when running on a Databricks runtime.
+
+        Returns:
+            True if running on Databricks, otherwise False.
+        """
         return os.getenv("DATABRICKS_RUNTIME_VERSION") is not None

     def default_tags(self):
+        """Return default resource tags for Databricks assets.
+
+        Returns:
+            A dict of default tags.
+        """
         return {
             k: v
             for k, v in (
@@ -451,6 +569,14 @@ class Workspace:
         }

     def merge_tags(self, existing: dict | None = None):
+        """Merge default tags with an existing set.
+
+        Args:
+            existing: Optional existing tags.
+
+        Returns:
+            A dict of merged tags.
+        """
         if existing:
             return self.default_tags()

@@ -461,6 +587,17 @@ class Workspace:
         schema_name: Optional[str] = None,
         **kwargs
     ):
+        """Return a SQLEngine configured for this workspace.
+
+        Args:
+            workspace: Optional workspace override.
+            catalog_name: Optional catalog name.
+            schema_name: Optional schema name.
+            **kwargs: Additional SQLEngine parameters.
+
+        Returns:
+            A SQLEngine instance.
+        """
         from ..sql import SQLEngine

         return SQLEngine(
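The hunk above documents a factory that builds a `SQLEngine` bound to this workspace. The accessor's name is not visible in the hunk, so `sql_engine` below is an assumption; only `SQLEngine`, `catalog_name`, and `schema_name` come from the diff:

```python
from yggdrasil.databricks.workspaces import Workspace  # import path assumed

ws = Workspace(host="https://example.cloud.databricks.com", token="dapi-example")

# `sql_engine` is a hypothetical accessor name for the documented factory.
engine = ws.sql_engine(catalog_name="main", schema_name="default")
print(type(engine).__name__)  # SQLEngine
```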
@@ -476,11 +613,20 @@ class Workspace:
         cluster_name: Optional[str] = None,
         **kwargs
     ) -> "Cluster":
+        """Return a Cluster helper bound to this workspace.
+
+        Args:
+            cluster_id: Optional cluster id.
+            cluster_name: Optional cluster name.
+            **kwargs: Additional Cluster parameters.
+
+        Returns:
+            A Cluster instance.
+        """
         from ..compute.cluster import Cluster

         return Cluster(workspace=self, cluster_id=cluster_id, cluster_name=cluster_name, **kwargs)

-
 # ---------------------------------------------------------------------------
 # Workspace-bound base class
 # ---------------------------------------------------------------------------
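The cluster factory delegates to `Cluster(workspace=..., cluster_id=..., cluster_name=..., **kwargs)`, which is visible in the hunk. A hedged sketch using that constructor directly, since the accessor name on `Workspace` is not shown in the diff:

```python
from yggdrasil.databricks.compute.cluster import Cluster  # module path from the file list above
from yggdrasil.databricks.workspaces import Workspace     # import path assumed

ws = Workspace(host="https://example.cloud.databricks.com", token="dapi-example")

# Same call shape as the factory's return statement; the cluster name is a placeholder.
cl = Cluster(workspace=ws, cluster_name="shared-autoscaling")
```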
@@ -490,23 +636,54 @@ DBXWorkspace = Workspace

 @dataclass
 class WorkspaceService(ABC):
+    """Base class for helpers that depend on a Workspace."""
     workspace: Workspace = dataclasses.field(default_factory=Workspace)

     def __post_init__(self):
+        """Ensure a Workspace instance is available.
+
+        Returns:
+            None.
+        """
         if self.workspace is None:
             self.workspace = Workspace()

     def __enter__(self):
+        """Enter a context manager and connect the workspace.
+
+        Returns:
+            The current WorkspaceService instance.
+        """
         self.workspace.__enter__()
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
+        """Exit the context manager and close the workspace.
+
+        Args:
+            exc_type: Exception type, if raised.
+            exc_val: Exception value, if raised.
+            exc_tb: Exception traceback, if raised.
+
+        Returns:
+            None.
+        """
         self.workspace.__exit__(exc_type=exc_type, exc_val=exc_val, exc_tb=exc_tb)

     def is_in_databricks_environment(self):
+        """Return True when running on a Databricks runtime.
+
+        Returns:
+            True if running on Databricks, otherwise False.
+        """
         return self.workspace.is_in_databricks_environment()

     def connect(self):
+        """Connect the underlying workspace.
+
+        Returns:
+            The current WorkspaceService instance.
+        """
         self.workspace = self.workspace.connect()
         return self

@@ -516,6 +693,16 @@ class WorkspaceService(ABC):
         kind: Optional[DatabricksPathKind] = None,
         workspace: Optional["Workspace"] = None
     ):
+        """Create a DatabricksPath in the underlying workspace.
+
+        Args:
+            parts: Path parts or string to parse.
+            kind: Optional path kind override.
+            workspace: Optional workspace override.
+
+        Returns:
+            A DatabricksPath instance.
+        """
         return self.workspace.dbfs_path(
             kind=kind,
             parts=parts,
@@ -523,8 +710,18 @@ class WorkspaceService(ABC):
         )

     def sdk(self):
+        """Return the WorkspaceClient for the underlying workspace.
+
+        Returns:
+            The WorkspaceClient instance.
+        """
         return self.workspace.sdk()

     @property
     def current_user(self):
+        """Return the current Databricks user.
+
+        Returns:
+            The current user object from the SDK.
+        """
         return self.workspace.current_user
yggdrasil/dataclasses/dataclass.py
CHANGED

@@ -1,3 +1,5 @@
+"""Dataclass helpers that integrate with Arrow schemas and safe casting."""
+
 import dataclasses
 from inspect import isclass
 from typing import Any, Iterable, Mapping, Tuple
@@ -18,6 +20,7 @@ def is_yggdataclass(cls_or_instance: Any) -> bool:

     Args:
         cls_or_instance: The class or instance to check.
+
     Returns:
         True if the class or instance
         is a yggdrasil dataclass, False otherwise.
@@ -26,6 +29,14 @@ def is_yggdataclass(cls_or_instance: Any) -> bool:


 def get_dataclass_arrow_field(cls_or_instance: Any) -> pa.Field:
+    """Return a cached Arrow Field describing the dataclass type.
+
+    Args:
+        cls_or_instance: Dataclass class or instance.
+
+    Returns:
+        Arrow field describing the dataclass schema.
+    """
     if is_yggdataclass(cls_or_instance):
         return cls_or_instance.__arrow_field__()

@@ -58,7 +69,7 @@ def yggdataclass(
     kw_only=False, slots=False,
     weakref_slot=False
 ):
-    """
+    """Decorate a class with dataclass behavior plus Arrow helpers.

     Examines PEP 526 __annotations__ to determine fields.

@@ -73,7 +84,24 @@ def yggdataclass(
     """

     def wrap(c):
+        """Wrap a class with yggdrasil dataclass enhancements.
+
+        Args:
+            c: Class to decorate.
+
+        Returns:
+            Decorated dataclass type.
+        """
+
         def _init_public_fields(cls):
+            """Return init-enabled, public dataclass fields.
+
+            Args:
+                cls: Dataclass type.
+
+            Returns:
+                List of dataclasses.Field objects.
+            """
             return [
                 field
                 for field in dataclasses.fields(cls)
@@ -83,6 +111,11 @@ def yggdataclass(
         if not hasattr(c, "default_instance"):
             @classmethod
             def default_instance(cls):
+                """Return a default instance built from type defaults.
+
+                Returns:
+                    Default instance of the dataclass.
+                """
                 from yggdrasil.types import default_scalar

                 if not hasattr(cls, "__default_instance__"):
@@ -135,6 +168,14 @@ def yggdataclass(
         if not hasattr(c, "__arrow_field__"):
             @classmethod
             def __arrow_field__(cls, name: str | None = None):
+                """Return an Arrow field representing the dataclass schema.
+
+                Args:
+                    name: Optional override for the field name.
+
+                Returns:
+                    Arrow field describing the dataclass schema.
+                """
                 from yggdrasil.types.python_arrow import arrow_field_from_hint

                 return arrow_field_from_hint(cls, name=name)
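The dataclass.py hunks document a decorator that layers Arrow helpers (`__arrow_field__`, `default_instance`) onto ordinary dataclasses. A hedged usage sketch; the field types and the `yggdrasil.dataclasses` import path are assumptions based on the names shown above:

```python
import pyarrow as pa

from yggdrasil.dataclasses import (  # re-export assumed; the symbols are defined in dataclass.py
    yggdataclass,
    is_yggdataclass,
    get_dataclass_arrow_field,
)


@yggdataclass
class Reading:
    sensor_id: str
    value: float


print(is_yggdataclass(Reading))             # True
field = get_dataclass_arrow_field(Reading)  # pa.Field describing the dataclass schema
print(isinstance(field, pa.Field), field.type)
```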
yggdrasil/libs/__init__.py
CHANGED
yggdrasil/libs/databrickslib.py
CHANGED
@@ -1,3 +1,5 @@
+"""Optional Databricks SDK dependency helpers."""
+
 try:
     import databricks
     import databricks.sdk  # type: ignore
@@ -6,7 +8,9 @@ try:
     databricks_sdk = databricks.sdk
 except ImportError:
     class _DatabricksDummy:
+        """Placeholder object that raises if Databricks SDK is required."""
         def __getattr__(self, item):
+            """Raise an error when accessing missing Databricks SDK attributes."""
             require_databricks_sdk()

     databricks = _DatabricksDummy
@@ -14,6 +18,11 @@ except ImportError:


 def require_databricks_sdk():
+    """Ensure the Databricks SDK is available before use.
+
+    Returns:
+        None.
+    """
     if databricks_sdk is None:
         raise ImportError(
             "databricks_sdk is required to use this function. "
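The databrickslib.py hunks document the optional-dependency guard: if importing the SDK fails, a placeholder object raises a helpful ImportError on first attribute access. A minimal standalone sketch of the same pattern, with generic names rather than the packaged module:

```python
try:
    import databricks.sdk as databricks_sdk  # optional dependency
except ImportError:
    databricks_sdk = None


def require_databricks_sdk() -> None:
    # Fail loudly, with an install hint, only when the SDK is actually needed.
    if databricks_sdk is None:
        raise ImportError(
            "databricks_sdk is required to use this function. "
            "Install it with: pip install databricks-sdk"  # install hint is illustrative
        )


class _SdkProxy:
    """Stand-in that defers the ImportError until an attribute is accessed."""

    def __getattr__(self, item):
        require_databricks_sdk()
        return getattr(databricks_sdk, item)
```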
yggdrasil/libs/extensions/polars_extensions.py
CHANGED

@@ -1,3 +1,5 @@
+"""Polars DataFrame extension helpers for joins and resampling."""
+
 from __future__ import annotations

 import datetime
@@ -39,6 +41,14 @@ def join_coalesced(


 def _normalize_group_by(group_by: str | Sequence[str] | None) -> list[str] | None:
+    """Normalize group_by inputs into a list or None.
+
+    Args:
+        group_by: Grouping column or columns.
+
+    Returns:
+        List of column names or None.
+    """
     if group_by is None:
         return None
     if isinstance(group_by, str):
@@ -57,6 +67,15 @@ def _filter_kwargs_for_callable(fn: object, kwargs: dict[str, Any]) -> dict[str,


 def _expr_from_agg(col: str, agg: Any) -> "pl.Expr":
+    """Build a Polars expression from an aggregation spec.
+
+    Args:
+        col: Column name to aggregate.
+        agg: Aggregation spec (expr, callable, or string).
+
+    Returns:
+        Polars expression.
+    """
     base = pl.col(col)

     if isinstance(agg, pl.Expr):
@@ -80,6 +99,14 @@ def _expr_from_agg(col: str, agg: Any) -> "pl.Expr":
|
|
|
80
99
|
|
|
81
100
|
|
|
82
101
|
def _normalize_aggs(agg: AggSpec) -> list["pl.Expr"]:
|
|
102
|
+
"""Normalize aggregation specs into a list of Polars expressions.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
agg: Mapping or sequence of aggregation specs.
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
List of Polars expressions.
|
|
109
|
+
"""
|
|
83
110
|
if isinstance(agg, Mapping):
|
|
84
111
|
return [_expr_from_agg(col, spec) for col, spec in agg.items()]
|
|
85
112
|
|
|
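These helpers accept an aggregation spec per column as an expression, a callable, or a string method name. A hedged sketch in plain Polars of what the string form boils down to (the yggdrasil wrappers themselves are not imported here):

```python
import polars as pl

df = pl.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

# Equivalent of a spec like {"x": "sum"}: the string resolves to the matching
# expression method on pl.col("x"), which is what _expr_from_agg constructs.
spec = {"x": "sum"}
exprs = [getattr(pl.col(col), name)() for col, name in spec.items()]
print(df.group_by("g").agg(exprs).sort("g"))
```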
@@ -91,11 +118,27 @@ def _normalize_aggs(agg: AggSpec) -> list["pl.Expr"]:
|
|
|
91
118
|
|
|
92
119
|
|
|
93
120
|
def _is_datetime(dtype: object) -> bool:
|
|
121
|
+
"""Return True when the dtype is a Polars datetime.
|
|
122
|
+
|
|
123
|
+
Args:
|
|
124
|
+
dtype: Polars dtype to inspect.
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
True if dtype is Polars Datetime.
|
|
128
|
+
"""
|
|
94
129
|
# Datetime-only inference (per requirement), version-safe.
|
|
95
130
|
return isinstance(dtype, pl.Datetime)
|
|
96
131
|
|
|
97
132
|
|
|
98
133
|
def _infer_time_col(df: "pl.DataFrame") -> str:
|
|
134
|
+
"""Infer the first datetime-like column name from a DataFrame.
|
|
135
|
+
|
|
136
|
+
Args:
|
|
137
|
+
df: Polars DataFrame to inspect.
|
|
138
|
+
|
|
139
|
+
Returns:
|
|
140
|
+
Column name of the first datetime field.
|
|
141
|
+
"""
|
|
99
142
|
# Find first Datetime column in schema order; ignore Date columns.
|
|
100
143
|
for name, dtype in df.schema.items():
|
|
101
144
|
if _is_datetime(dtype):
|
|
@@ -106,6 +149,15 @@ def _infer_time_col(df: "pl.DataFrame") -> str:
|
|
|
106
149
|
|
|
107
150
|
|
|
108
151
|
def _ensure_datetime_like(df: "pl.DataFrame", time_col: str) -> "pl.DataFrame":
|
|
152
|
+
"""Ensure a time column is cast to datetime for resampling.
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
df: Polars DataFrame.
|
|
156
|
+
time_col: Column name to validate.
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
DataFrame with time column cast to datetime if needed.
|
|
160
|
+
"""
|
|
109
161
|
dtype = df.schema.get(time_col)
|
|
110
162
|
if dtype is None:
|
|
111
163
|
raise KeyError(f"resample: time_col '{time_col}' not found in DataFrame columns.")
|
|
@@ -151,6 +203,14 @@ def _timedelta_to_polars_duration(td: datetime.timedelta) -> str:


 def _normalize_duration(v: str | datetime.timedelta | None) -> str | None:
+    """Normalize duration inputs to a Polars duration string.
+
+    Args:
+        v: Duration string, timedelta, or None.
+
+    Returns:
+        Normalized duration string or None.
+    """
     if v is None:
         return None
     if isinstance(v, str):
@@ -168,6 +228,18 @@ def _upsample_single(
     offset: str | datetime.timedelta | None,
     keep_group_order: bool,
 ) -> "pl.DataFrame":
+    """Upsample a single DataFrame with normalized duration arguments.
+
+    Args:
+        df: Polars DataFrame to upsample.
+        time_col: Name of the time column.
+        every: Sampling interval.
+        offset: Optional offset interval.
+        keep_group_order: Preserve input order when grouping.
+
+    Returns:
+        Upsampled Polars DataFrame.
+    """
     df = df.sort(time_col)

     every_n = _normalize_duration(every)