ygg 0.1.34__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/METADATA +1 -1
- {ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/RECORD +13 -12
- yggdrasil/databricks/compute/cluster.py +48 -17
- yggdrasil/databricks/compute/execution_context.py +2 -2
- yggdrasil/databricks/compute/remote.py +25 -8
- yggdrasil/databricks/sql/engine.py +43 -27
- yggdrasil/databricks/sql/statement_result.py +36 -44
- yggdrasil/pyutils/equality.py +107 -0
- yggdrasil/version.py +1 -1
- {ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/WHEEL +0 -0
- {ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/top_level.txt +0 -0
{ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
-ygg-0.1.34.dist-info/licenses/LICENSE,sha256=…
+ygg-0.1.37.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
 yggdrasil/__init__.py,sha256=PfH7Xwt6uue6oqe6S5V8NhDJcVQClkKrBE1KXhdelZc,117
-yggdrasil/version.py,sha256=…
+yggdrasil/version.py,sha256=bC2HSZRduanhYcwfv2uqbh4LgiwM3nV4LyoWJhD4ftY,22
 yggdrasil/databricks/__init__.py,sha256=skctY2c8W-hI81upx9F_PWRe5ishL3hrdiTuizgDjdw,152
 yggdrasil/databricks/compute/__init__.py,sha256=NvdzmaJSNYY1uJthv1hHdBuNu3bD_-Z65DWnaJt9yXg,289
-yggdrasil/databricks/compute/cluster.py,sha256=…
-yggdrasil/databricks/compute/execution_context.py,sha256=…
-yggdrasil/databricks/compute/remote.py,sha256=…
+yggdrasil/databricks/compute/cluster.py,sha256=mnNzjCx7X3iK22oZ7K3pqot0AXq9JTdg97kT61j2_UU,40729
+yggdrasil/databricks/compute/execution_context.py,sha256=nxrNXoarq_JAB-Cpj0udHhq2jx-DmMbRWJdAezLrPis,22347
+yggdrasil/databricks/compute/remote.py,sha256=nEN_Fr1Ouul_iKOf4B5QjEGscYAcl7nHjGsl2toRzrU,2874
 yggdrasil/databricks/jobs/__init__.py,sha256=snxGSJb0M5I39v0y3IR-uEeSlZR248cQ_4DJ1sYs-h8,154
 yggdrasil/databricks/jobs/config.py,sha256=9LGeHD04hbfy0xt8_6oobC4moKJh4_DTjZiK4Q2Tqjk,11557
 yggdrasil/databricks/sql/__init__.py,sha256=y1n5yg-drZ8QVZbEgznsRG24kdJSnFis9l2YfYCsaCM,234
-yggdrasil/databricks/sql/engine.py,sha256=…
+yggdrasil/databricks/sql/engine.py,sha256=kUFBddJJQC0AgDqH0l7GFs7d_Ony5rc8fOv4inLU6Vw,41051
 yggdrasil/databricks/sql/exceptions.py,sha256=Jqd_gT_VyPL8klJEHYEzpv5eHtmdY43WiQ7HZBaEqSk,53
-yggdrasil/databricks/sql/statement_result.py,sha256=…
+yggdrasil/databricks/sql/statement_result.py,sha256=KXBLbEpwrjrAeH0ezKNlaa6Vm3jbG3R0ZCnEFHvqpoQ,16834
 yggdrasil/databricks/sql/types.py,sha256=5G-BM9_eOsRKEMzeDTWUsWW5g4Idvs-czVCpOCrMhdA,6412
 yggdrasil/databricks/workspaces/__init__.py,sha256=Ti1I99JTC3koYJaCy8WYvkAox4KdcuMRk8b2rHroWCY,133
 yggdrasil/databricks/workspaces/filesytem.py,sha256=Z8JXU7_XUEbw9fpTQT1avRQKi-IAP2KemXBMPkUoY4w,9805
@@ -31,6 +31,7 @@ yggdrasil/libs/extensions/polars_extensions.py,sha256=RTkGi8llhPJjX7x9egix7-yXWo
 yggdrasil/libs/extensions/spark_extensions.py,sha256=E64n-3SFTDgMuXwWitX6vOYP9ln2lpGKb0htoBLEZgc,16745
 yggdrasil/pyutils/__init__.py,sha256=tl-LapAc71TV7RMgf2ftKwrzr8iiLOGHeJgA3RvO93w,293
 yggdrasil/pyutils/callable_serde.py,sha256=euY7Kiy04i1tpWKuB0b2qQ1FokLC3nq0cv7PObWYUBE,21809
+yggdrasil/pyutils/equality.py,sha256=Xyf8D1dLUCm3spDEir8Zyj7O4US_fBJwEylJCfJ9slI,3080
 yggdrasil/pyutils/exceptions.py,sha256=ssKNm-rjhavHUOZmGA7_1Gq9tSHDrb2EFI-cnBuWgng,3388
 yggdrasil/pyutils/expiring_dict.py,sha256=q9gb09-2EUN-jQZumUw5BXOQGYcj1wb85qKtQlciSxg,5825
 yggdrasil/pyutils/modules.py,sha256=B7IP99YqUMW6-DIESFzBx8-09V1d0a8qrIJUDFhhL2g,11424
@@ -54,8 +55,8 @@ yggdrasil/types/cast/registry.py,sha256=_zdFGmUBB7P-e_LIcJlOxMcxAkXoA-UXB6HqLMgT
 yggdrasil/types/cast/spark_cast.py,sha256=_KAsl1DqmKMSfWxqhVE7gosjYdgiL1C5bDQv6eP3HtA,24926
 yggdrasil/types/cast/spark_pandas_cast.py,sha256=BuTiWrdCANZCdD_p2MAytqm74eq-rdRXd-LGojBRrfU,5023
 yggdrasil/types/cast/spark_polars_cast.py,sha256=btmZNHXn2NSt3fUuB4xg7coaE0RezIBdZD92H8NK0Jw,9073
-ygg-0.1.34.dist-info/METADATA,sha256=…
-ygg-0.1.34.dist-info/WHEEL,sha256=…
-ygg-0.1.34.dist-info/entry_points.txt,sha256=…
-ygg-0.1.34.dist-info/top_level.txt,sha256=…
-ygg-0.1.34.dist-info/RECORD,,
+ygg-0.1.37.dist-info/METADATA,sha256=QOawaiOu5RrOUAhuIws2wNB1Nj3CQq38desRezzYMwk,19204
+ygg-0.1.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ygg-0.1.37.dist-info/entry_points.txt,sha256=6q-vpWG3kvw2dhctQ0LALdatoeefkN855Ev02I1dKGY,70
+ygg-0.1.37.dist-info/top_level.txt,sha256=iBe9Kk4VIVbLpgv_p8OZUIfxgj4dgJ5wBg6vO3rigso,10
+ygg-0.1.37.dist-info/RECORD,,
yggdrasil/databricks/compute/cluster.py
CHANGED
@@ -24,6 +24,7 @@ from .execution_context import ExecutionContext
 from ..workspaces.workspace import WorkspaceService, Workspace
 from ... import retry, CallableSerde
 from ...libs.databrickslib import databricks_sdk
+from ...pyutils.equality import dicts_equal, dict_diff
 from ...pyutils.expiring_dict import ExpiringDict
 from ...pyutils.modules import PipIndexSettings
 from ...pyutils.python_env import PythonEnv
@@ -110,7 +111,7 @@ class Cluster(WorkspaceService):
 
     _details: Optional["ClusterDetails"] = dataclasses.field(default=None, repr=False)
     _details_refresh_time: float = dataclasses.field(default=0, repr=False)
-    _system_context: Optional[ExecutionContext] = None
+    _system_context: Optional[ExecutionContext] = dataclasses.field(default=None, repr=False)
 
     # host → Cluster instance
     _env_clusters: ClassVar[Dict[str, "Cluster"]] = {}
@@ -309,6 +310,11 @@ class Cluster(WorkspaceService):
         self.details = self.clusters_client().get(cluster_id=self.cluster_id)
         return self._details
 
+    def refresh(self, max_delay: float | None = None):
+        self.details = self.fresh_details(max_delay=max_delay)
+
+        return self
+
     @details.setter
     def details(self, value: "ClusterDetails"):
         """Cache cluster details and update identifiers."""
@@ -321,10 +327,10 @@ class Cluster(WorkspaceService):
     @property
     def state(self):
         """Return the current cluster state."""
-        …
+        self.refresh()
 
-        if …
-            return …
+        if self._details is not None:
+            return self._details.state
         return State.UNKNOWN
 
     @property
@@ -355,7 +361,7 @@ class Cluster(WorkspaceService):
     def wait_for_status(
         self,
         tick: float = 0.5,
-        timeout: float = 600,
+        timeout: Union[float, dt.timedelta] = 600,
        backoff: int = 2,
        max_sleep_time: float = 15
     ):
@@ -373,6 +379,9 @@ class Cluster(WorkspaceService):
         start = time.time()
         sleep_time = tick
 
+        if isinstance(timeout, dt.timedelta):
+            timeout = timeout.total_seconds()
+
         while self.is_pending:
             time.sleep(sleep_time)
 
@@ -658,8 +667,6 @@ class Cluster(WorkspaceService):
         Returns:
             The updated Cluster instance.
         """
-        self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
-
         existing_details = {
             k: v
             for k, v in self.details.as_shallow_dict().items()
@@ -672,22 +679,36 @@ class Cluster(WorkspaceService):
             if k in _EDIT_ARG_NAMES
         }
 
-        …
+        same = dicts_equal(
+            existing_details,
+            update_details,
+            keys=_EDIT_ARG_NAMES,
+            treat_missing_as_none=True,
+            float_tol=0.0,  # set e.g. 1e-6 if you have float-y stuff
+        )
+
+        if not same:
+            diff = {
+                k: v[1]
+                for k, v in dict_diff(existing_details, update_details, keys=_EDIT_ARG_NAMES).items()
+            }
+
             logger.debug(
                 "Updating %s with %s",
-                self,
+                self, diff
             )
 
             self.wait_for_status()
-            self.details = …
-            …
-            )(**update_details)
+            self.details = self.clusters_client().edit(**update_details)
+            self.wait_for_status()
 
             logger.info(
                 "Updated %s",
                 self
             )
 
+        self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
+
         return self
 
     def list_clusters(self) -> Iterator["Cluster"]:
@@ -742,7 +763,10 @@ class Cluster(WorkspaceService):
             return None
 
         return Cluster(
-            workspace=self.workspace,
+            workspace=self.workspace,
+            cluster_id=details.cluster_id,
+            cluster_name=details.cluster_name,
+            _details=details
         )
 
         for cluster in self.list_clusters():
@@ -760,16 +784,18 @@ class Cluster(WorkspaceService):
 
     def ensure_running(
         self,
+        wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
     ) -> "Cluster":
         """Ensure the cluster is running.
 
         Returns:
             The current Cluster instance.
         """
-        return self.start()
+        return self.start(wait_timeout=wait_timeout)
 
     def start(
         self,
+        wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
     ) -> "Cluster":
         """Start the cluster if it is not already running.
 
@@ -780,8 +806,13 @@ class Cluster(WorkspaceService):
 
         if not self.is_running:
             logger.info("Starting %s", self)
-
-            …
+
+            if wait_timeout:
+                self.clusters_client().start(cluster_id=self.cluster_id)
+                self.wait_for_status(timeout=wait_timeout.total_seconds())
+                self.wait_installed_libraries(timeout=wait_timeout)
+            else:
+                self.clusters_client().start(cluster_id=self.cluster_id)
 
         return self
 
@@ -1124,7 +1155,7 @@ class Cluster(WorkspaceService):
                 "Waiting %s to install libraries timed out" % self
             )
 
-            time.sleep(…)
+            time.sleep(5)
             statuses = list(self.installed_library_statuses())
 
         return self
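Note: `edit()` now diffs the current cluster details against the requested settings and only calls the edit API when something actually changed; library installation also moves to after the edit completes. A minimal sketch of the detection logic using the new helpers, assuming ygg 0.1.37 is installed (the settings and the `_EDIT_ARG_NAMES` subset below are hypothetical):

```python
# Sketch of the no-op edit detection added in 0.1.37 (values are hypothetical).
from yggdrasil.pyutils.equality import dicts_equal, dict_diff

_EDIT_ARG_NAMES = {"cluster_name", "num_workers", "node_type_id"}  # illustrative subset

existing_details = {"cluster_name": "etl", "num_workers": 2, "node_type_id": "m5.xlarge"}
update_details = {"cluster_name": "etl", "num_workers": 4, "node_type_id": "m5.xlarge"}

if not dicts_equal(existing_details, update_details,
                   keys=_EDIT_ARG_NAMES, treat_missing_as_none=True):
    # Log only the target values of the keys that actually change
    diff = {
        k: v[1]
        for k, v in dict_diff(existing_details, update_details, keys=_EDIT_ARG_NAMES).items()
    }
    print(diff)  # {'num_workers': 4}
```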
yggdrasil/databricks/compute/execution_context.py
CHANGED
@@ -78,8 +78,8 @@ class ExecutionContext:
     language: Optional["Language"] = None
     context_id: Optional[str] = None
 
-    _was_connected: Optional[bool] = None
-    _remote_metadata: Optional[RemoteMetadata] = None
+    _was_connected: Optional[bool] = dc.field(default=None, repr=False)
+    _remote_metadata: Optional[RemoteMetadata] = dc.field(default=None, repr=False)
 
     _lock: threading.RLock = dc.field(default_factory=threading.RLock, init=False, repr=False)
 
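Note: these fields previously used plain `None` defaults; moving them to `dc.field(default=None, repr=False)` keeps connection bookkeeping out of the generated dataclass `repr`, and thus out of logs. A quick self-contained illustration (the `Example` class is hypothetical):

```python
import dataclasses as dc
from typing import Optional

@dc.dataclass
class Example:
    context_id: Optional[str] = None
    # repr=False hides transient internals from __repr__, mirroring the
    # _was_connected/_remote_metadata change above
    _cache: Optional[dict] = dc.field(default=None, repr=False)

print(Example(context_id="abc"))  # -> Example(context_id='abc')
```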
yggdrasil/databricks/compute/remote.py
CHANGED
@@ -2,11 +2,12 @@
 
 import datetime as dt
 import logging
+import os
 from typing import (
     Callable,
     Optional,
     TypeVar,
-    List, TYPE_CHECKING,
+    List, TYPE_CHECKING, Union,
 )
 
 if TYPE_CHECKING:
@@ -25,10 +26,15 @@ ReturnType = TypeVar("ReturnType")
 logger = logging.getLogger(__name__)
 
 
+def identity(x):
+    return x
+
+
 def databricks_remote_compute(
+    _func: Optional[Callable] = None,
     cluster_id: Optional[str] = None,
     cluster_name: Optional[str] = None,
-    workspace: Optional[Workspace] = None,
+    workspace: Optional[Union[Workspace, str]] = None,
     cluster: Optional["Cluster"] = None,
     timeout: Optional[dt.timedelta] = None,
     env_keys: Optional[List[str]] = None,
@@ -38,6 +44,7 @@ def databricks_remote_compute(
     """Return a decorator that executes functions on a remote cluster.
 
     Args:
+        _func: function to decorate
         cluster_id: Optional cluster id to target.
         cluster_name: Optional cluster name to target.
         workspace: Workspace instance or host string for lookup.
@@ -51,13 +58,19 @@ def databricks_remote_compute(
         A decorator that runs functions on the resolved Databricks cluster.
     """
     if force_local or Workspace.is_in_databricks_environment():
-        …
-        …
+        return identity if _func is None else _func
+
+    if workspace is None:
+        workspace = os.getenv("DATABRICKS_HOST")
 
-    …
+    if workspace is None:
+        return identity if _func is None else _func
 
-    if isinstance(workspace, …
-        …
+    if not isinstance(workspace, Workspace):
+        if isinstance(workspace, str):
+            workspace = Workspace(host=workspace).connect(clone=False)
+        else:
+            raise ValueError("Cannot initialize databricks workspace with %s" % type(workspace))
 
     if cluster is None:
         if cluster_id or cluster_name:
@@ -68,10 +81,14 @@ def databricks_remote_compute(
         else:
             cluster = workspace.clusters().replicated_current_environment(
                 workspace=workspace,
-                cluster_name=cluster_name
+                cluster_name=cluster_name,
+                single_user_name=workspace.current_user.user_name
             )
 
+    cluster.ensure_running(wait_timeout=None)
+
     return cluster.execution_decorator(
+        _func=_func,
         env_keys=env_keys,
         timeout=timeout,
         **options
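Note: `databricks_remote_compute` gains a leading `_func` parameter and an `identity` fallback, so the local short-circuits return the function unchanged. This follows the usual optional-argument decorator pattern; a sketch of the two call forms it enables (whether the bare form is officially supported is an assumption; the cluster name is hypothetical):

```python
# Sketch: the two call forms enabled by the optional leading `_func` argument.
from yggdrasil.databricks.compute.remote import databricks_remote_compute

@databricks_remote_compute          # bare form: my_job is passed as _func
def my_job() -> str:
    return "runs remotely, or locally when force_local / no host is set"

@databricks_remote_compute(cluster_name="shared-etl")   # parametrized form: _func is None
def my_other_job() -> str:
    return "runs on the named cluster"
```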
yggdrasil/databricks/sql/engine.py
CHANGED
@@ -198,8 +198,7 @@ class SQLEngine(WorkspaceService):
         """Short, single-line preview for logs (avoids spewing giant SQL)."""
         if not sql:
             return ""
-        …
-        return one_line[:limit] + ("…" if len(one_line) > limit else "")
+        return sql[:limit] + ("…" if len(sql) > limit else "")
 
     def execute(
         self,
@@ -218,7 +217,6 @@ class SQLEngine(WorkspaceService):
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
         wait_result: bool = True,
-        **kwargs,
     ) -> "StatementResult":
         """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
 
@@ -245,7 +243,6 @@ class SQLEngine(WorkspaceService):
             schema_name: Optional schema override for API engine.
             table_name: Optional table override used when `statement` is None.
             wait_result: Whether to block until completion (API engine).
-            **kwargs: Extra params forwarded to Databricks SDK execute_statement.
 
         Returns:
             StatementResult.
@@ -263,9 +260,12 @@ class SQLEngine(WorkspaceService):
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")
 
-            …
-            …
-            …
+            df: SparkDataFrame = spark_session.sql(statement)
+
+            if row_limit:
+                df = df.limit(row_limit)
+
+            logger.info("Spark SQL executed: %s", self._sql_preview(statement))
 
             # Avoid Disposition dependency if SDK imports are absent
             spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
@@ -287,7 +287,6 @@ class SQLEngine(WorkspaceService):
         if not statement:
             full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
             statement = f"SELECT * FROM {full_name}"
-            logger.debug("Autogenerated statement: %s", self._sql_preview(statement))
 
         if not warehouse_id:
             warehouse_id = self._get_or_default_warehouse_id()
@@ -314,7 +313,11 @@ class SQLEngine(WorkspaceService):
             disposition=disposition,
         )
 
-
+        logger.info(
+            "API SQL executed: %s",
+            self._sql_preview(statement)
+        )
+
         return execution.wait() if wait_result else execution
 
     def spark_table(
@@ -465,15 +468,7 @@ class SQLEngine(WorkspaceService):
             safe_chars=True,
         )
 
-        …
-            "Arrow insert into %s (mode=%s, match_by=%s, zorder_by=%s)",
-            location,
-            mode,
-            match_by,
-            zorder_by,
-        )
-
-        with self as connected:
+        with self.connect() as connected:
             if existing_schema is None:
                 try:
                     existing_schema = connected.get_table_schema(
@@ -482,7 +477,6 @@ class SQLEngine(WorkspaceService):
                         table_name=table_name,
                         to_arrow_schema=True,
                     )
-                    logger.debug("Fetched existing schema for %s (columns=%d)", location, len(existing_schema.names))
                 except ValueError as exc:
                     data_tbl = convert(data, pa.Table)
                     existing_schema = data_tbl.schema
@@ -527,7 +521,20 @@ class SQLEngine(WorkspaceService):
 
             transaction_id = self._random_suffix()
 
-            data_tbl = convert(…
+            data_tbl = convert(
+                data, pa.Table,
+                options=cast_options, target_field=existing_schema
+            )
+            num_rows = data_tbl.num_rows
+
+            logger.debug(
+                "Arrow inserting %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )
 
             # Write in temp volume
             temp_volume_path = connected.dbfs_path(
@@ -545,7 +552,6 @@ class SQLEngine(WorkspaceService):
             statements: list[str] = []
 
             if match_by:
-                logger.info("Using MERGE INTO (match_by=%s)", match_by)
                 on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])
 
                 update_cols = [c for c in columns if c not in match_by]
@@ -588,6 +594,15 @@ FROM parquet.`{temp_volume_path}`"""
             except Exception:
                 logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
 
+            logger.info(
+                "Arrow inserted %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )
+
             if zorder_by:
                 zcols = ", ".join([f"`{c}`" for c in zorder_by])
                 optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
@@ -675,7 +690,6 @@ FROM parquet.`{temp_volume_path}`"""
                 table_name=table_name,
                 to_arrow_schema=False,
             )
-            logger.debug("Fetched destination Spark schema for %s", location)
         except ValueError:
             logger.warning("Destination table missing; creating table %s via overwrite write", location)
             data = convert(data, pyspark.sql.DataFrame)
@@ -704,10 +718,8 @@ FROM parquet.`{temp_volume_path}`"""
 
         if match_by:
             cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
-            logger.info("Running Delta MERGE (cond=%s)", cond)
 
             if mode.casefold() == "overwrite":
-                logger.info("Overwrite-by-key mode: delete matching keys then append")
                 data = data.cache()
                 distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()
 
@@ -815,6 +827,7 @@ FROM parquet.`{temp_volume_path}`"""
         optimize_write: bool = True,
         auto_compact: bool = True,
         execute: bool = True,
+        wait_result: bool = True
     ) -> Union[str, "StatementResult"]:
         """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.
 
@@ -832,6 +845,7 @@ FROM parquet.`{temp_volume_path}`"""
             optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
             auto_compact: Sets delta.autoOptimize.autoCompact table property.
             execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
+            wait_result: Waits execution to complete
 
         Returns:
             StatementResult if execute=True, else the DDL SQL string.
@@ -897,11 +911,13 @@ FROM parquet.`{temp_volume_path}`"""
 
         statement = "\n".join(sql)
 
-        logger.…
-        …
+        logger.debug(
+            "Generated CREATE TABLE DDL for %s:\n%s",
+            location, statement
+        )
 
         if execute:
-            return self.execute(statement)
+            return self.execute(statement, wait_result=wait_result)
         return statement
 
     def _check_location_params(
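Note: `_sql_preview` now truncates the raw SQL instead of first collapsing it to one line (the old `one_line` intermediate is gone, though the docstring still says "single-line"). A standalone version of the new logic:

```python
def sql_preview(sql: str, limit: int = 120) -> str:
    # Mirrors the new _sql_preview: truncate to `limit` chars and append an
    # ellipsis when truncated. Multi-line SQL is no longer collapsed first.
    if not sql:
        return ""
    return sql[:limit] + ("…" if len(sql) > limit else "")

print(sql_preview("SELECT *\nFROM t WHERE x > 0" * 20))  # 120 chars + "…"
```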
yggdrasil/databricks/sql/statement_result.py
CHANGED
@@ -44,6 +44,11 @@ if TYPE_CHECKING:
     from .engine import SQLEngine
 
 
+DONE_STATES = {
+    StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED,
+    StatementState.SUCCEEDED
+}
+
 __all__ = [
     "StatementResult"
 ]
@@ -57,7 +62,6 @@ class StatementResult:
     disposition: "Disposition"
 
     _response: Optional[StatementResponse] = dataclasses.field(default=None, repr=False)
-    _response_refresh_time: float = dataclasses.field(default=0, repr=False)
 
     _spark_df: Optional[SparkDataFrame] = dataclasses.field(default=None, repr=False)
     _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
@@ -101,8 +105,35 @@ class StatementResult:
         Returns:
             The current StatementResponse object.
         """
-        if self.…
-            …
+        if self.is_spark_sql:
+            return StatementResponse(
+                statement_id=self.statement_id or "sparksql",
+                status=StatementStatus(
+                    state=StatementState.SUCCEEDED
+                )
+            )
+        elif not self.statement_id:
+            return StatementResponse(
+                statement_id="unknown",
+                status=StatementStatus(
+                    state=StatementState.PENDING
+                )
+            )
+
+        statement_execution = self.workspace.sdk().statement_execution
+
+        if self._response is None:
+            self._response = (
+                statement_execution
+                .get_statement(self.statement_id)
+            )
+
+        if self._response.status.state not in DONE_STATES:
+            self._response = (
+                statement_execution
+                .get_statement(self.statement_id)
+            )
+
         return self._response
 
     @response.setter
@@ -113,27 +144,8 @@ class StatementResult:
             value: StatementResponse to cache.
         """
         self._response = value
-        self._response_refresh_time = time.time()
-
         self.statement_id = self._response.statement_id
 
-    def fresh_response(self, delay: float):
-        """Refresh the response if it is older than ``delay`` seconds.
-
-        Args:
-            delay: Minimum age in seconds before refreshing.
-
-        Returns:
-            The refreshed StatementResponse object.
-        """
-        if self.is_spark_sql:
-            return self._response
-
-        if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
-            self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
-
-        return self._response
-
     def result_data_at(self, chunk_index: int):
         """Fetch a specific result chunk by index.
 
@@ -166,17 +178,7 @@ class StatementResult:
         Returns:
             A StatementStatus object.
         """
-        …
-            return StatementStatus(
-                state=StatementState.SUCCEEDED
-            )
-
-        if not self.statement_id:
-            return StatementStatus(
-                state=StatementState.PENDING
-            )
-
-        return self.fresh_response(delay=1).status
+        return self.response.status
 
     @property
     def state(self):
@@ -194,8 +196,6 @@ class StatementResult:
         Returns:
             The result manifest or None for Spark SQL results.
         """
-        if self.is_spark_sql:
-            return None
         return self.response.manifest
 
     @property
@@ -214,15 +214,7 @@ class StatementResult:
         Returns:
             True if the statement is done, otherwise False.
         """
-        …
-            return True
-
-        if self._response is None:
-            return False
-
-        return self._response.status.state in [
-            StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED, StatementState.SUCCEEDED
-        ]
+        return self.state in DONE_STATES
 
     @property
     def failed(self):
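Note: `response` now re-fetches from the Statement Execution API only while the cached state is non-terminal, replacing the old time-based `fresh_response(delay)`. A standalone illustration of the terminal-state check (assuming the usual databricks-sdk import path; `DONE_STATES` is copied from the diff above):

```python
from databricks.sdk.service.sql import StatementState

DONE_STATES = {
    StatementState.CANCELED, StatementState.CLOSED,
    StatementState.FAILED, StatementState.SUCCEEDED,
}

def needs_refresh(state: StatementState) -> bool:
    # Poll the Statement Execution API only for non-terminal statements
    return state not in DONE_STATES

print(needs_refresh(StatementState.RUNNING))    # True
print(needs_refresh(StatementState.SUCCEEDED))  # False
```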
yggdrasil/pyutils/equality.py
ADDED
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import math
+from typing import Any, Dict, Iterable, Tuple
+
+_MISSING = object()
+
+
+__all__ = [
+    "dicts_equal",
+    "dict_diff"
+]
+
+
+def _normalize(obj: Any) -> Any:
+    """
+    Normalize nested structures so equality is stable:
+      - dict: sort keys + normalize values
+      - list/tuple: normalize items (keeps order)
+      - set: sort normalized items (orderless)
+      - float: keep as float (handled separately for tolerance)
+    """
+    if isinstance(obj, dict):
+        return {k: _normalize(obj[k]) for k in sorted(obj.keys())}
+    if isinstance(obj, (list, tuple)):
+        return [_normalize(x) for x in obj]
+    if isinstance(obj, set):
+        return sorted(_normalize(x) for x in obj)
+    return obj
+
+def _equal(a: Any, b: Any, float_tol: float = 0.0) -> bool:
+    # Float tolerance (optional)
+    if isinstance(a, float) or isinstance(b, float):
+        if a is None or b is None:
+            return a is b
+        try:
+            return math.isclose(float(a), float(b), rel_tol=float_tol, abs_tol=float_tol)
+        except Exception:
+            pass
+
+    # Deep normalize compare for dict/list/set
+    return _normalize(a) == _normalize(b)
+
+def dicts_equal(
+    a: Dict[str, Any],
+    b: Dict[str, Any],
+    *,
+    keys: Iterable[str] | None = None,
+    treat_missing_as_none: bool = True,
+    float_tol: float = 0.0,
+) -> bool:
+    """
+    Equality check for two dicts with options:
+      - keys: only compare these keys
+      - treat_missing_as_none: missing key == None if other side is None
+      - float_tol: tolerance for float comparisons
+    """
+    if keys is None:
+        keys = set(a.keys()) | set(b.keys())
+
+    for k in keys:
+        av = a.get(k, _MISSING)
+        bv = b.get(k, _MISSING)
+
+        if treat_missing_as_none:
+            if av is _MISSING and bv is None:
+                continue
+            if bv is _MISSING and av is None:
+                continue
+            if av is _MISSING and bv is _MISSING:
+                continue
+
+        if not _equal(av, bv, float_tol=float_tol):
+            return False
+
+    return True
+
+def dict_diff(
+    a: Dict[str, Any],
+    b: Dict[str, Any],
+    *,
+    keys: Iterable[str] | None = None,
+    treat_missing_as_none: bool = True,
+    float_tol: float = 0.0,
+) -> Dict[str, Tuple[Any, Any]]:
+    """
+    Returns {key: (a_val, b_val)} for all keys that differ.
+    """
+    if keys is None:
+        keys = set(a.keys()) | set(b.keys())
+
+    out: Dict[str, Tuple[Any, Any]] = {}
+    for k in keys:
+        av = a.get(k, _MISSING)
+        bv = b.get(k, _MISSING)
+
+        if treat_missing_as_none:
+            if av is _MISSING and bv is None:
+                continue
+            if bv is _MISSING and av is None:
+                continue
+            if av is _MISSING and bv is _MISSING:
+                continue
+
+        if not _equal(av, bv, float_tol=float_tol):
+            out[k] = (None if av is _MISSING else av, None if bv is _MISSING else bv)
+    return out
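For reference, the new helpers compare dicts key-by-key with optional key restriction, missing-vs-None equivalence, and float tolerance. Example usage (values are illustrative):

```python
from yggdrasil.pyutils.equality import dicts_equal, dict_diff

a = {"workers": 2, "tags": {"env": "dev"}, "timeout": None}
b = {"workers": 2.0000001, "tags": {"env": "dev"}}  # "timeout" missing

# float_tol absorbs the tiny numeric drift; the missing "timeout" key on b
# matches a's explicit None because treat_missing_as_none=True by default.
print(dicts_equal(a, b, float_tol=1e-3))   # True

b["tags"] = {"env": "prod"}
print(dict_diff(a, b, float_tol=1e-3))     # {'tags': ({'env': 'dev'}, {'env': 'prod'})}
```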
yggdrasil/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.34"
+__version__ = "0.1.37"
{ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/WHEEL
File without changes
{ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/entry_points.txt
File without changes
{ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/licenses/LICENSE
File without changes
{ygg-0.1.34.dist-info → ygg-0.1.37.dist-info}/top_level.txt
File without changes