ygg 0.1.34__tar.gz → 0.1.38__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.34 → ygg-0.1.38}/PKG-INFO +1 -1
- {ygg-0.1.34 → ygg-0.1.38}/pyproject.toml +1 -1
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/PKG-INFO +1 -1
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/SOURCES.txt +1 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/cluster.py +48 -17
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/execution_context.py +2 -2
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/remote.py +25 -8
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/engine.py +43 -28
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/statement_result.py +36 -51
- ygg-0.1.38/src/yggdrasil/pyutils/equality.py +107 -0
- ygg-0.1.38/src/yggdrasil/version.py +1 -0
- ygg-0.1.34/src/yggdrasil/version.py +0 -1
- {ygg-0.1.34 → ygg-0.1.38}/LICENSE +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/README.md +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/setup.cfg +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/dependency_links.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/entry_points.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/requires.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/top_level.txt +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/jobs/config.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/exceptions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/types.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/io.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/path.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/path_kind.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/workspaces/workspace.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/dataclasses/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/dataclasses/dataclass.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/databrickslib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/pandaslib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/polarslib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/libs/sparklib.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/callable_serde.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/exceptions.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/modules.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/parallel.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/python_env.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/pyutils/retry.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/msal.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/requests/session.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/__init__.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/cast_options.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/polars_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/registry.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/libs.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/python_arrow.py +0 -0
- {ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/types/python_defaults.py +0 -0

{ygg-0.1.34 → ygg-0.1.38}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ygg"
-version = "0.1.34"
+version = "0.1.38"
 description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }

{ygg-0.1.34 → ygg-0.1.38}/src/ygg.egg-info/SOURCES.txt
@@ -39,6 +39,7 @@ src/yggdrasil/libs/extensions/polars_extensions.py
 src/yggdrasil/libs/extensions/spark_extensions.py
 src/yggdrasil/pyutils/__init__.py
 src/yggdrasil/pyutils/callable_serde.py
+src/yggdrasil/pyutils/equality.py
 src/yggdrasil/pyutils/exceptions.py
 src/yggdrasil/pyutils/expiring_dict.py
 src/yggdrasil/pyutils/modules.py

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/cluster.py
@@ -24,6 +24,7 @@ from .execution_context import ExecutionContext
 from ..workspaces.workspace import WorkspaceService, Workspace
 from ... import retry, CallableSerde
 from ...libs.databrickslib import databricks_sdk
+from ...pyutils.equality import dicts_equal, dict_diff
 from ...pyutils.expiring_dict import ExpiringDict
 from ...pyutils.modules import PipIndexSettings
 from ...pyutils.python_env import PythonEnv
@@ -110,7 +111,7 @@ class Cluster(WorkspaceService):
 
     _details: Optional["ClusterDetails"] = dataclasses.field(default=None, repr=False)
     _details_refresh_time: float = dataclasses.field(default=0, repr=False)
-    _system_context: Optional[ExecutionContext] = None
+    _system_context: Optional[ExecutionContext] = dataclasses.field(default=None, repr=False)
 
     # host → Cluster instance
     _env_clusters: ClassVar[Dict[str, "Cluster"]] = {}
@@ -309,6 +310,11 @@ class Cluster(WorkspaceService):
             self.details = self.clusters_client().get(cluster_id=self.cluster_id)
         return self._details
 
+    def refresh(self, max_delay: float | None = None):
+        self.details = self.fresh_details(max_delay=max_delay)
+
+        return self
+
     @details.setter
     def details(self, value: "ClusterDetails"):
         """Cache cluster details and update identifiers."""
@@ -321,10 +327,10 @@ class Cluster(WorkspaceService):
     @property
     def state(self):
         """Return the current cluster state."""
-
+        self.refresh()
 
-        if
-            return
+        if self._details is not None:
+            return self._details.state
         return State.UNKNOWN
 
     @property
@@ -355,7 +361,7 @@ class Cluster(WorkspaceService):
     def wait_for_status(
         self,
         tick: float = 0.5,
-        timeout: float = 600,
+        timeout: Union[float, dt.timedelta] = 600,
         backoff: int = 2,
         max_sleep_time: float = 15
     ):
@@ -373,6 +379,9 @@ class Cluster(WorkspaceService):
         start = time.time()
         sleep_time = tick
 
+        if isinstance(timeout, dt.timedelta):
+            timeout = timeout.total_seconds()
+
         while self.is_pending:
             time.sleep(sleep_time)
 
@@ -658,8 +667,6 @@ class Cluster(WorkspaceService):
         Returns:
             The updated Cluster instance.
         """
-        self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
-
         existing_details = {
             k: v
             for k, v in self.details.as_shallow_dict().items()
@@ -672,22 +679,36 @@ class Cluster(WorkspaceService):
             if k in _EDIT_ARG_NAMES
         }
 
-
+        same = dicts_equal(
+            existing_details,
+            update_details,
+            keys=_EDIT_ARG_NAMES,
+            treat_missing_as_none=True,
+            float_tol=0.0,  # set e.g. 1e-6 if you have float-y stuff
+        )
+
+        if not same:
+            diff = {
+                k: v[1]
+                for k, v in dict_diff(existing_details, update_details, keys=_EDIT_ARG_NAMES).items()
+            }
+
             logger.debug(
                 "Updating %s with %s",
-                self,
+                self, diff
             )
 
             self.wait_for_status()
-            self.details =
-
-            )(**update_details)
+            self.details = self.clusters_client().edit(**update_details)
+            self.wait_for_status()
 
             logger.info(
                 "Updated %s",
                 self
             )
 
+        self.install_libraries(libraries=libraries, wait_timeout=None, raise_error=False)
+
         return self
 
     def list_clusters(self) -> Iterator["Cluster"]:
@@ -742,7 +763,10 @@ class Cluster(WorkspaceService):
             return None
 
         return Cluster(
-            workspace=self.workspace,
+            workspace=self.workspace,
+            cluster_id=details.cluster_id,
+            cluster_name=details.cluster_name,
+            _details=details
         )
 
         for cluster in self.list_clusters():
@@ -760,16 +784,18 @@ class Cluster(WorkspaceService):
 
     def ensure_running(
         self,
+        wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
     ) -> "Cluster":
         """Ensure the cluster is running.
 
         Returns:
             The current Cluster instance.
         """
-        return self.start()
+        return self.start(wait_timeout=wait_timeout)
 
     def start(
         self,
+        wait_timeout: Optional[dt.timedelta] = dt.timedelta(minutes=20)
     ) -> "Cluster":
         """Start the cluster if it is not already running.
 
@@ -780,8 +806,13 @@ class Cluster(WorkspaceService):
 
         if not self.is_running:
             logger.info("Starting %s", self)
-
-
+
+            if wait_timeout:
+                self.clusters_client().start(cluster_id=self.cluster_id)
+                self.wait_for_status(timeout=wait_timeout.total_seconds())
+                self.wait_installed_libraries(timeout=wait_timeout)
+            else:
+                self.clusters_client().start(cluster_id=self.cluster_id)
 
         return self
 
@@ -1124,7 +1155,7 @@ class Cluster(WorkspaceService):
                 "Waiting %s to install libraries timed out" % self
             )
 
-            time.sleep(
+            time.sleep(5)
             statuses = list(self.installed_library_statuses())
 
         return self
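
The cluster.py hunks above add a `refresh()` helper, let `wait_for_status` take either seconds or a `datetime.timedelta`, and give `start()`/`ensure_running()` a `wait_timeout` that also waits for library installation. A minimal usage sketch based only on those signatures; how the `Cluster` is obtained (the `workspace` object and cluster name here) is an assumption, not shown in this diff:

```python
import datetime as dt

from yggdrasil.databricks.compute.cluster import Cluster

# Assumed setup: `workspace` is a connected Workspace; the cluster name is illustrative.
cluster = Cluster(workspace=workspace, cluster_name="my-cluster")

# Start and block until RUNNING, then wait for library installs (default is 20 minutes).
cluster.ensure_running(wait_timeout=dt.timedelta(minutes=20))

# Fire-and-forget start: no status or library wait.
cluster.start(wait_timeout=None)

# timeout may now be a timedelta as well as a float number of seconds.
cluster.wait_for_status(timeout=dt.timedelta(minutes=5))

# refresh() re-reads cluster details and returns the instance, so it chains.
current_state = cluster.refresh().state
```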

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/execution_context.py
@@ -78,8 +78,8 @@ class ExecutionContext:
     language: Optional["Language"] = None
     context_id: Optional[str] = None
 
-    _was_connected: Optional[bool] = None
-    _remote_metadata: Optional[RemoteMetadata] = None
+    _was_connected: Optional[bool] = dc.field(default=None, repr=False)
+    _remote_metadata: Optional[RemoteMetadata] = dc.field(default=None, repr=False)
 
     _lock: threading.RLock = dc.field(default_factory=threading.RLock, init=False, repr=False)
 

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/compute/remote.py
@@ -2,11 +2,12 @@
 
 import datetime as dt
 import logging
+import os
 from typing import (
     Callable,
     Optional,
     TypeVar,
-    List, TYPE_CHECKING,
+    List, TYPE_CHECKING, Union,
 )
 
 if TYPE_CHECKING:
@@ -25,10 +26,15 @@ ReturnType = TypeVar("ReturnType")
 logger = logging.getLogger(__name__)
 
 
+def identity(x):
+    return x
+
+
 def databricks_remote_compute(
+    _func: Optional[Callable] = None,
     cluster_id: Optional[str] = None,
     cluster_name: Optional[str] = None,
-    workspace: Optional[Workspace] = None,
+    workspace: Optional[Union[Workspace, str]] = None,
     cluster: Optional["Cluster"] = None,
     timeout: Optional[dt.timedelta] = None,
     env_keys: Optional[List[str]] = None,
@@ -38,6 +44,7 @@ def databricks_remote_compute(
     """Return a decorator that executes functions on a remote cluster.
 
     Args:
+        _func: function to decorate
         cluster_id: Optional cluster id to target.
         cluster_name: Optional cluster name to target.
         workspace: Workspace instance or host string for lookup.
@@ -51,13 +58,19 @@ def databricks_remote_compute(
         A decorator that runs functions on the resolved Databricks cluster.
     """
     if force_local or Workspace.is_in_databricks_environment():
-
-
+        return identity if _func is None else _func
+
+    if workspace is None:
+        workspace = os.getenv("DATABRICKS_HOST")
 
-
+    if workspace is None:
+        return identity if _func is None else _func
 
-    if isinstance(workspace,
-
+    if not isinstance(workspace, Workspace):
+        if isinstance(workspace, str):
+            workspace = Workspace(host=workspace).connect(clone=False)
+        else:
+            raise ValueError("Cannot initialize databricks workspace with %s" % type(workspace))
 
     if cluster is None:
         if cluster_id or cluster_name:
@@ -68,10 +81,14 @@ def databricks_remote_compute(
         else:
             cluster = workspace.clusters().replicated_current_environment(
                 workspace=workspace,
-                cluster_name=cluster_name
+                cluster_name=cluster_name,
+                single_user_name=workspace.current_user.user_name
             )
 
+    cluster.ensure_running(wait_timeout=None)
+
     return cluster.execution_decorator(
+        _func=_func,
         env_keys=env_keys,
         timeout=timeout,
         **options
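
The remote.py changes add a leading `_func` parameter, an `identity` fallback, and acceptance of a host string (or the `DATABRICKS_HOST` variable) in place of a `Workspace`. That suggests the decorator works both bare and parametrized; a hedged sketch, with the decorated bodies purely illustrative:

```python
from yggdrasil.databricks.compute.remote import databricks_remote_compute

# Bare form: the function lands in _func; inside Databricks, with force_local,
# or when no workspace/DATABRICKS_HOST can be resolved, it simply runs locally.
@databricks_remote_compute
def add(a: int, b: int) -> int:
    return a + b

# Parametrized form: workspace may be a Workspace instance or a host string.
@databricks_remote_compute(
    cluster_name="dev-cluster",
    workspace="https://adb-1234567890123456.7.azuredatabricks.net",  # illustrative host
)
def heavy_job(n: int) -> int:
    return sum(range(n))
```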

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/engine.py
@@ -198,8 +198,7 @@ class SQLEngine(WorkspaceService):
         """Short, single-line preview for logs (avoids spewing giant SQL)."""
         if not sql:
             return ""
-
-        return one_line[:limit] + ("…" if len(one_line) > limit else "")
+        return sql[:limit] + ("…" if len(sql) > limit else "")
 
     def execute(
         self,
@@ -218,7 +217,6 @@ class SQLEngine(WorkspaceService):
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
         wait_result: bool = True,
-        **kwargs,
     ) -> "StatementResult":
         """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
 
@@ -245,7 +243,6 @@ class SQLEngine(WorkspaceService):
             schema_name: Optional schema override for API engine.
             table_name: Optional table override used when `statement` is None.
             wait_result: Whether to block until completion (API engine).
-            **kwargs: Extra params forwarded to Databricks SDK execute_statement.
 
         Returns:
             StatementResult.
@@ -263,9 +260,12 @@ class SQLEngine(WorkspaceService):
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")
 
-
-
-
+            df: SparkDataFrame = spark_session.sql(statement)
+
+            if row_limit:
+                df = df.limit(row_limit)
+
+            logger.info("Spark SQL executed: %s", self._sql_preview(statement))
 
             # Avoid Disposition dependency if SDK imports are absent
             spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
@@ -287,7 +287,6 @@ class SQLEngine(WorkspaceService):
         if not statement:
             full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
             statement = f"SELECT * FROM {full_name}"
-            logger.debug("Autogenerated statement: %s", self._sql_preview(statement))
 
         if not warehouse_id:
             warehouse_id = self._get_or_default_warehouse_id()
@@ -310,11 +309,14 @@ class SQLEngine(WorkspaceService):
             engine=self,
             statement_id=response.statement_id,
             _response=response,
-            _response_refresh_time=time.time(),
             disposition=disposition,
         )
 
-
+        logger.info(
+            "API SQL executed: %s",
+            self._sql_preview(statement)
+        )
+
         return execution.wait() if wait_result else execution
 
     def spark_table(
@@ -465,15 +467,7 @@ class SQLEngine(WorkspaceService):
             safe_chars=True,
         )
 
-
-            "Arrow insert into %s (mode=%s, match_by=%s, zorder_by=%s)",
-            location,
-            mode,
-            match_by,
-            zorder_by,
-        )
-
-        with self as connected:
+        with self.connect() as connected:
             if existing_schema is None:
                 try:
                     existing_schema = connected.get_table_schema(
@@ -482,7 +476,6 @@ class SQLEngine(WorkspaceService):
                         table_name=table_name,
                         to_arrow_schema=True,
                     )
-                    logger.debug("Fetched existing schema for %s (columns=%d)", location, len(existing_schema.names))
                 except ValueError as exc:
                     data_tbl = convert(data, pa.Table)
                     existing_schema = data_tbl.schema
@@ -527,7 +520,20 @@ class SQLEngine(WorkspaceService):
 
             transaction_id = self._random_suffix()
 
-            data_tbl = convert(
+            data_tbl = convert(
+                data, pa.Table,
+                options=cast_options, target_field=existing_schema
+            )
+            num_rows = data_tbl.num_rows
+
+            logger.debug(
+                "Arrow inserting %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )
 
             # Write in temp volume
             temp_volume_path = connected.dbfs_path(
@@ -545,7 +551,6 @@ class SQLEngine(WorkspaceService):
             statements: list[str] = []
 
             if match_by:
-                logger.info("Using MERGE INTO (match_by=%s)", match_by)
                 on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])
 
                 update_cols = [c for c in columns if c not in match_by]
@@ -588,6 +593,15 @@ FROM parquet.`{temp_volume_path}`"""
             except Exception:
                 logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
 
+            logger.info(
+                "Arrow inserted %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )
+
             if zorder_by:
                 zcols = ", ".join([f"`{c}`" for c in zorder_by])
                 optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
@@ -675,7 +689,6 @@ FROM parquet.`{temp_volume_path}`"""
                 table_name=table_name,
                 to_arrow_schema=False,
             )
-            logger.debug("Fetched destination Spark schema for %s", location)
         except ValueError:
             logger.warning("Destination table missing; creating table %s via overwrite write", location)
             data = convert(data, pyspark.sql.DataFrame)
@@ -704,10 +717,8 @@ FROM parquet.`{temp_volume_path}`"""
 
         if match_by:
             cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
-            logger.info("Running Delta MERGE (cond=%s)", cond)
 
             if mode.casefold() == "overwrite":
-                logger.info("Overwrite-by-key mode: delete matching keys then append")
                 data = data.cache()
                 distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()
 
@@ -815,6 +826,7 @@ FROM parquet.`{temp_volume_path}`"""
         optimize_write: bool = True,
         auto_compact: bool = True,
         execute: bool = True,
+        wait_result: bool = True
     ) -> Union[str, "StatementResult"]:
         """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.
 
@@ -832,6 +844,7 @@ FROM parquet.`{temp_volume_path}`"""
             optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
             auto_compact: Sets delta.autoOptimize.autoCompact table property.
             execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
+            wait_result: Waits execution to complete
 
         Returns:
             StatementResult if execute=True, else the DDL SQL string.
@@ -897,11 +910,13 @@ FROM parquet.`{temp_volume_path}`"""
 
         statement = "\n".join(sql)
 
-        logger.
-
+        logger.debug(
+            "Generated CREATE TABLE DDL for %s:\n%s",
+            location, statement
+        )
 
         if execute:
-            return self.execute(statement)
+            return self.execute(statement, wait_result=wait_result)
         return statement
 
     def _check_location_params(
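
With `**kwargs` removed, `execute()` keeps an explicit `wait_result` flag, and the CREATE TABLE helper now forwards one of its own. A small sketch of the non-blocking path; `engine` is assumed to be an already-connected `SQLEngine`, and the table name is illustrative:

```python
# Assumed: `engine` is a connected SQLEngine instance.
result = engine.execute(
    "SELECT * FROM main.analytics.events",
    wait_result=False,   # return the StatementResult without blocking
)

# Block later; execute() itself calls .wait() when wait_result=True.
result = result.wait()
```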

{ygg-0.1.34 → ygg-0.1.38}/src/yggdrasil/databricks/sql/statement_result.py
@@ -44,6 +44,15 @@ if TYPE_CHECKING:
     from .engine import SQLEngine
 
 
+DONE_STATES = {
+    StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED,
+    StatementState.SUCCEEDED
+}
+
+FAILED_STATES = {
+    StatementState.FAILED, StatementState.CANCELED
+}
+
 __all__ = [
     "StatementResult"
 ]
@@ -57,7 +66,6 @@ class StatementResult:
     disposition: "Disposition"
 
     _response: Optional[StatementResponse] = dataclasses.field(default=None, repr=False)
-    _response_refresh_time: float = dataclasses.field(default=0, repr=False)
 
     _spark_df: Optional[SparkDataFrame] = dataclasses.field(default=None, repr=False)
     _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
@@ -101,8 +109,30 @@ class StatementResult:
         Returns:
             The current StatementResponse object.
         """
-        if self.
-
+        if self.is_spark_sql:
+            return StatementResponse(
+                statement_id=self.statement_id or "sparksql",
+                status=StatementStatus(
+                    state=StatementState.SUCCEEDED
+                )
+            )
+        elif not self.statement_id:
+            return StatementResponse(
+                statement_id="unknown",
+                status=StatementStatus(
+                    state=StatementState.PENDING
+                )
+            )
+
+        statement_execution = self.workspace.sdk().statement_execution
+
+        if self._response is None:
+            # Initialize
+            self._response = statement_execution.get_statement(self.statement_id)
+        elif self._response.status.state not in DONE_STATES:
+            # Refresh
+            self._response = statement_execution.get_statement(self.statement_id)
+
         return self._response
 
     @response.setter
@@ -113,27 +143,8 @@ class StatementResult:
             value: StatementResponse to cache.
         """
         self._response = value
-        self._response_refresh_time = time.time()
-
         self.statement_id = self._response.statement_id
 
-    def fresh_response(self, delay: float):
-        """Refresh the response if it is older than ``delay`` seconds.
-
-        Args:
-            delay: Minimum age in seconds before refreshing.
-
-        Returns:
-            The refreshed StatementResponse object.
-        """
-        if self.is_spark_sql:
-            return self._response
-
-        if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
-            self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
-
-        return self._response
-
     def result_data_at(self, chunk_index: int):
         """Fetch a specific result chunk by index.
 
@@ -166,17 +177,7 @@ class StatementResult:
         Returns:
             A StatementStatus object.
         """
-
-            return StatementStatus(
-                state=StatementState.SUCCEEDED
-            )
-
-        if not self.statement_id:
-            return StatementStatus(
-                state=StatementState.PENDING
-            )
-
-        return self.fresh_response(delay=1).status
+        return self.response.status
 
     @property
     def state(self):
@@ -194,8 +195,6 @@ class StatementResult:
        Returns:
            The result manifest or None for Spark SQL results.
        """
-        if self.is_spark_sql:
-            return None
         return self.response.manifest
 
     @property
@@ -214,15 +213,7 @@ class StatementResult:
        Returns:
            True if the statement is done, otherwise False.
        """
-
-            return True
-
-        if self._response is None:
-            return False
-
-        return self._response.status.state in [
-            StatementState.CANCELED, StatementState.CLOSED, StatementState.FAILED, StatementState.SUCCEEDED
-        ]
+        return self.state in DONE_STATES
 
     @property
     def failed(self):
@@ -231,13 +222,7 @@ class StatementResult:
        Returns:
            True if the statement failed or was cancelled.
        """
-
-            return True
-
-        if self._response is None:
-            return False
-
-        return self._response.status.state in [StatementState.CANCELED, StatementState.FAILED]
+        return self.state in FAILED_STATES
 
     @property
     def persisted(self):
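
statement_result.py now derives `done` and `failed` from the module-level `DONE_STATES` and `FAILED_STATES` sets, and `response` refreshes itself only while the statement is not yet in a terminal state. A short sketch of checking a result; `result` is assumed to come from `SQLEngine.execute`:

```python
from yggdrasil.databricks.sql.statement_result import DONE_STATES, FAILED_STATES

# Assumed: `result` is a StatementResult returned by SQLEngine.execute(...).
if result.done:        # equivalent to: result.state in DONE_STATES
    if result.failed:  # equivalent to: result.state in FAILED_STATES
        raise RuntimeError(f"Statement {result.statement_id} failed")
```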

ygg-0.1.38/src/yggdrasil/pyutils/equality.py (new file)
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import math
+from typing import Any, Dict, Iterable, Tuple
+
+_MISSING = object()
+
+
+__all__ = [
+    "dicts_equal",
+    "dict_diff"
+]
+
+
+def _normalize(obj: Any) -> Any:
+    """
+    Normalize nested structures so equality is stable:
+      - dict: sort keys + normalize values
+      - list/tuple: normalize items (keeps order)
+      - set: sort normalized items (orderless)
+      - float: keep as float (handled separately for tolerance)
+    """
+    if isinstance(obj, dict):
+        return {k: _normalize(obj[k]) for k in sorted(obj.keys())}
+    if isinstance(obj, (list, tuple)):
+        return [_normalize(x) for x in obj]
+    if isinstance(obj, set):
+        return sorted(_normalize(x) for x in obj)
+    return obj
+
+def _equal(a: Any, b: Any, float_tol: float = 0.0) -> bool:
+    # Float tolerance (optional)
+    if isinstance(a, float) or isinstance(b, float):
+        if a is None or b is None:
+            return a is b
+        try:
+            return math.isclose(float(a), float(b), rel_tol=float_tol, abs_tol=float_tol)
+        except Exception:
+            pass
+
+    # Deep normalize compare for dict/list/set
+    return _normalize(a) == _normalize(b)
+
+def dicts_equal(
+    a: Dict[str, Any],
+    b: Dict[str, Any],
+    *,
+    keys: Iterable[str] | None = None,
+    treat_missing_as_none: bool = True,
+    float_tol: float = 0.0,
+) -> bool:
+    """
+    Equality check for two dicts with options:
+      - keys: only compare these keys
+      - treat_missing_as_none: missing key == None if other side is None
+      - float_tol: tolerance for float comparisons
+    """
+    if keys is None:
+        keys = set(a.keys()) | set(b.keys())
+
+    for k in keys:
+        av = a.get(k, _MISSING)
+        bv = b.get(k, _MISSING)
+
+        if treat_missing_as_none:
+            if av is _MISSING and bv is None:
+                continue
+            if bv is _MISSING and av is None:
+                continue
+            if av is _MISSING and bv is _MISSING:
+                continue
+
+        if not _equal(av, bv, float_tol=float_tol):
+            return False
+
+    return True
+
+def dict_diff(
+    a: Dict[str, Any],
+    b: Dict[str, Any],
+    *,
+    keys: Iterable[str] | None = None,
+    treat_missing_as_none: bool = True,
+    float_tol: float = 0.0,
+) -> Dict[str, Tuple[Any, Any]]:
+    """
+    Returns {key: (a_val, b_val)} for all keys that differ.
+    """
+    if keys is None:
+        keys = set(a.keys()) | set(b.keys())
+
+    out: Dict[str, Tuple[Any, Any]] = {}
+    for k in keys:
+        av = a.get(k, _MISSING)
+        bv = b.get(k, _MISSING)
+
+        if treat_missing_as_none:
+            if av is _MISSING and bv is None:
+                continue
+            if bv is _MISSING and av is None:
+                continue
+            if av is _MISSING and bv is _MISSING:
+                continue
+
+        if not _equal(av, bv, float_tol=float_tol):
+            out[k] = (None if av is _MISSING else av, None if bv is _MISSING else bv)
+    return out
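
The new pyutils/equality.py module backs the `dicts_equal`/`dict_diff` check added to cluster.py above. A usage example built directly on the functions in this file; the cluster-style keys are illustrative:

```python
from yggdrasil.pyutils.equality import dicts_equal, dict_diff

existing = {"num_workers": 2, "spark_version": "14.3.x-scala2.12", "autotermination_minutes": None}
desired = {"num_workers": 4, "spark_version": "14.3.x-scala2.12"}

# A key missing on one side compares equal to None on the other (treat_missing_as_none=True).
dicts_equal(existing, desired)                          # False (num_workers differs)
dicts_equal(existing, desired, keys=["spark_version"])  # True

# dict_diff returns {key: (a_val, b_val)} for every key that differs.
dict_diff(existing, desired)                            # {"num_workers": (2, 4)}
```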

ygg-0.1.38/src/yggdrasil/version.py (new file)
@@ -0,0 +1 @@
+__version__ = "0.1.38"

ygg-0.1.34/src/yggdrasil/version.py (deleted)
@@ -1 +0,0 @@
-__version__ = "0.1.34"