ygg 0.1.57__py3-none-any.whl → 0.1.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/METADATA +2 -2
- ygg-0.1.64.dist-info/RECORD +74 -0
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +87 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +19 -0
- yggdrasil/databricks/compute/execution_context.py +491 -282
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +178 -178
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +339 -92
- yggdrasil/databricks/workspaces/io.py +185 -40
- yggdrasil/databricks/workspaces/path.py +114 -100
- yggdrasil/databricks/workspaces/workspace.py +210 -61
- yggdrasil/exceptions.py +7 -0
- yggdrasil/libs/databrickslib.py +22 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -2
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +6 -7
- yggdrasil/pyutils/python_env.py +16 -21
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/requests/msal.py +9 -96
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/types/file_format.py +6 -2
- yggdrasil/types/python_defaults.py +92 -76
- yggdrasil/version.py +1 -1
- ygg-0.1.57.dist-info/RECORD +0 -66
- yggdrasil/databricks/ai/loki.py +0 -53
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/WHEEL +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/top_level.txt +0 -0
- /yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0
yggdrasil/databricks/workspaces/io.py

@@ -2,27 +2,26 @@
 
 import base64
 import io
+import logging
+import os
 import time
 from abc import ABC, abstractmethod
+from tempfile import SpooledTemporaryFile
 from threading import Thread
-from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
+from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union, Any, BinaryIO
 
+import dill
 import pyarrow as pa
 import pyarrow.csv as pcsv
 import pyarrow.parquet as pq
-from pyarrow.dataset import (
-    FileFormat,
-    ParquetFileFormat,
-    CsvFileFormat,
-)
 
 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
 from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
-from ...pyutils import retry
+from ...pyutils.retry import retry
 from ...types.cast.registry import convert
-from ...types.file_format import ExcelFileFormat
+from ...types.file_format import FileFormat, ParquetFileFormat, CsvFileFormat, ExcelFileFormat
 
 if databricks is not None:
     from databricks.sdk.service.workspace import ImportFormat, ExportFormat
@@ -42,6 +41,66 @@ __all__ = [
 ]
 
 
+LOGGER = logging.getLogger(__name__)
+_SPOOL_MAX = 64 * 1024 * 1024  # 64MB in RAM then spill to disk
+_COPY_CHUNK = 8 * 1024 * 1024  # 8MB chunks
+
+def _prepare_binaryio_and_size(
+    data: Union[bytes, bytearray, memoryview, BinaryIO]
+) -> tuple[int, BinaryIO, bool]:
+    """
+    Returns (size, bio, should_close).
+
+    - bytes-like -> wrap in BytesIO (closeable by us).
+    - seekable file -> compute size via fstat or seek/tell.
+    - non-seekable stream -> spool into SpooledTemporaryFile, count bytes.
+    """
+    # bytes-like
+    if isinstance(data, (bytes, bytearray, memoryview)):
+        b = bytes(data)
+        return len(b), io.BytesIO(b), True
+
+    f: BinaryIO = data
+
+    # 1) try OS-level size for real files
+    try:
+        fileno = f.fileno()  # type: ignore[attr-defined]
+    except Exception:
+        fileno = None
+
+    if fileno is not None:
+        try:
+            st = os.fstat(fileno)
+            # rewind if possible
+            try:
+                f.seek(0)
+            except Exception:
+                pass
+            return int(st.st_size), f, False
+        except Exception:
+            pass
+
+    # 2) try seek/tell (seekable streams)
+    try:
+        f.seek(0, io.SEEK_END)
+        end = f.tell()
+        f.seek(0)
+        return int(end), f, False
+    except Exception:
+        pass
+
+    # 3) non-seekable stream: spool + count
+    spooled = SpooledTemporaryFile(max_size=_SPOOL_MAX, mode="w+b")
+    size = 0
+    while True:
+        chunk = f.read(_COPY_CHUNK)
+        if not chunk:
+            break
+        spooled.write(chunk)
+        size += len(chunk)
+    spooled.seek(0)
+    return size, spooled, True
+
 class DatabricksIO(ABC, IO):
     """File-like interface for Databricks workspace, volume, or DBFS paths."""
 
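The new `_prepare_binaryio_and_size` helper normalizes any bytes-like object or binary stream into a `(size, stream, should_close)` triple before upload. Below is a minimal sketch of how it behaves for the three input kinds, assuming the definition above is in scope; the `_NoSeek` class is a hypothetical stand-in for a non-seekable stream.

import io
import tempfile

# bytes-like: wrapped in a fresh BytesIO that the caller is expected to close
size, bio, should_close = _prepare_binaryio_and_size(b"hello")
assert (size, should_close) == (5, True)

# real file: size comes from os.fstat, the handle is rewound but not owned by us
with tempfile.TemporaryFile() as tmp:
    tmp.write(b"0123456789")
    tmp.flush()
    size, bio, should_close = _prepare_binaryio_and_size(tmp)
    assert (size, bio.tell(), should_close) == (10, 0, False)

# non-seekable stream: spooled into a SpooledTemporaryFile while counting bytes
class _NoSeek(io.RawIOBase):
    """Hypothetical non-seekable reader for demonstration."""
    def __init__(self, payload: bytes):
        self._inner = io.BytesIO(payload)
    def readable(self) -> bool:
        return True
    def seekable(self) -> bool:
        return False
    def read(self, n: int = -1) -> bytes:
        return self._inner.read(n)

size, bio, should_close = _prepare_binaryio_and_size(_NoSeek(b"abc" * 1000))
assert (size, should_close) == (3000, True)  # spooled copy, ours to close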
@@ -96,6 +155,18 @@ class DatabricksIO(ABC, IO):
     def __hash__(self):
         return self.path.__hash__()
 
+    def __str__(self):
+        return "%s(path=%s)" % (
+            self.__class__.__name__,
+            self.path.__repr__()
+        )
+
+    def __repr__(self):
+        return "%s(path=%s)" % (
+            self.__class__.__name__,
+            self.path.__repr__()
+        )
+
     @classmethod
     def create_instance(
         cls,
@@ -504,7 +575,7 @@ class DatabricksIO(ABC, IO):
         return True
 
     @abstractmethod
-    def write_all_bytes(self, data: bytes):
+    def write_all_bytes(self, data: Union[bytes, IO[bytes]]):
         """Write raw bytes to the remote path.
 
         Args:
@@ -921,10 +992,32 @@ class DatabricksIO(ABC, IO):
 
         self.write_all_bytes(data=buffer.getvalue())
 
+    def read_object(
+        self,
+    ):
+        content = self.read_all_bytes()
+        obj = dill.loads(content)
+
+        return obj
+
+    def write_object(
+        self,
+        obj: Any,
+        file_format: Optional[FileFormat] = None,
+    ):
+        buffer = io.BytesIO()
+
+        if isinstance(obj, PandasDataFrame):
+            obj.to_pickle(buffer)
+        else:
+            buffer.write(dill.dumps(obj))
+
+        self.write_all_bytes(data=buffer.getvalue())
 
 class DatabricksWorkspaceIO(DatabricksIO):
     """IO adapter for Workspace files."""
 
+    @retry(exceptions=(InternalError,))
     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
         """Read bytes from a Workspace file.
 
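The new `read_object`/`write_object` methods round-trip arbitrary Python objects through dill (pandas DataFrames take the `to_pickle` path instead). A standalone sketch of the core of what they do:

import dill  # pip install dill

# dill extends pickle to handle lambdas, closures, and other objects
# the stdlib pickler rejects
payload = {"transform": lambda x: x * 2, "threshold": 0.5}

raw = dill.dumps(payload)   # what write_object stores via write_all_bytes
restored = dill.loads(raw)  # what read_object does with read_all_bytes

assert restored["transform"](21) == 42
assert restored["threshold"] == 0.5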
@@ -956,11 +1049,12 @@ class DatabricksWorkspaceIO(DatabricksIO):
         end = start + length
         return data[start:end]
 
-    def write_all_bytes(self, data: bytes):
+    @retry(exceptions=(InternalError,))
+    def write_all_bytes(self, data: Union[bytes, IO[bytes]]):
         """Write bytes to a Workspace file.
 
         Args:
-            data:
+            data: Union[bytes, IO[bytes]] to write.
 
         Returns:
             The DatabricksWorkspaceIO instance.
@@ -969,6 +1063,20 @@ class DatabricksWorkspaceIO(DatabricksIO):
         workspace_client = sdk.workspace
         full_path = self.path.workspace_full_path()
 
+        if isinstance(data, bytes):
+            bsize = len(data)
+        elif isinstance(data, io.BytesIO):
+            bsize = len(data.getvalue())
+        else:
+            bsize = None
+
+        LOGGER.debug(
+            "Writing %s(size=%s) in %s",
+            type(data),
+            bsize,
+            self
+        )
+
         try:
             workspace_client.upload(
                 full_path,
@@ -989,16 +1097,23 @@ class DatabricksWorkspaceIO(DatabricksIO):
         self.path.reset_metadata(
             is_file=True,
             is_dir=False,
-            size=len(data),
+            size=bsize,
             mtime=time.time()
         )
 
+        LOGGER.info(
+            "Written %s bytes in %s",
+            bsize,
+            self
+        )
+
         return self
 
 
 class DatabricksVolumeIO(DatabricksIO):
     """IO adapter for Unity Catalog volume files."""
 
+    @retry(exceptions=(InternalError,))
     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
         """Read bytes from a volume file.
 
@@ -1023,9 +1138,9 @@ class DatabricksVolumeIO(DatabricksIO):
 
         try:
             resp = client.download(full_path)
-        except
+        except (NotFound, ResourceDoesNotExist, BadRequest, InternalError) as e:
             # Databricks SDK exceptions vary a bit by version; keep it pragmatic.
-            if allow_not_found
+            if allow_not_found:
                 return b""
             raise
 
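The read paths now catch a concrete tuple of SDK errors instead of a bare except clause. A small sketch of the `allow_not_found` contract, assuming the exception classes exported by databricks.sdk.errors (the same names the diff imports); the `download` callable is illustrative:

from databricks.sdk.errors import NotFound, ResourceDoesNotExist, BadRequest, InternalError

def read_or_empty(download, path: str, allow_not_found: bool = True) -> bytes:
    """download: any callable returning the file's bytes, e.g. a thin wrapper
    around WorkspaceClient().files.download(path).contents.read()."""
    try:
        return download(path)
    except (NotFound, ResourceDoesNotExist, BadRequest, InternalError):
        if allow_not_found:
            return b""  # caller opted in to treating missing files as empty
        raise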
@@ -1038,48 +1153,67 @@ class DatabricksVolumeIO(DatabricksIO):
         end = start + length
         return data[start:end]
 
-    def write_all_bytes(self, data: bytes):
-        """Write bytes to a volume file.
-
-        Args:
-            data:
-
-        Returns:
-            The DatabricksVolumeIO instance.
-
-        """
+    def write_all_bytes(
+        self,
+        data: Union[bytes, bytearray, memoryview, BinaryIO],
+        *,
+        overwrite: bool = True,
+        part_size: Optional[int] = None,
+        use_parallel: bool = True,
+        parallelism: Optional[int] = None,
+    ):
+        """Write bytes/stream to a volume file safely (BinaryIO upload)."""
         sdk = self.workspace.sdk()
         client = sdk.files
         full_path = self.path.files_full_path()
 
-        try:
-            client.upload(
-                full_path,
-                io.BytesIO(data),
-                overwrite=True
-            )
-        except (NotFound, ResourceDoesNotExist, BadRequest):
-            self.path.parent.mkdir(parents=True, exist_ok=True)
+        LOGGER.debug("Writing all bytes in %s", self)
 
-            client.upload(
+        size, bio, should_close = _prepare_binaryio_and_size(data)
+
+        def _upload():
+            return client.upload(
                 full_path,
-                io.BytesIO(data),
-                overwrite=True
+                bio,
+                overwrite=overwrite,
+                part_size=part_size,
+                use_parallel=use_parallel,
+                parallelism=parallelism,
             )
 
+        try:
+            _ = _upload()
+        except (NotFound, ResourceDoesNotExist, BadRequest, InternalError):
+            self.path.parent.mkdir(parents=True, exist_ok=True)
+            # Important: rewind if possible before retry
+            try:
+                bio.seek(0)
+            except Exception:
+                pass
+            _ = _upload()
+        finally:
+            if should_close:
+                try:
+                    bio.close()
+                except Exception:
+                    pass
+
         self.path.reset_metadata(
             is_file=True,
             is_dir=False,
-            size=len(data),
-            mtime=time.time()
+            size=size,
+            mtime=time.time(),
         )
 
-        return self
+        LOGGER.info("Written %s bytes in %s", size or "all", self.path)
+
+        return self  # or return result if your API prefers that
 
 
 class DatabricksDBFSIO(DatabricksIO):
     """IO adapter for DBFS files."""
 
+    @retry(exceptions=(InternalError,))
     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
         """Read bytes from a DBFS file.
 
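The volume writer's fallback path rewinds the stream before re-uploading: the failed first attempt may already have consumed the stream, and retrying without a rewind would silently upload zero bytes or only the tail. A generic sketch of that pattern, with `upload` and `prepare_destination` as hypothetical stand-ins for `client.upload(...)` and `self.path.parent.mkdir(...)` above:

from typing import BinaryIO, Callable

def upload_with_fallback(
    bio: BinaryIO,
    upload: Callable[[BinaryIO], None],
    prepare_destination: Callable[[], None],
) -> None:
    try:
        upload(bio)
    except OSError:
        # e.g. parent directory missing: create it, then retry once
        prepare_destination()
        try:
            bio.seek(0)  # rewind, or the retry re-sends only leftover bytes
        except (OSError, ValueError):
            pass  # non-seekable: nothing to do beyond retrying as-is
        upload(bio)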
@@ -1124,11 +1258,12 @@ class DatabricksDBFSIO(DatabricksIO):
 
         return bytes(read_bytes)
 
-    def write_all_bytes(self, data: bytes):
+    @retry(exceptions=(InternalError,))
+    def write_all_bytes(self, data: Union[bytes, IO[bytes]]):
         """Write bytes to a DBFS file.
 
         Args:
-            data:
+            data: Union[bytes, IO[bytes]] to write.
 
         Returns:
             The DatabricksDBFSIO instance.
@@ -1137,6 +1272,11 @@ class DatabricksDBFSIO(DatabricksIO):
         client = sdk.dbfs
         full_path = self.path.dbfs_full_path()
 
+        LOGGER.debug(
+            "Writing all bytes in %s",
+            self
+        )
+
         try:
             with client.open(
                 path=full_path,
@@ -1156,6 +1296,11 @@ class DatabricksDBFSIO(DatabricksIO):
             ) as f:
                 f.write(data)
 
+        LOGGER.info(
+            "Written all bytes in %s",
+            self
+        )
+
         self.path.reset_metadata(
             is_file=True,
             is_dir=False,
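Every `read_byte_range` and `write_all_bytes` override is now wrapped in `@retry(exceptions=(InternalError,))`, imported from `...pyutils.retry`. That module isn't shown in this diff (see `yggdrasil/pyutils/waiting_config.py` in the file list for related changes), so here is only a minimal sketch of what such a decorator typically looks like, not the package's actual implementation:

import functools
import time

def retry(exceptions, attempts: int = 3, base_delay: float = 0.5):
    """Retry a callable on the given exception types with exponential backoff."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return fn(*args, **kwargs)
                except exceptions:
                    if attempt == attempts - 1:
                        raise  # out of attempts: propagate the last error
                    time.sleep(base_delay * (2 ** attempt))
        return wrapper
    return decorator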
|