ygg 0.1.57__py3-none-any.whl → 0.1.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/METADATA +2 -2
  2. ygg-0.1.64.dist-info/RECORD +74 -0
  3. yggdrasil/ai/__init__.py +2 -0
  4. yggdrasil/ai/session.py +87 -0
  5. yggdrasil/ai/sql_session.py +310 -0
  6. yggdrasil/databricks/__init__.py +0 -3
  7. yggdrasil/databricks/compute/cluster.py +68 -113
  8. yggdrasil/databricks/compute/command_execution.py +674 -0
  9. yggdrasil/databricks/compute/exceptions.py +19 -0
  10. yggdrasil/databricks/compute/execution_context.py +491 -282
  11. yggdrasil/databricks/compute/remote.py +4 -14
  12. yggdrasil/databricks/exceptions.py +10 -0
  13. yggdrasil/databricks/sql/__init__.py +0 -4
  14. yggdrasil/databricks/sql/engine.py +178 -178
  15. yggdrasil/databricks/sql/exceptions.py +9 -1
  16. yggdrasil/databricks/sql/statement_result.py +108 -120
  17. yggdrasil/databricks/sql/warehouse.py +339 -92
  18. yggdrasil/databricks/workspaces/io.py +185 -40
  19. yggdrasil/databricks/workspaces/path.py +114 -100
  20. yggdrasil/databricks/workspaces/workspace.py +210 -61
  21. yggdrasil/exceptions.py +7 -0
  22. yggdrasil/libs/databrickslib.py +22 -18
  23. yggdrasil/libs/extensions/spark_extensions.py +1 -1
  24. yggdrasil/libs/pandaslib.py +15 -6
  25. yggdrasil/libs/polarslib.py +49 -13
  26. yggdrasil/pyutils/__init__.py +1 -2
  27. yggdrasil/pyutils/callable_serde.py +12 -19
  28. yggdrasil/pyutils/exceptions.py +16 -0
  29. yggdrasil/pyutils/modules.py +6 -7
  30. yggdrasil/pyutils/python_env.py +16 -21
  31. yggdrasil/pyutils/waiting_config.py +171 -0
  32. yggdrasil/requests/msal.py +9 -96
  33. yggdrasil/types/cast/arrow_cast.py +3 -0
  34. yggdrasil/types/cast/pandas_cast.py +157 -169
  35. yggdrasil/types/cast/polars_cast.py +11 -43
  36. yggdrasil/types/dummy_class.py +81 -0
  37. yggdrasil/types/file_format.py +6 -2
  38. yggdrasil/types/python_defaults.py +92 -76
  39. yggdrasil/version.py +1 -1
  40. ygg-0.1.57.dist-info/RECORD +0 -66
  41. yggdrasil/databricks/ai/loki.py +0 -53
  42. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/WHEEL +0 -0
  43. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/entry_points.txt +0 -0
  44. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/licenses/LICENSE +0 -0
  45. {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/top_level.txt +0 -0
  46. /yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0
yggdrasil/databricks/workspaces/io.py
@@ -2,27 +2,26 @@

  import base64
  import io
+ import logging
+ import os
  import time
  from abc import ABC, abstractmethod
+ from tempfile import SpooledTemporaryFile
  from threading import Thread
- from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
+ from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union, Any, BinaryIO

+ import dill
  import pyarrow as pa
  import pyarrow.csv as pcsv
  import pyarrow.parquet as pq
- from pyarrow.dataset import (
-     FileFormat,
-     ParquetFileFormat,
-     CsvFileFormat,
- )

  from .path_kind import DatabricksPathKind
  from ...libs.databrickslib import databricks
  from ...libs.pandaslib import PandasDataFrame
  from ...libs.polarslib import polars, PolarsDataFrame
- from ...pyutils import retry
+ from ...pyutils.retry import retry
  from ...types.cast.registry import convert
- from ...types.file_format import ExcelFileFormat
+ from ...types.file_format import FileFormat, ParquetFileFormat, CsvFileFormat, ExcelFileFormat

  if databricks is not None:
      from databricks.sdk.service.workspace import ImportFormat, ExportFormat
@@ -42,6 +41,66 @@ __all__ = [
  ]


+ LOGGER = logging.getLogger(__name__)
+ _SPOOL_MAX = 64 * 1024 * 1024  # 64MB in RAM then spill to disk
+ _COPY_CHUNK = 8 * 1024 * 1024  # 8MB chunks
+
+ def _prepare_binaryio_and_size(
+     data: Union[bytes, bytearray, memoryview, BinaryIO]
+ ) -> tuple[int, BinaryIO, bool]:
+     """
+     Returns (size, bio, should_close).
+
+     - bytes-like -> wrap in BytesIO (closeable by us).
+     - seekable file -> compute size via fstat or seek/tell.
+     - non-seekable stream -> spool into SpooledTemporaryFile, count bytes.
+     """
+     # bytes-like
+     if isinstance(data, (bytes, bytearray, memoryview)):
+         b = bytes(data)
+         return len(b), io.BytesIO(b), True
+
+     f: BinaryIO = data
+
+     # 1) try OS-level size for real files
+     try:
+         fileno = f.fileno()  # type: ignore[attr-defined]
+     except Exception:
+         fileno = None
+
+     if fileno is not None:
+         try:
+             st = os.fstat(fileno)
+             # rewind if possible
+             try:
+                 f.seek(0)
+             except Exception:
+                 pass
+             return int(st.st_size), f, False
+         except Exception:
+             pass
+
+     # 2) try seek/tell (seekable streams)
+     try:
+         f.seek(0, io.SEEK_END)
+         end = f.tell()
+         f.seek(0)
+         return int(end), f, False
+     except Exception:
+         pass
+
+     # 3) non-seekable stream: spool + count
+     spooled = SpooledTemporaryFile(max_size=_SPOOL_MAX, mode="w+b")
+     size = 0
+     while True:
+         chunk = f.read(_COPY_CHUNK)
+         if not chunk:
+             break
+         spooled.write(chunk)
+         size += len(chunk)
+     spooled.seek(0)
+     return size, spooled, True
+
  class DatabricksIO(ABC, IO):
      """File-like interface for Databricks workspace, volume, or DBFS paths."""

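A minimal usage sketch of the new _prepare_binaryio_and_size helper added above, covering the three input kinds it distinguishes; the import path is inferred from this file's location and the local file name is purely illustrative:

    # Illustrative only; the module path mirrors the changed file above.
    from yggdrasil.databricks.workspaces.io import _prepare_binaryio_and_size

    # 1) bytes-like input: wrapped in BytesIO, size known up front, caller closes it.
    size, bio, should_close = _prepare_binaryio_and_size(b"hello")
    assert (size, should_close) == (5, True)
    bio.close()

    # 2) seekable file: size comes from os.fstat (or seek/tell), the stream is reused as-is.
    with open("artifact.bin", "rb") as fh:      # hypothetical local file
        size, bio, should_close = _prepare_binaryio_and_size(fh)
        assert bio is fh and should_close is False

    # 3) non-seekable stream (pipe/socket): bytes are copied into a
    #    SpooledTemporaryFile in 8 MB chunks so the caller gets a rewindable
    #    stream plus an exact byte count; should_close is True in that case.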
@@ -96,6 +155,18 @@ class DatabricksIO(ABC, IO):
      def __hash__(self):
          return self.path.__hash__()

+     def __str__(self):
+         return "%s(path=%s)" % (
+             self.__class__.__name__,
+             self.path.__repr__()
+         )
+
+     def __repr__(self):
+         return "%s(path=%s)" % (
+             self.__class__.__name__,
+             self.path.__repr__()
+         )
+
      @classmethod
      def create_instance(
          cls,
@@ -504,7 +575,7 @@ class DatabricksIO(ABC, IO):
          return True

      @abstractmethod
-     def write_all_bytes(self, data: bytes):
+     def write_all_bytes(self, data: Union[bytes, IO[bytes]]):
          """Write raw bytes to the remote path.

          Args:
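Since the abstract signature now accepts either raw bytes or an open binary stream, a call-site sketch (dbx_io stands for any concrete DatabricksIO adapter; its construction is not shown in this diff, and the local file name is illustrative):

    dbx_io.write_all_bytes(b"\x00\x01\x02")        # bytes, as before

    with open("report.xlsx", "rb") as fh:          # hypothetical local file
        dbx_io.write_all_bytes(fh)                 # new: a binary file object works too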
@@ -921,10 +992,32 @@

          self.write_all_bytes(data=buffer.getvalue())

+     def read_object(
+         self,
+     ):
+         content = self.read_all_bytes()
+         obj = dill.loads(content)
+
+         return obj
+
+     def write_object(
+         self,
+         obj: Any,
+         file_format: Optional[FileFormat] = None,
+     ):
+         buffer = io.BytesIO()
+
+         if isinstance(obj, PandasDataFrame):
+             obj.to_pickle(buffer)
+         else:
+             buffer.write(dill.dumps(obj))
+
+         self.write_all_bytes(data=buffer.getvalue())

  class DatabricksWorkspaceIO(DatabricksIO):
      """IO adapter for Workspace files."""

+     @retry(exceptions=(InternalError,))
      def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
          """Read bytes from a Workspace file.

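A short sketch of the new object round-trip helpers (dbx_io again stands for any concrete DatabricksIO instance, and the payload is illustrative):

    payload = {"run_id": 42, "status": "ok"}

    dbx_io.write_object(payload)        # dill.dumps(...) then write_all_bytes(...)
    restored = dbx_io.read_object()     # read_all_bytes() then dill.loads(...)
    assert restored == payload

pandas DataFrames take the DataFrame.to_pickle branch on write; read_object still goes through dill.loads, which also understands standard pickle payloads.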
@@ -956,11 +1049,12 @@ class DatabricksWorkspaceIO(DatabricksIO):
          end = start + length
          return data[start:end]

-     def write_all_bytes(self, data: bytes):
+     @retry(exceptions=(InternalError,))
+     def write_all_bytes(self, data: Union[bytes, IO[bytes]]):
          """Write bytes to a Workspace file.

          Args:
-             data: Bytes to write.
+             data: Union[bytes, IO[bytes]] to write.

          Returns:
              The DatabricksWorkspaceIO instance.
@@ -969,6 +1063,20 @@ class DatabricksWorkspaceIO(DatabricksIO):
          workspace_client = sdk.workspace
          full_path = self.path.workspace_full_path()

+         if isinstance(data, bytes):
+             bsize = len(data)
+         elif isinstance(data, io.BytesIO):
+             bsize = len(data.getvalue())
+         else:
+             bsize = None
+
+         LOGGER.debug(
+             "Writing %s(size=%s) in %s",
+             type(data),
+             bsize,
+             self
+         )
+
          try:
              workspace_client.upload(
                  full_path,
@@ -989,16 +1097,23 @@ class DatabricksWorkspaceIO(DatabricksIO):
          self.path.reset_metadata(
              is_file=True,
              is_dir=False,
-             size=len(data),
+             size=bsize,
              mtime=time.time()
          )

+         LOGGER.info(
+             "Written %s bytes in %s",
+             bsize,
+             self
+         )
+
          return self


  class DatabricksVolumeIO(DatabricksIO):
      """IO adapter for Unity Catalog volume files."""

+     @retry(exceptions=(InternalError,))
      def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
          """Read bytes from a volume file.

@@ -1023,9 +1138,9 @@ class DatabricksVolumeIO(DatabricksIO):

          try:
              resp = client.download(full_path)
-         except Exception as e:
+         except (NotFound, ResourceDoesNotExist, BadRequest, InternalError) as e:
              # Databricks SDK exceptions vary a bit by version; keep it pragmatic.
-             if allow_not_found and any(s in str(e).lower() for s in ("not found", "not exist", "404")):
+             if allow_not_found:
                  return b""
              raise

@@ -1038,48 +1153,67 @@ class DatabricksVolumeIO(DatabricksIO):
          end = start + length
          return data[start:end]

-     @retry(exceptions=(InternalError,))
-     def write_all_bytes(self, data: bytes):
-         """Write bytes to a volume file.
-
-         Args:
-             data: Bytes to write.
-
-         Returns:
-             The DatabricksVolumeIO instance.
-         """
+     def write_all_bytes(
+         self,
+         data: Union[bytes, bytearray, memoryview, BinaryIO],
+         *,
+         overwrite: bool = True,
+         part_size: Optional[int] = None,
+         use_parallel: bool = True,
+         parallelism: Optional[int] = None,
+     ):
+         """Write bytes/stream to a volume file safely (BinaryIO upload)."""
          sdk = self.workspace.sdk()
          client = sdk.files
          full_path = self.path.files_full_path()

-         try:
-             client.upload(
-                 full_path,
-                 io.BytesIO(data),
-                 overwrite=True
-             )
-         except (NotFound, ResourceDoesNotExist, BadRequest):
-             self.path.parent.mkdir(parents=True, exist_ok=True)
+         LOGGER.debug("Writing all bytes in %s", self)

-             client.upload(
+         size, bio, should_close = _prepare_binaryio_and_size(data)
+
+         def _upload():
+             return client.upload(
                  full_path,
-                 io.BytesIO(data),
-                 overwrite=True
+                 bio,
+                 overwrite=overwrite,
+                 part_size=part_size,
+                 use_parallel=use_parallel,
+                 parallelism=parallelism,
              )

+         try:
+             _ = _upload()
+         except (NotFound, ResourceDoesNotExist, BadRequest, InternalError):
+             self.path.parent.mkdir(parents=True, exist_ok=True)
+             # Important: rewind if possible before retry
+             try:
+                 bio.seek(0)
+             except Exception:
+                 pass
+             _ = _upload()
+         finally:
+             if should_close:
+                 try:
+                     bio.close()
+                 except Exception:
+                     pass
+
          self.path.reset_metadata(
              is_file=True,
              is_dir=False,
-             size=len(data),
-             mtime=time.time()
+             size=size,
+             mtime=time.time(),
          )

-         return self
+         LOGGER.info("Written %s bytes in %s", size or "all", self.path)
+
+         return self  # or return result if your API prefers that


  class DatabricksDBFSIO(DatabricksIO):
      """IO adapter for DBFS files."""

+     @retry(exceptions=(InternalError,))
      def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
          """Read bytes from a DBFS file.

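The rewritten DatabricksVolumeIO.write_all_bytes can now stream large uploads instead of buffering them in memory; a usage sketch (volume_io is a DatabricksVolumeIO instance, and the file name and tuning values are illustrative):

    with open("model.tar.gz", "rb") as fh:         # hypothetical multi-GB artifact
        volume_io.write_all_bytes(
            fh,
            part_size=32 * 1024 * 1024,    # forwarded to sdk.files.upload
            use_parallel=True,
            parallelism=4,
        )

If the first attempt fails (for example because the parent directory is missing), the method creates the parent, rewinds the stream when it is seekable, and retries once; any spooled temporary opened by _prepare_binaryio_and_size is closed in the finally block.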
@@ -1124,11 +1258,12 @@ class DatabricksDBFSIO(DatabricksIO):

          return bytes(read_bytes)

-     def write_all_bytes(self, data: bytes):
+     @retry(exceptions=(InternalError,))
+     def write_all_bytes(self, data: Union[bytes, IO[bytes]]):
          """Write bytes to a DBFS file.

          Args:
-             data: Bytes to write.
+             data: Union[bytes, IO[bytes]] to write.

          Returns:
              The DatabricksDBFSIO instance.
@@ -1137,6 +1272,11 @@ class DatabricksDBFSIO(DatabricksIO):
          client = sdk.dbfs
          full_path = self.path.dbfs_full_path()

+         LOGGER.debug(
+             "Writing all bytes in %s",
+             self
+         )
+
          try:
              with client.open(
                  path=full_path,
@@ -1156,6 +1296,11 @@ class DatabricksDBFSIO(DatabricksIO):
              ) as f:
                  f.write(data)

+         LOGGER.info(
+             "Written all bytes in %s",
+             self
+         )
+
          self.path.reset_metadata(
              is_file=True,
              is_dir=False,