ygg 0.1.57__py3-none-any.whl → 0.1.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/METADATA +2 -2
- ygg-0.1.64.dist-info/RECORD +74 -0
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +87 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +19 -0
- yggdrasil/databricks/compute/execution_context.py +491 -282
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +178 -178
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +339 -92
- yggdrasil/databricks/workspaces/io.py +185 -40
- yggdrasil/databricks/workspaces/path.py +114 -100
- yggdrasil/databricks/workspaces/workspace.py +210 -61
- yggdrasil/exceptions.py +7 -0
- yggdrasil/libs/databrickslib.py +22 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -2
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +6 -7
- yggdrasil/pyutils/python_env.py +16 -21
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/requests/msal.py +9 -96
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/types/file_format.py +6 -2
- yggdrasil/types/python_defaults.py +92 -76
- yggdrasil/version.py +1 -1
- ygg-0.1.57.dist-info/RECORD +0 -66
- yggdrasil/databricks/ai/loki.py +0 -53
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/WHEEL +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/top_level.txt +0 -0
- /yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0
The largest single rewrite in this release is `yggdrasil/databricks/workspaces/path.py` (+114 -100); its hunks follow.

```diff
@@ -1,20 +1,19 @@
 """Databricks path abstraction spanning DBFS, workspace, and volumes."""
 
 # src/yggdrasil/databricks/workspaces/databricks_path.py
-from __future__ import annotations
-
 import dataclasses
 import datetime as dt
+import io
 import random
 import string
 import time
 from pathlib import PurePosixPath
-from
+from threading import Thread
+from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Any, IO
 
+import dill
 import pyarrow as pa
-import pyarrow.dataset as ds
 from pyarrow import ArrowInvalid
-from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
 from pyarrow.fs import FileInfo, FileType, FileSystem
 
 from .io import DatabricksIO
@@ -22,15 +21,13 @@ from .path_kind import DatabricksPathKind
 from .volumes_path import get_volume_status, get_volume_metadata
 from ...libs.databrickslib import databricks
 from ...libs.pandaslib import PandasDataFrame
-from ...libs.polarslib import polars
-from ...types.cast.
-from ...types.
-from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
-from ...types.cast.registry import convert, register_converter
-from ...types.file_format import ExcelFileFormat
+from ...libs.polarslib import polars
+from ...types.cast.registry import convert
+from ...types.file_format import FileFormat, ExcelFileFormat, ParquetFileFormat, JsonFileFormat, CsvFileFormat
 
 if databricks is not None:
-    from databricks.sdk.
+    from databricks.sdk.errors import InternalError
+    from databricks.sdk.service.catalog import VolumeType, VolumeInfo, PathOperation
     from databricks.sdk.service.workspace import ObjectType
     from databricks.sdk.errors.platform import (
         NotFound,
```
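The consequential line for downstream code is the last added import: `FileFormat`, `ParquetFileFormat`, `JsonFileFormat`, and `CsvFileFormat` now resolve from yggdrasil's own `types.file_format` module rather than from `pyarrow.dataset`. A minimal caller-side sketch (no-arg construction is an assumption):

```python
from yggdrasil.types.file_format import FileFormat, ParquetFileFormat

fmt: FileFormat = ParquetFileFormat()  # assumed constructor; options vary by format
```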
```diff
@@ -176,6 +173,8 @@ class DatabricksPath:
         if not obj:
             return cls.empty_instance(workspace=workspace)
 
+        if isinstance(obj, str):
+            obj = [obj]
         if not isinstance(obj, (str, list)):
             if isinstance(obj, DatabricksPath):
                 if workspace is not None and obj._workspace is None:
@@ -191,6 +190,7 @@ class DatabricksPath:
             obj = str(obj)
 
 
+
         obj = _flatten_parts(obj)
 
         if obj and not obj[0]:
```
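The new guard wraps a bare string into a one-element parts list before `_flatten_parts` runs, so strings and part lists normalize identically. A sketch (the `parse` entry point is a hypothetical name; the hunk shows only the classmethod body):

```python
p1 = DatabricksPath.parse("/Volumes/main/default/vol/data.parquet")    # hypothetical entry point
p2 = DatabricksPath.parse(["/Volumes/main/default/vol/data.parquet"])
assert p1.full_path() == p2.full_path()
```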
```diff
@@ -246,16 +246,23 @@ class DatabricksPath:
         if self._workspace is not None:
             self._workspace.__exit__(exc_type, exc_val, exc_tb)
 
+        self.close(wait=False)
+
     def __str__(self):
         return self.full_path()
 
     def __repr__(self):
         return self.url()
 
+    def __del__(self):
+        self.close(wait=False)
+
     def __fspath__(self):
         return self.full_path()
 
     def url(self):
+        if self._workspace is not None:
+            return self._workspace.safe_host + self.full_path()
         return "dbfs://%s" % self.full_path()
 
     def full_path(self) -> str:
```
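Beyond the cleanup hooks (`__exit__` and the new `__del__` both fire a non-blocking `close(wait=False)`), `url()` now prefers a browser-style address built from the bound workspace's `safe_host`. A sketch with an assumed host value:

```python
# path is a DatabricksPath; the host below is illustrative only.
path.url()
# bound workspace:  "https://example.cloud.databricks.com/Volumes/main/default/vol/x"
# no workspace:     "dbfs:///Volumes/main/default/vol/x"
```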
```diff
@@ -282,7 +289,7 @@ class DatabricksPath:
         Returns:
             A PyArrow FileSystem instance.
         """
-        return self.workspace.
+        return self.workspace.filesystem(workspace=workspace)
 
     @property
     def parent(self):
@@ -496,9 +503,15 @@ class DatabricksPath:
 
         return self
 
-    def close(self):
+    def close(self, wait: bool = True):
         if self.temporary:
-            self.remove(recursive=True)
+            if wait:
+                self.remove(recursive=True)
+            else:
+                Thread(
+                    target=self.remove,
+                    kwargs={"recursive": True}
+                ).start()
 
     def storage_location(self) -> str:
         info = self.volume_info()
```
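`close()` gains a `wait` flag. The default blocks until the temporary path is recursively removed; `wait=False`, which the dunder hooks use, hands the delete to a `threading.Thread` so context exit and interpreter teardown stay fast. Usage sketch (only paths whose `temporary` flag is set are removed):

```python
tmp = DatabricksPath.parse("/Volumes/main/default/vol/tmp-parts")  # assume tmp.temporary is True
tmp.close()            # inline remove(recursive=True)
tmp.close(wait=False)  # returns immediately; removal continues on a Thread
```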
```diff
@@ -586,7 +599,7 @@ class DatabricksPath:
             mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
 
             return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             pass
 
         found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
```
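This is the first of roughly a dozen identical hunks in this file: every tolerant `except` tuple gains `databricks.sdk.errors.InternalError`, so transient server-side errors degrade to not-found semantics instead of propagating. The shared pattern, isolated into a helper for clarity (the helper name is ours, not the package's):

```python
from databricks.sdk.errors import InternalError
from databricks.sdk.errors.platform import (
    BadRequest, NotFound, PermissionDenied, ResourceDoesNotExist,
)

def tolerant_delete(sdk, full_path: str, allow_not_found: bool = True):
    try:
        sdk.files.delete(full_path)  # any of the tolerant SDK calls in this file
    except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
        if not allow_not_found:
            raise
```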
```diff
@@ -730,7 +743,7 @@ class DatabricksPath:
                 properties=default_tags,
                 comment="Catalog auto generated by yggdrasil"
             )
-        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
+        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest, InternalError):
             if not exist_ok:
                 raise
 
@@ -742,7 +755,7 @@ class DatabricksPath:
                 properties=default_tags,
                 comment="Schema auto generated by yggdrasil"
             )
-        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
+        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest, InternalError):
             if not exist_ok:
                 raise
 
@@ -808,37 +821,54 @@ class DatabricksPath:
 
     def remove(
         self,
-        recursive: bool = True
+        recursive: bool = True,
+        allow_not_found: bool = True
     ):
         """Remove the path as a file or directory.
 
         Args:
             recursive: Whether to delete directories recursively.
+            allow_not_found: Allow not found path
 
         Returns:
             The DatabricksPath instance.
         """
         if self.kind == DatabricksPathKind.VOLUME:
-            return self._remove_volume_obj(recursive=recursive)
+            return self._remove_volume_obj(recursive=recursive, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.WORKSPACE:
-            return self._remove_workspace_obj(recursive=recursive)
+            return self._remove_workspace_obj(recursive=recursive, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.DBFS:
-            return self._remove_dbfs_obj(recursive=recursive)
+            return self._remove_dbfs_obj(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_volume_obj(
+    def _remove_volume_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_volume_file()
-
+            return self._remove_volume_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_volume_dir(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_workspace_obj(
+    def _remove_workspace_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_workspace_file()
-
+            return self._remove_workspace_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_workspace_dir(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_dbfs_obj(
+    def _remove_dbfs_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_dbfs_file()
-
+            return self._remove_dbfs_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_dbfs_dir(recursive=recursive, allow_not_found=allow_not_found)
 
     def rmfile(self, allow_not_found: bool = True):
         """Remove the path as a file.
```
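`remove()` and the three kind-specific helpers now thread `allow_not_found` all the way down, and each helper gains an explicit `elif self.is_dir():` branch for directory deletion. Callers that want hard failures must opt out of the default:

```python
path.remove(recursive=True)                         # missing paths are tolerated
path.remove(recursive=True, allow_not_found=False)  # re-raises NotFound and friends
```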
```diff
@@ -859,7 +889,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.files.delete(self.files_full_path())
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -871,7 +901,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.workspace.delete(self.workspace_full_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -883,7 +913,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -940,7 +970,7 @@ class DatabricksPath:
 
             if not with_root:
                 sdk.workspace.mkdirs(full_path)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -962,7 +992,7 @@ class DatabricksPath:
 
             if not with_root:
                 sdk.dbfs.mkdirs(full_path)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -983,7 +1013,7 @@ class DatabricksPath:
         if rel:
             try:
                 sdk.files.delete_directory(full_path)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError) as e:
                 message = str(e)
 
                 if recursive and "directory is not empty" in message:
@@ -998,13 +1028,13 @@ class DatabricksPath:
         elif volume_name:
             try:
                 sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         elif schema_name:
             try:
                 sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
 
@@ -1064,7 +1094,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         elif schema_name is None:
@@ -1082,7 +1112,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         else:
@@ -1100,7 +1130,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         else:
@@ -1121,7 +1151,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
 
@@ -1140,7 +1170,7 @@ class DatabricksPath:
                 _is_dir=is_dir,
                 _size=info.size,
             )
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
 
@@ -1158,7 +1188,7 @@ class DatabricksPath:
                 _is_dir=info.is_dir,
                 _size=info.file_size,
             )
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
 
@@ -1225,13 +1255,6 @@ class DatabricksPath:
         else:
             raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
 
-    def write_bytes(self, data: bytes):
-        if hasattr(data, "read"):
-            data = data.read()
-
-        with self.open("wb") as f:
-            f.write_all_bytes(data=data)
-
     def temporary_credentials(
         self,
         operation: Optional["PathOperation"] = None
```
```diff
@@ -1248,6 +1271,14 @@ class DatabricksPath:
             operation=operation or PathOperation.PATH_READ,
         )
 
+    def read_bytes(self, use_cache: bool = False):
+        with self.open("rb") as f:
+            return f.read_all_bytes(use_cache=use_cache)
+
+    def write_bytes(self, data: Union[bytes, IO[bytes]]):
+        with self.open("wb") as f:
+            f.write_all_bytes(data=data)
+
     # -------------------------
     # Data ops (Arrow / Pandas / Polars)
     # -------------------------
```
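`write_bytes` reappears after `temporary_credentials` with a `Union[bytes, IO[bytes]]` signature and without its local `hasattr(data, "read")` shim (file-like handling presumably moved into `DatabricksIO.write_all_bytes`), and a `read_bytes` counterpart with an opt-in cache lands next to it. Round trip:

```python
path.write_bytes(b"payload")             # raw bytes
with open("local.bin", "rb") as src:
    path.write_bytes(src)                # binary file-like, no manual .read()
data = path.read_bytes(use_cache=False)  # use_cache=True may reuse a prior fetch
```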
```diff
@@ -1267,6 +1298,8 @@ class DatabricksPath:
         Returns:
             A PyArrow Dataset instance.
         """
+        import pyarrow.dataset as ds
+
         filesystem = self.filesystem(workspace=workspace) if filesystem is None else filesystem
 
         return ds.dataset(
```
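With `pyarrow.dataset` dropped from the module header, `arrow_dataset()` imports it at call time, so importing the path module no longer pays for pyarrow's dataset machinery. The pattern in isolation:

```python
def make_dataset(source, filesystem=None):
    # Deferred import: only dataset users pay the cost, matching the hunk above.
    import pyarrow.dataset as ds
    return ds.dataset(source, filesystem=filesystem)
```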
```diff
@@ -1356,7 +1389,6 @@ class DatabricksPath:
         table: pa.Table,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table to the path, sharding if needed.
 
@@ -1364,7 +1396,6 @@ class DatabricksPath:
             table: Arrow table to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1377,7 +1408,11 @@ class DatabricksPath:
             part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
 
             with part_path.open(mode="wb") as f:
-                f.write_arrow_batch(
+                f.write_arrow_batch(
+                    batch,
+                    file_format=file_format,
+                    batch_size=batch_size,
+                )
 
         return connected
 
```
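Here and in the pandas and polars methods below, the `**kwargs` escape hatch is removed from the data read/write API; format-specific options presumably travel on the `FileFormat` object instead. A hedged migration sketch (`write_arrow_table` is an assumed method name; the hunks show only signature bodies):

```python
from yggdrasil.types.file_format import CsvFileFormat

# Before 0.1.64 a caller could smuggle format options through **kwargs;
# now the format object itself would carry them (constructor options assumed).
path.write_arrow_table(table, file_format=CsvFileFormat())
```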
```diff
@@ -1387,7 +1422,6 @@ class DatabricksPath:
             table,
             file_format=file_format,
             batch_size=batch_size,
-            **kwargs
         )
 
         return self
@@ -1397,7 +1431,6 @@ class DatabricksPath:
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         concat: bool = True,
-        **kwargs
     ):
         """Read the path into a pandas DataFrame.
 
@@ -1405,7 +1438,6 @@ class DatabricksPath:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             concat: Whether to concatenate results for directories.
-            **kwargs: Format-specific options.
 
         Returns:
             A pandas DataFrame or list of DataFrames if concat=False.
@@ -1415,14 +1447,12 @@ class DatabricksPath:
                 file_format=file_format,
                 batch_size=batch_size,
                 concat=True,
-                **kwargs
             ).to_pandas()
 
         tables = self.read_arrow_table(
             batch_size=batch_size,
             file_format=file_format,
             concat=False,
-            **kwargs
         )
 
         return [t.to_pandas() for t in tables] # type: ignore[arg-type]
@@ -1432,7 +1462,6 @@ class DatabricksPath:
         df: PandasDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a pandas DataFrame to the path.
 
@@ -1440,7 +1469,6 @@ class DatabricksPath:
             df: pandas DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1461,7 +1489,6 @@ class DatabricksPath:
                     batch,
                     file_format=file_format,
                     batch_size=batch_size,
-                    **kwargs
                 )
         else:
             with connected.open(mode="wb", clone=False) as f:
@@ -1469,7 +1496,6 @@ class DatabricksPath:
                     df,
                     file_format=file_format,
                     batch_size=batch_size,
-                    **kwargs
                 )
 
         return self
@@ -1521,7 +1547,6 @@ class DatabricksPath:
         df,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """
         Write Polars to a DatabricksPath.
@@ -1536,7 +1561,6 @@ class DatabricksPath:
             df: polars DataFrame or LazyFrame to write.
             file_format: Optional file format override.
             batch_size: Optional rows per part for directory sinks.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1550,7 +1574,7 @@ class DatabricksPath:
         with self.connect() as connected:
             if connected.is_dir_sink():
                 seed = int(time.time() * 1000)
-                rows_per_part = batch_size or
+                rows_per_part = batch_size or 1024 * 1024
 
                 # Always parquet for directory sinks (lake layout standard)
                 for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
@@ -1561,7 +1585,6 @@ class DatabricksPath:
                     df,
                     file_format=file_format,
                     batch_size=batch_size,
-                    **kwargs
                 )
             else:
                 with connected.open(mode="wb", clone=False) as f:
```
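The previously truncated default is now pinned down: directory sinks slice a polars frame into parquet parts of `1024 * 1024` rows unless `batch_size` says otherwise. A sketch, assuming the method is exposed as `write_polars`:

```python
path.write_polars(df)                      # ~1,048,576 rows per parquet part
path.write_polars(df, batch_size=250_000)  # explicit rows per part
```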
```diff
@@ -1569,11 +1592,33 @@ class DatabricksPath:
                     df,
                     file_format=file_format,
                     batch_size=batch_size,
-                    **kwargs
                 )
 
         return self
 
+    def read_pickle(
+        self,
+    ) -> Any:
+        content = self.read_bytes()
+        obj = dill.loads(content)
+
+        return obj
+
+    def write_pickle(
+        self,
+        obj: Any,
+        file_format: Optional[FileFormat] = None,
+    ):
+        buffer = io.BytesIO()
+
+        if isinstance(obj, PandasDataFrame):
+            obj.to_pickle(buffer)
+        else:
+            buffer.write(dill.dumps(obj))
+
+        self.write_bytes(data=buffer.getvalue())
+
+
     def sql(
         self,
         query: str,
```
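The new dill-backed helpers round-trip arbitrary Python objects through `write_bytes`/`read_bytes`; pandas DataFrames are special-cased through `DataFrame.to_pickle`, while everything else goes through `dill.dumps`, and `read_pickle` always decodes with `dill.loads` (which also reads standard pickle streams). Usage:

```python
path.write_pickle({"model": "v3", "threshold": 0.5})
restored = path.read_pickle()
assert restored["threshold"] == 0.5
```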
```diff
@@ -1634,34 +1679,3 @@ class DatabricksPath:
         raise ValueError(
             "Invalid engine %s, must be in duckdb, polars" % engine
         )
-
-
-if databricks is not None:
-    @register_converter(DatabricksPath, pa.Table)
-    def databricks_path_to_arrow_table(
-        data: DatabricksPath,
-        options: Optional[CastOptions] = None,
-    ) -> pa.Table:
-        return cast_arrow_tabular(
-            data.read_arrow_table(),
-            options
-        )
-
-
-    @register_converter(DatabricksPath, ds.Dataset)
-    def databricks_path_to_arrow_table(
-        data: DatabricksPath,
-        options: Optional[CastOptions] = None,
-    ) -> ds.Dataset:
-        return data.arrow_dataset()
-
-
-    @polars_converter(DatabricksPath, PolarsDataFrame)
-    def databricks_path_to_polars(
-        data: DatabricksPath,
-        options: Optional[CastOptions] = None,
-    ) -> PolarsDataFrame:
-        return cast_polars_dataframe(
-            data.read_polars(),
-            options
-        )
```