ygg 0.1.57__py3-none-any.whl → 0.1.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/METADATA +1 -1
- ygg-0.1.60.dist-info/RECORD +74 -0
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +89 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +19 -0
- yggdrasil/databricks/compute/execution_context.py +491 -282
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +161 -173
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +331 -92
- yggdrasil/databricks/workspaces/io.py +89 -9
- yggdrasil/databricks/workspaces/path.py +120 -72
- yggdrasil/databricks/workspaces/workspace.py +214 -61
- yggdrasil/exceptions.py +7 -0
- yggdrasil/libs/databrickslib.py +23 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -2
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/python_env.py +14 -13
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/version.py +1 -1
- ygg-0.1.57.dist-info/RECORD +0 -66
- yggdrasil/databricks/ai/loki.py +0 -53
- {ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/WHEEL +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/top_level.txt +0 -0
- /yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0
yggdrasil/databricks/workspaces/path.py

@@ -5,12 +5,15 @@ from __future__ import annotations
 
 import dataclasses
 import datetime as dt
+import io
 import random
 import string
 import time
 from pathlib import PurePosixPath
-from
+from threading import Thread
+from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Any, IO
 
+import dill
 import pyarrow as pa
 import pyarrow.dataset as ds
 from pyarrow import ArrowInvalid
@@ -23,14 +26,15 @@ from .volumes_path import get_volume_status, get_volume_metadata
 from ...libs.databrickslib import databricks
 from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
-from ...types.cast.arrow_cast import cast_arrow_tabular
 from ...types.cast.cast_options import CastOptions
+from ...types.cast.pandas_cast import pandas_converter, cast_pandas_dataframe
 from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
 from ...types.cast.registry import convert, register_converter
 from ...types.file_format import ExcelFileFormat
 
 if databricks is not None:
-    from databricks.sdk.
+    from databricks.sdk.errors import InternalError
+    from databricks.sdk.service.catalog import VolumeType, VolumeInfo, PathOperation
     from databricks.sdk.service.workspace import ObjectType
     from databricks.sdk.errors.platform import (
         NotFound,
@@ -176,6 +180,8 @@ class DatabricksPath:
         if not obj:
             return cls.empty_instance(workspace=workspace)
 
+        if isinstance(obj, str):
+            obj = [obj]
         if not isinstance(obj, (str, list)):
             if isinstance(obj, DatabricksPath):
                 if workspace is not None and obj._workspace is None:
@@ -191,6 +197,7 @@ class DatabricksPath:
             obj = str(obj)
 
 
+
         obj = _flatten_parts(obj)
 
         if obj and not obj[0]:
@@ -246,16 +253,23 @@ class DatabricksPath:
         if self._workspace is not None:
             self._workspace.__exit__(exc_type, exc_val, exc_tb)
 
+        self.close(wait=False)
+
     def __str__(self):
         return self.full_path()
 
     def __repr__(self):
         return self.url()
 
+    def __del__(self):
+        self.close(wait=False)
+
     def __fspath__(self):
         return self.full_path()
 
     def url(self):
+        if self._workspace is not None:
+            return self._workspace.safe_host + self.full_path()
         return "dbfs://%s" % self.full_path()
 
     def full_path(self) -> str:
@@ -282,7 +296,7 @@ class DatabricksPath:
         Returns:
             A PyArrow FileSystem instance.
         """
-        return self.workspace.
+        return self.workspace.filesystem(workspace=workspace)
 
     @property
     def parent(self):
@@ -496,9 +510,15 @@ class DatabricksPath:
 
         return self
 
-    def close(self):
+    def close(self, wait: bool = True):
         if self.temporary:
-
+            if wait:
+                self.remove(recursive=True)
+            else:
+                Thread(
+                    target=self.remove,
+                    kwargs={"recursive": True}
+                ).start()
 
     def storage_location(self) -> str:
         info = self.volume_info()
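
`close()` now takes a `wait` flag, and both `__exit__` and the new `__del__` call `close(wait=False)`, so temporary paths are cleaned up without blocking the caller. A minimal standalone sketch of the same fire-and-forget pattern; the function bodies below are illustrative placeholders, not package code:

```python
from threading import Thread

def remove(recursive: bool = True) -> None:
    # stand-in for the remote delete that DatabricksPath.remove performs
    print(f"deleting remote object (recursive={recursive})")

def close(wait: bool = True) -> None:
    if wait:
        remove(recursive=True)  # block until the delete finishes
    else:
        # hand the delete to a worker thread and return immediately
        Thread(target=remove, kwargs={"recursive": True}).start()

close(wait=False)  # roughly what __del__ and __exit__ do for temporary paths
```
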
@@ -586,7 +606,7 @@ class DatabricksPath:
             mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
 
             return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             pass
 
         found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
@@ -730,7 +750,7 @@ class DatabricksPath:
                 properties=default_tags,
                 comment="Catalog auto generated by yggdrasil"
             )
-        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
+        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest, InternalError):
             if not exist_ok:
                 raise
 
@@ -742,7 +762,7 @@ class DatabricksPath:
                 properties=default_tags,
                 comment="Schema auto generated by yggdrasil"
             )
-        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
+        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest, InternalError):
             if not exist_ok:
                 raise
 
@@ -808,37 +828,54 @@ class DatabricksPath:
 
     def remove(
         self,
-        recursive: bool = True
+        recursive: bool = True,
+        allow_not_found: bool = True
    ):
         """Remove the path as a file or directory.
 
         Args:
             recursive: Whether to delete directories recursively.
+            allow_not_found: Allow not found path
 
         Returns:
             The DatabricksPath instance.
         """
         if self.kind == DatabricksPathKind.VOLUME:
-            return self._remove_volume_obj(recursive=recursive)
+            return self._remove_volume_obj(recursive=recursive, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.WORKSPACE:
-            return self._remove_workspace_obj(recursive=recursive)
+            return self._remove_workspace_obj(recursive=recursive, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.DBFS:
-            return self._remove_dbfs_obj(recursive=recursive)
+            return self._remove_dbfs_obj(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_volume_obj(
+    def _remove_volume_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_volume_file()
-
+            return self._remove_volume_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_volume_dir(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_workspace_obj(
+    def _remove_workspace_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_workspace_file()
-
+            return self._remove_workspace_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_workspace_dir(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_dbfs_obj(
+    def _remove_dbfs_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_dbfs_file()
-
+            return self._remove_dbfs_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_dbfs_dir(recursive=recursive, allow_not_found=allow_not_found)
 
     def rmfile(self, allow_not_found: bool = True):
         """Remove the path as a file.
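
`remove()` now accepts `allow_not_found` and forwards it to the kind-specific helpers (`_remove_volume_obj`, `_remove_workspace_obj`, `_remove_dbfs_obj`), which then pick the file or directory variant. A hedged usage sketch, assuming `path` is an existing `DatabricksPath`; the signatures match this hunk:

```python
# Delete whatever the path points at; missing objects are tolerated by default.
path.remove(recursive=True, allow_not_found=True)

# Strict variant: let NotFound / ResourceDoesNotExist propagate instead.
path.remove(recursive=True, allow_not_found=False)
```
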
@@ -859,7 +896,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.files.delete(self.files_full_path())
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -871,7 +908,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.workspace.delete(self.workspace_full_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -883,7 +920,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -940,7 +977,7 @@ class DatabricksPath:
 
             if not with_root:
                 sdk.workspace.mkdirs(full_path)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -962,7 +999,7 @@ class DatabricksPath:
 
             if not with_root:
                 sdk.dbfs.mkdirs(full_path)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:
@@ -983,7 +1020,7 @@ class DatabricksPath:
         if rel:
             try:
                 sdk.files.delete_directory(full_path)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError) as e:
                 message = str(e)
 
                 if recursive and "directory is not empty" in message:
@@ -998,13 +1035,13 @@ class DatabricksPath:
         elif volume_name:
             try:
                 sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         elif schema_name:
             try:
                 sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
 
@@ -1064,7 +1101,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         elif schema_name is None:
@@ -1082,7 +1119,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         else:
@@ -1100,7 +1137,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         else:
@@ -1121,7 +1158,7 @@ class DatabricksPath:
                         yield from base._ls_volume(recursive=recursive)
                     else:
                         yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
 
@@ -1140,7 +1177,7 @@ class DatabricksPath:
                 _is_dir=is_dir,
                 _size=info.size,
             )
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
 
@@ -1158,7 +1195,7 @@ class DatabricksPath:
                 _is_dir=info.is_dir,
                 _size=info.file_size,
            )
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
 
@@ -1225,13 +1262,6 @@ class DatabricksPath:
         else:
             raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
 
-    def write_bytes(self, data: bytes):
-        if hasattr(data, "read"):
-            data = data.read()
-
-        with self.open("wb") as f:
-            f.write_all_bytes(data=data)
-
     def temporary_credentials(
         self,
         operation: Optional["PathOperation"] = None
@@ -1248,6 +1278,14 @@ class DatabricksPath:
             operation=operation or PathOperation.PATH_READ,
         )
 
+    def read_bytes(self, use_cache: bool = False):
+        with self.open("rb") as f:
+            return f.read_all_bytes(use_cache=use_cache)
+
+    def write_bytes(self, data: Union[bytes, IO[bytes]]):
+        with self.open("wb") as f:
+            f.write_all_bytes(data=data)
+
     # -------------------------
     # Data ops (Arrow / Pandas / Polars)
     # -------------------------
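
The relocated `write_bytes` now accepts either raw `bytes` or a readable binary handle (`IO[bytes]`), and the new `read_bytes` exposes the underlying `read_all_bytes(use_cache=...)` call. A hedged usage sketch, assuming `path` is a `DatabricksPath` bound to a workspace:

```python
import io

path.write_bytes(b"hello")                  # plain bytes
path.write_bytes(io.BytesIO(b"streamed"))   # or any readable binary object

payload = path.read_bytes(use_cache=False)  # re-fetch rather than serve from a cache
assert isinstance(payload, bytes)
```
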
@@ -1356,7 +1394,6 @@ class DatabricksPath:
         table: pa.Table,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table to the path, sharding if needed.
 
@@ -1364,7 +1401,6 @@ class DatabricksPath:
             table: Arrow table to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1377,7 +1413,11 @@ class DatabricksPath:
                     part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
 
                     with part_path.open(mode="wb") as f:
-                        f.write_arrow_batch(
+                        f.write_arrow_batch(
+                            batch,
+                            file_format=file_format,
+                            batch_size=batch_size,
+                        )
 
             return connected
 
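
For directory sinks, each record batch lands in its own parquet part whose name combines a seed (defined as `int(time.time() * 1000)` in the Polars hunk further down), a zero-padded batch index, and a short random suffix, presumably so parts from separate writes do not collide. A standalone sketch of just the naming scheme; `_rand_str` below is an illustrative stand-in for the package helper of the same name:

```python
import random
import string
import time

def _rand_str(n: int) -> str:
    # illustrative stand-in: n random lowercase/digit characters
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=n))

seed = int(time.time() * 1000)
part_names = [f"{seed}-{i:05d}-{_rand_str(4)}.parquet" for i in range(3)]
print(part_names)  # e.g. ['1718000000000-00000-a1b2.parquet', ...]
```
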
@@ -1387,7 +1427,6 @@ class DatabricksPath:
                     table,
                     file_format=file_format,
                     batch_size=batch_size,
-                    **kwargs
                 )
 
         return self
@@ -1397,7 +1436,6 @@ class DatabricksPath:
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         concat: bool = True,
-        **kwargs
     ):
         """Read the path into a pandas DataFrame.
 
@@ -1405,7 +1443,6 @@ class DatabricksPath:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             concat: Whether to concatenate results for directories.
-            **kwargs: Format-specific options.
 
         Returns:
             A pandas DataFrame or list of DataFrames if concat=False.
@@ -1415,14 +1452,12 @@ class DatabricksPath:
                 file_format=file_format,
                 batch_size=batch_size,
                 concat=True,
-                **kwargs
             ).to_pandas()
 
         tables = self.read_arrow_table(
             batch_size=batch_size,
             file_format=file_format,
             concat=False,
-            **kwargs
         )
 
         return [t.to_pandas() for t in tables]  # type: ignore[arg-type]
@@ -1432,7 +1467,6 @@ class DatabricksPath:
         df: PandasDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a pandas DataFrame to the path.
 
@@ -1440,7 +1474,6 @@ class DatabricksPath:
             df: pandas DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1461,7 +1494,6 @@ class DatabricksPath:
                             batch,
                             file_format=file_format,
                             batch_size=batch_size,
-                            **kwargs
                         )
                 else:
                     with connected.open(mode="wb", clone=False) as f:
@@ -1469,7 +1501,6 @@ class DatabricksPath:
                             df,
                             file_format=file_format,
                             batch_size=batch_size,
-                            **kwargs
                         )
 
         return self
@@ -1521,7 +1552,6 @@ class DatabricksPath:
         df,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """
         Write Polars to a DatabricksPath.
@@ -1536,7 +1566,6 @@ class DatabricksPath:
             df: polars DataFrame or LazyFrame to write.
             file_format: Optional file format override.
             batch_size: Optional rows per part for directory sinks.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1550,7 +1579,7 @@ class DatabricksPath:
         with self.connect() as connected:
             if connected.is_dir_sink():
                 seed = int(time.time() * 1000)
-                rows_per_part = batch_size or
+                rows_per_part = batch_size or 1024 * 1024
 
                 # Always parquet for directory sinks (lake layout standard)
                 for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
@@ -1561,7 +1590,6 @@ class DatabricksPath:
                             df,
                             file_format=file_format,
                             batch_size=batch_size,
-                            **kwargs
                         )
                 else:
                     with connected.open(mode="wb", clone=False) as f:
@@ -1569,7 +1597,33 @@ class DatabricksPath:
                             df,
                             file_format=file_format,
                             batch_size=batch_size,
-                            **kwargs
                         )
 
         return self
 
+    def read_pickle(
+        self,
+    ) -> Any:
+        content = self.read_bytes()
+        obj = dill.loads(content)
+
+        return obj
+
+    def write_pickle(
+        self,
+        obj: Any,
+        file_format: Optional[FileFormat] = None,
+    ):
+        buffer = io.BytesIO()
+
+        if isinstance(obj, PandasDataFrame):
+            obj.to_pickle(buffer)
+        else:
+            buffer.write(dill.dumps(obj))
+
+        self.write_bytes(data=buffer.getvalue())
+
+
     def sql(
         self,
         query: str,
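
The new `write_pickle` serializes into an in-memory buffer, using `DataFrame.to_pickle` for pandas frames and `dill.dumps` for everything else, then hands the bytes to `write_bytes`; `read_pickle` reads them back through `dill.loads`. A hedged usage sketch, assuming `path` is a writable `DatabricksPath`:

```python
state = {"run_id": 42, "models": ["a", "b"]}

path.write_pickle(state)       # non-DataFrame objects go through dill.dumps
restored = path.read_pickle()  # read_bytes() -> dill.loads
assert restored == state
```
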
@@ -1635,19 +1685,7 @@ class DatabricksPath:
                 "Invalid engine %s, must be in duckdb, polars" % engine
             )
 
-
 if databricks is not None:
-    @register_converter(DatabricksPath, pa.Table)
-    def databricks_path_to_arrow_table(
-        data: DatabricksPath,
-        options: Optional[CastOptions] = None,
-    ) -> pa.Table:
-        return cast_arrow_tabular(
-            data.read_arrow_table(),
-            options
-        )
-
-
     @register_converter(DatabricksPath, ds.Dataset)
     def databricks_path_to_arrow_table(
         data: DatabricksPath,
@@ -1656,6 +1694,16 @@ if databricks is not None:
         return data.arrow_dataset()
 
 
+    @pandas_converter(DatabricksPath, PandasDataFrame)
+    def databricks_path_to_pandas(
+        data: DatabricksPath,
+        options: Optional[CastOptions] = None,
+    ) -> PolarsDataFrame:
+        return cast_pandas_dataframe(
+            data.read_pandas(),
+            options
+        )
+
     @polars_converter(DatabricksPath, PolarsDataFrame)
     def databricks_path_to_polars(
         data: DatabricksPath,