ygg-0.1.56-py3-none-any.whl → ygg-0.1.60-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/METADATA +1 -1
- ygg-0.1.60.dist-info/RECORD +74 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/WHEEL +1 -1
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +89 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +7 -2
- yggdrasil/databricks/compute/execution_context.py +465 -277
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +161 -173
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +331 -92
- yggdrasil/databricks/workspaces/io.py +92 -9
- yggdrasil/databricks/workspaces/path.py +120 -74
- yggdrasil/databricks/workspaces/workspace.py +212 -68
- yggdrasil/libs/databrickslib.py +23 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -0
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/mimetypes.py +0 -0
- yggdrasil/pyutils/python_env.py +13 -12
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/version.py +1 -1
- ygg-0.1.56.dist-info/RECORD +0 -68
- yggdrasil/databricks/ai/__init__.py +0 -1
- yggdrasil/databricks/ai/loki.py +0 -374
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/top_level.txt +0 -0

--- yggdrasil/databricks/workspaces/path.py (ygg 0.1.56)
+++ yggdrasil/databricks/workspaces/path.py (ygg 0.1.60)
@@ -5,12 +5,15 @@ from __future__ import annotations
 
 import dataclasses
 import datetime as dt
+import io
 import random
 import string
 import time
 from pathlib import PurePosixPath
-from
+from threading import Thread
+from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Any, IO
 
+import dill
 import pyarrow as pa
 import pyarrow.dataset as ds
 from pyarrow import ArrowInvalid

@@ -23,14 +26,15 @@ from .volumes_path import get_volume_status, get_volume_metadata
 from ...libs.databrickslib import databricks
 from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
-from ...types.cast.arrow_cast import cast_arrow_tabular
 from ...types.cast.cast_options import CastOptions
+from ...types.cast.pandas_cast import pandas_converter, cast_pandas_dataframe
 from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
 from ...types.cast.registry import convert, register_converter
 from ...types.file_format import ExcelFileFormat
 
 if databricks is not None:
-    from databricks.sdk.
+    from databricks.sdk.errors import InternalError
+    from databricks.sdk.service.catalog import VolumeType, VolumeInfo, PathOperation
     from databricks.sdk.service.workspace import ObjectType
     from databricks.sdk.errors.platform import (
         NotFound,
@@ -176,6 +180,8 @@ class DatabricksPath:
         if not obj:
             return cls.empty_instance(workspace=workspace)
 
+        if isinstance(obj, str):
+            obj = [obj]
         if not isinstance(obj, (str, list)):
             if isinstance(obj, DatabricksPath):
                 if workspace is not None and obj._workspace is None:

@@ -191,6 +197,7 @@ class DatabricksPath:
             obj = str(obj)
 
 
+
         obj = _flatten_parts(obj)
 
         if obj and not obj[0]:
@@ -246,16 +253,23 @@ class DatabricksPath:
         if self._workspace is not None:
             self._workspace.__exit__(exc_type, exc_val, exc_tb)
 
+        self.close(wait=False)
+
     def __str__(self):
         return self.full_path()
 
     def __repr__(self):
         return self.url()
 
+    def __del__(self):
+        self.close(wait=False)
+
     def __fspath__(self):
         return self.full_path()
 
     def url(self):
+        if self._workspace is not None:
+            return self._workspace.safe_host + self.full_path()
         return "dbfs://%s" % self.full_path()
 
     def full_path(self) -> str:

@@ -282,7 +296,7 @@ class DatabricksPath:
         Returns:
             A PyArrow FileSystem instance.
         """
-        return self.workspace.
+        return self.workspace.filesystem(workspace=workspace)
 
     @property
     def parent(self):
@@ -496,9 +510,15 @@ class DatabricksPath:
 
         return self
 
-    def close(self):
+    def close(self, wait: bool = True):
         if self.temporary:
-
+            if wait:
+                self.remove(recursive=True)
+            else:
+                Thread(
+                    target=self.remove,
+                    kwargs={"recursive": True}
+                ).start()
 
     def storage_location(self) -> str:
         info = self.volume_info()
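The reworked close() is what the __exit__ and __del__ hooks added above rely on: a temporary path is removed recursively, either inline or on a background Thread. A minimal usage sketch, assuming a DatabricksPath whose temporary flag is already set; the import path is taken from the file list, but the constructor call and path value are illustrative:

```python
# Hedged sketch: only the close()/remove() semantics come from the diff above;
# how the path is constructed and flagged temporary is an assumption.
from yggdrasil.databricks.workspaces.path import DatabricksPath

tmp = DatabricksPath("/Volumes/main/sandbox/tmp-scratch")  # hypothetical temporary path

tmp.close()            # wait=True (default): remove(recursive=True) runs before returning
tmp.close(wait=False)  # fire-and-forget: removal runs on a background Thread
# close() only removes anything when the path is flagged temporary.
# __exit__ and __del__ both call close(wait=False), so leaving a `with` block or
# dropping the last reference no longer blocks on the delete round-trip.
```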
@@ -586,7 +606,7 @@ class DatabricksPath:
             mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
 
             return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             pass
 
         found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)

@@ -730,7 +750,7 @@ class DatabricksPath:
                 properties=default_tags,
                 comment="Catalog auto generated by yggdrasil"
             )
-        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
+        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest, InternalError):
             if not exist_ok:
                 raise

@@ -742,7 +762,7 @@ class DatabricksPath:
                 properties=default_tags,
                 comment="Schema auto generated by yggdrasil"
            )
-        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
+        except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest, InternalError):
             if not exist_ok:
                 raise
 
@@ -808,37 +828,54 @@ class DatabricksPath:
 
     def remove(
         self,
-        recursive: bool = True
+        recursive: bool = True,
+        allow_not_found: bool = True
     ):
         """Remove the path as a file or directory.
 
         Args:
             recursive: Whether to delete directories recursively.
+            allow_not_found: Allow not found path
 
         Returns:
             The DatabricksPath instance.
         """
         if self.kind == DatabricksPathKind.VOLUME:
-            return self._remove_volume_obj(recursive=recursive)
+            return self._remove_volume_obj(recursive=recursive, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.WORKSPACE:
-            return self._remove_workspace_obj(recursive=recursive)
+            return self._remove_workspace_obj(recursive=recursive, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.DBFS:
-            return self._remove_dbfs_obj(recursive=recursive)
+            return self._remove_dbfs_obj(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_volume_obj(
+    def _remove_volume_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_volume_file()
-
+            return self._remove_volume_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_volume_dir(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_workspace_obj(
+    def _remove_workspace_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_workspace_file()
-
+            return self._remove_workspace_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_workspace_dir(recursive=recursive, allow_not_found=allow_not_found)
 
-    def _remove_dbfs_obj(
+    def _remove_dbfs_obj(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True
+    ):
         if self.is_file():
-            return self._remove_dbfs_file()
-
+            return self._remove_dbfs_file(allow_not_found=allow_not_found)
+        elif self.is_dir():
+            return self._remove_dbfs_dir(recursive=recursive, allow_not_found=allow_not_found)
 
     def rmfile(self, allow_not_found: bool = True):
         """Remove the path as a file.
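remove() and its per-kind helpers now thread allow_not_found through (defaulting to True), so deleting an already-missing path stays silent unless the caller opts into strict behaviour. A hedged sketch; the import path follows the file list and the path value is illustrative:

```python
# Hedged sketch of the widened remove() signature shown above.
from yggdrasil.databricks.workspaces.path import DatabricksPath

path = DatabricksPath("/Volumes/main/sandbox/old-export")  # hypothetical path

path.remove(recursive=True)                         # missing paths are tolerated
path.remove(recursive=True, allow_not_found=False)  # re-raises NotFound and friends
```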
@@ -859,7 +896,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.files.delete(self.files_full_path())
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:

@@ -871,7 +908,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.workspace.delete(self.workspace_full_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:

@@ -883,7 +920,7 @@ class DatabricksPath:
         sdk = self.workspace.sdk()
         try:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:

@@ -940,7 +977,7 @@ class DatabricksPath:
 
             if not with_root:
                 sdk.workspace.mkdirs(full_path)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:

@@ -962,7 +999,7 @@ class DatabricksPath:
 
             if not with_root:
                 sdk.dbfs.mkdirs(full_path)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
         finally:

@@ -983,7 +1020,7 @@ class DatabricksPath:
         if rel:
             try:
                 sdk.files.delete_directory(full_path)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError) as e:
                 message = str(e)
 
                 if recursive and "directory is not empty" in message:

@@ -998,13 +1035,13 @@ class DatabricksPath:
         elif volume_name:
             try:
                 sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         elif schema_name:
             try:
                 sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
 

@@ -1064,7 +1101,7 @@ class DatabricksPath:
                     yield from base._ls_volume(recursive=recursive)
                 else:
                     yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         elif schema_name is None:

@@ -1082,7 +1119,7 @@ class DatabricksPath:
                     yield from base._ls_volume(recursive=recursive)
                 else:
                     yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         else:

@@ -1100,7 +1137,7 @@ class DatabricksPath:
                    yield from base._ls_volume(recursive=recursive)
                 else:
                     yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
         else:

@@ -1121,7 +1158,7 @@ class DatabricksPath:
                     yield from base._ls_volume(recursive=recursive)
                 else:
                     yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
                 if not allow_not_found:
                     raise
 

@@ -1140,7 +1177,7 @@ class DatabricksPath:
                 _is_dir=is_dir,
                 _size=info.size,
             )
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
 

@@ -1158,7 +1195,7 @@ class DatabricksPath:
                 _is_dir=info.is_dir,
                 _size=info.file_size,
             )
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
+        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied, InternalError):
             if not allow_not_found:
                 raise
 
@@ -1225,19 +1262,10 @@ class DatabricksPath:
         else:
             raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
 
-    def write_bytes(self, data: bytes):
-        if hasattr(data, "read"):
-            data = data.read()
-
-        with self.open("wb") as f:
-            f.write_all_bytes(data=data)
-
     def temporary_credentials(
         self,
         operation: Optional["PathOperation"] = None
     ):
-        from databricks.sdk.service.catalog import PathOperation
-
         if self.kind != DatabricksPathKind.VOLUME:
             raise ValueError(f"Cannot generate temporary credentials for {repr(self)}")
 

@@ -1250,6 +1278,14 @@ class DatabricksPath:
             operation=operation or PathOperation.PATH_READ,
         )
 
+    def read_bytes(self, use_cache: bool = False):
+        with self.open("rb") as f:
+            return f.read_all_bytes(use_cache=use_cache)
+
+    def write_bytes(self, data: Union[bytes, IO[bytes]]):
+        with self.open("wb") as f:
+            f.write_all_bytes(data=data)
+
     # -------------------------
     # Data ops (Arrow / Pandas / Polars)
     # -------------------------
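write_bytes now sits next to a new read_bytes, and its Union[bytes, IO[bytes]] annotation makes explicit that it accepts either a bytes payload or a readable binary stream. A hedged sketch; the import path follows the file list and the path value is illustrative:

```python
import io

# Hedged sketch of the byte-level helpers shown above.
from yggdrasil.databricks.workspaces.path import DatabricksPath

path = DatabricksPath("/Volumes/main/sandbox/blob.bin")  # hypothetical path

path.write_bytes(b"\x00\x01\x02")          # raw bytes
path.write_bytes(io.BytesIO(b"streamed"))  # any IO[bytes] also works

payload = path.read_bytes()                # bytes of the object
cached = path.read_bytes(use_cache=True)   # use_cache is forwarded to read_all_bytes
```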
@@ -1358,7 +1394,6 @@ class DatabricksPath:
         table: pa.Table,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table to the path, sharding if needed.
 

@@ -1366,7 +1401,6 @@ class DatabricksPath:
             table: Arrow table to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.

@@ -1379,7 +1413,11 @@ class DatabricksPath:
                 part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
 
                 with part_path.open(mode="wb") as f:
-                    f.write_arrow_batch(
+                    f.write_arrow_batch(
+                        batch,
+                        file_format=file_format,
+                        batch_size=batch_size,
+                    )
 
             return connected
 

@@ -1389,7 +1427,6 @@ class DatabricksPath:
                 table,
                 file_format=file_format,
                 batch_size=batch_size,
-                **kwargs
             )
 
         return self
@@ -1399,7 +1436,6 @@ class DatabricksPath:
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         concat: bool = True,
-        **kwargs
     ):
         """Read the path into a pandas DataFrame.
 

@@ -1407,7 +1443,6 @@ class DatabricksPath:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
             concat: Whether to concatenate results for directories.
-            **kwargs: Format-specific options.
 
         Returns:
             A pandas DataFrame or list of DataFrames if concat=False.

@@ -1417,14 +1452,12 @@ class DatabricksPath:
                 file_format=file_format,
                 batch_size=batch_size,
                 concat=True,
-                **kwargs
             ).to_pandas()
 
         tables = self.read_arrow_table(
             batch_size=batch_size,
             file_format=file_format,
             concat=False,
-            **kwargs
         )
 
         return [t.to_pandas() for t in tables]  # type: ignore[arg-type]
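With the **kwargs passthrough removed, read_pandas takes only the explicit options above and either concatenates per-file tables or returns them as a list. A hedged sketch; the path value is illustrative:

```python
# Hedged sketch of the narrowed read_pandas surface shown above.
from yggdrasil.databricks.workspaces.path import DatabricksPath

path = DatabricksPath("/Volumes/main/sandbox/exports")  # hypothetical directory path

df = path.read_pandas()                 # one concatenated pandas DataFrame
parts = path.read_pandas(concat=False)  # list: one DataFrame per underlying file
```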
@@ -1434,7 +1467,6 @@ class DatabricksPath:
         df: PandasDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a pandas DataFrame to the path.
 

@@ -1442,7 +1474,6 @@ class DatabricksPath:
             df: pandas DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.

@@ -1463,7 +1494,6 @@ class DatabricksPath:
                         batch,
                         file_format=file_format,
                         batch_size=batch_size,
-                        **kwargs
                     )
             else:
                 with connected.open(mode="wb", clone=False) as f:

@@ -1471,7 +1501,6 @@ class DatabricksPath:
                         df,
                         file_format=file_format,
                         batch_size=batch_size,
-                        **kwargs
                     )
 
         return self
@@ -1523,7 +1552,6 @@ class DatabricksPath:
         df,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """
         Write Polars to a DatabricksPath.

@@ -1538,7 +1566,6 @@ class DatabricksPath:
             df: polars DataFrame or LazyFrame to write.
             file_format: Optional file format override.
             batch_size: Optional rows per part for directory sinks.
-            **kwargs: Format-specific options.
 
         Returns:
             The DatabricksPath instance.
@@ -1552,7 +1579,7 @@ class DatabricksPath:
         with self.connect() as connected:
             if connected.is_dir_sink():
                 seed = int(time.time() * 1000)
-                rows_per_part = batch_size or
+                rows_per_part = batch_size or 1024 * 1024
 
                 # Always parquet for directory sinks (lake layout standard)
                 for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):

@@ -1563,7 +1590,6 @@ class DatabricksPath:
                         df,
                         file_format=file_format,
                         batch_size=batch_size,
-                        **kwargs
                     )
             else:
                 with connected.open(mode="wb", clone=False) as f:
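For directory sinks the Polars writer now defaults to 1024 * 1024 rows per part when batch_size is not given. The sharding pattern itself can be sketched with public polars APIs; the local output naming below is illustrative, not the library's own layout:

```python
import time

import polars as pl

df = pl.DataFrame({"x": range(3_000_000)})  # any frame large enough to shard

seed = int(time.time() * 1000)
rows_per_part = 1024 * 1024  # the new default when batch_size is None

# Each slice becomes one parquet part, mirroring the loop in the hunk above.
for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
    chunk.write_parquet(f"/tmp/{seed}-{i:05d}.parquet")
```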
@@ -1571,11 +1597,33 @@ class DatabricksPath:
                         df,
                         file_format=file_format,
                         batch_size=batch_size,
-                        **kwargs
                     )
 
         return self
 
+    def read_pickle(
+        self,
+    ) -> Any:
+        content = self.read_bytes()
+        obj = dill.loads(content)
+
+        return obj
+
+    def write_pickle(
+        self,
+        obj: Any,
+        file_format: Optional[FileFormat] = None,
+    ):
+        buffer = io.BytesIO()
+
+        if isinstance(obj, PandasDataFrame):
+            obj.to_pickle(buffer)
+        else:
+            buffer.write(dill.dumps(obj))
+
+        self.write_bytes(data=buffer.getvalue())
+
+
     def sql(
         self,
         query: str,
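read_pickle and write_pickle are new: objects round-trip through dill, with pandas DataFrames taking the DataFrame.to_pickle branch on write. A hedged usage sketch; the path value is illustrative:

```python
# Hedged sketch of the new pickle helpers shown above.
from yggdrasil.databricks.workspaces.path import DatabricksPath

path = DatabricksPath("/Volumes/main/sandbox/state.pkl")  # hypothetical path

path.write_pickle({"weights": [0.1, 0.2], "epoch": 3})  # serialized via dill.dumps
state = path.read_pickle()                              # deserialized via dill.loads

# A pandas DataFrame is instead written with DataFrame.to_pickle;
# read_pickle always runs dill.loads on the stored bytes.
```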
@@ -1637,19 +1685,7 @@ class DatabricksPath:
                 "Invalid engine %s, must be in duckdb, polars" % engine
             )
 
-
 if databricks is not None:
-    @register_converter(DatabricksPath, pa.Table)
-    def databricks_path_to_arrow_table(
-        data: DatabricksPath,
-        options: Optional[CastOptions] = None,
-    ) -> pa.Table:
-        return cast_arrow_tabular(
-            data.read_arrow_table(),
-            options
-        )
-
-
     @register_converter(DatabricksPath, ds.Dataset)
     def databricks_path_to_arrow_table(
         data: DatabricksPath,
@@ -1658,6 +1694,16 @@ if databricks is not None:
         return data.arrow_dataset()
 
 
+    @pandas_converter(DatabricksPath, PandasDataFrame)
+    def databricks_path_to_pandas(
+        data: DatabricksPath,
+        options: Optional[CastOptions] = None,
+    ) -> PolarsDataFrame:
+        return cast_pandas_dataframe(
+            data.read_pandas(),
+            options
+        )
+
     @polars_converter(DatabricksPath, PolarsDataFrame)
     def databricks_path_to_polars(
         data: DatabricksPath,
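The dropped pa.Table converter and the new pandas converter change how DatabricksPath plugs into the cast registry. How callers consume that registry is not shown in this diff; the sketch below assumes the imported convert(value, target) helper dispatches on the registered (source, target) pair and that PandasDataFrame aliases pandas.DataFrame:

```python
# Hedged sketch only: convert()'s exact signature and the PandasDataFrame alias
# are assumptions, not confirmed by this diff.
import pandas as pd
import pyarrow.dataset as ds

from yggdrasil.databricks.workspaces.path import DatabricksPath
from yggdrasil.types.cast.registry import convert

path = DatabricksPath("/Volumes/main/sandbox/exports")  # hypothetical path

pdf = convert(path, pd.DataFrame)    # would route through databricks_path_to_pandas
dataset = convert(path, ds.Dataset)  # the ds.Dataset converter kept above
```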