ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +31 -34
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +148 -1
- yggdrasil/databricks/sql/types.py +49 -1
- yggdrasil/databricks/workspaces/__init__.py +4 -1
- yggdrasil/databricks/workspaces/filesytem.py +344 -0
- yggdrasil/databricks/workspaces/io.py +1123 -0
- yggdrasil/databricks/workspaces/path.py +1415 -0
- yggdrasil/databricks/workspaces/path_kind.py +13 -0
- yggdrasil/databricks/workspaces/workspace.py +298 -154
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +131 -0
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.30.dist-info/RECORD +0 -56
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
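The headline change in 0.1.32 is the removal of the monolithic `yggdrasil/databricks/workspaces/databricks_path.py` (784 lines), whose responsibilities are split across the new `path.py`, `path_kind.py`, `io.py`, and `filesytem.py` modules. The removed module is reproduced in full below; as a reading aid, here is a minimal usage sketch of the API it exposed, inferred from the deleted source (a reachable Databricks workspace with valid credentials is assumed, and the catalog/schema/volume names are placeholders):

# Usage sketch of the removed DatabricksPath API (ygg <= 0.1.30), inferred
# from the deleted source below; the path components are placeholders.
from yggdrasil.databricks.workspaces.databricks_path import DatabricksPath

# parse() infers the kind from the first path component:
# "Volumes" -> VOLUME, "Workspace" -> WORKSPACE, "dbfs" -> DBFS.
path = DatabricksPath.parse("/Volumes/main/default/landing/data.csv")

with path.open("w") as f:        # buffered in memory, uploaded on close
    f.write("id,value\n1,42\n")

for child in path.parent.ls():   # enumerate the enclosing directory
    print(child, child.is_file())

path.remove()                    # deletes the file (or the tree for a dir)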
--- a/yggdrasil/databricks/workspaces/databricks_path.py
+++ /dev/null
@@ -1,784 +0,0 @@
-# src/yggdrasil/databricks/workspaces/databricks_path.py
-from __future__ import annotations
-
-import dataclasses
-import io
-import time
-from contextlib import contextmanager
-from enum import Enum
-from pathlib import PurePosixPath
-from typing import BinaryIO, Iterator, Optional, Tuple, Union, TYPE_CHECKING, List
-
-from databricks.sdk.service.catalog import VolumeType
-
-from ...libs.databrickslib import databricks
-
-if databricks is not None:
-    from databricks.sdk.service.workspace import ImportFormat, ObjectType
-    from databricks.sdk.errors.platform import (
-        NotFound,
-        ResourceDoesNotExist,
-        BadRequest,
-        PermissionDenied,
-        AlreadyExists,
-        ResourceAlreadyExists,
-    )
-
-    NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
-    ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest
-
-if TYPE_CHECKING:
-    from .workspace import Workspace
-
-
-__all__ = [
-    "DatabricksPathKind",
-    "DatabricksPath",
-]
-
-
-def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
-    if isinstance(parts, str):
-        parts = [parts]
-
-    if any("/" in part for part in parts):
-        # flatten parts with slashes
-        new_parts = []
-        for part in parts:
-            split_parts = part.split("/")
-            new_parts.extend(split_parts)
-        parts = new_parts
-
-    return parts
-
-
-class DatabricksPathKind(str, Enum):
-    WORKSPACE = "workspace"
-    VOLUME = "volume"
-    DBFS = "dbfs"
-
-
-@dataclasses.dataclass
-class DatabricksPath:
-    kind: "DatabricksPathKind"
-    parts: List[str]
-    workspace: Optional["Workspace"] = None
-
-    _is_file: Optional[bool] = None
-    _is_dir: Optional[bool] = None
-
-    _raw_status: Optional[dict] = None
-    _raw_status_refresh_time: float = 0.0
-
-    @classmethod
-    def parse(
-        cls,
-        parts: Union[List[str], str],
-        workspace: Optional["Workspace"] = None,
-    ) -> "DatabricksPath":
-        if not parts:
-            return DatabricksPath(
-                kind=DatabricksPathKind.DBFS,
-                parts=[],
-                workspace=workspace,
-            )
-
-        parts = _flatten_parts(parts)
-
-        if not parts[0]:
-            parts = parts[1:]
-
-        if not parts:
-            return DatabricksPath(
-                kind=DatabricksPathKind.DBFS,
-                parts=[],
-                workspace=workspace,
-            )
-
-        head, *tail = parts
-
-        if head == "dbfs":
-            kind = DatabricksPathKind.DBFS
-        elif head == "Workspace":
-            kind = DatabricksPathKind.WORKSPACE
-        elif head == "Volumes":
-            kind = DatabricksPathKind.VOLUME
-        else:
-            raise ValueError(f"Invalid DatabricksPath prefix: {parts!r}")
-
-        return DatabricksPath(
-            kind=kind,
-            parts=tail,
-            workspace=workspace,
-        )
-
-    def __hash__(self):
-        return hash((self.kind, tuple(self.parts)))
-
-    def __eq__(self, other):
-        if not isinstance(other, DatabricksPath):
-            if isinstance(other, str):
-                return str(self) == other
-            return False
-        return self.kind == other.kind and self.parts == other.parts
-
-    def __truediv__(self, other):
-        if not other:
-            return self
-
-        other_parts = _flatten_parts(other)
-
-        built = DatabricksPath(
-            kind=self.kind,
-            parts=self.parts + other_parts,
-            workspace=self.workspace,
-        )
-
-        return built
-
-    def __enter__(self):
-        self.safe_workspace.__enter__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        return self.safe_workspace.__exit__(exc_type, exc_val, exc_tb)
-
-    def __str__(self):
-        if self.kind == DatabricksPathKind.DBFS:
-            return self.as_dbfs_api_path()
-        elif self.kind == DatabricksPathKind.WORKSPACE:
-            return self.as_workspace_api_path()
-        elif self.kind == DatabricksPathKind.VOLUME:
-            return self.as_files_api_path()
-        else:
-            raise ValueError(f"Unknown DatabricksPath kind: {self.kind!r}")
-
-    def __repr__(self):
-        return "dbfs://%s" % self.__str__()
-
-    @property
-    def parent(self):
-        if not self.parts:
-            return self
-
-        if self._is_file is not None or self._is_dir is not None:
-            _is_file, _is_dir = False, True
-        else:
-            _is_file, _is_dir = None, None
-
-        built = DatabricksPath(
-            kind=self.kind,
-            parts=self.parts[:-1],
-            workspace=self.workspace,
-            _is_file=_is_file,
-            _is_dir=_is_dir,
-        )
-
-        return built
-
-    @property
-    def safe_workspace(self):
-        if self.workspace is None:
-            from .workspace import Workspace
-
-            self.workspace = Workspace()
-        return self.workspace
-
-    @safe_workspace.setter
-    def safe_workspace(self, value):
-        self.workspace = value
-
-    def is_file(self):
-        if self._is_file is None:
-            self.refresh_status()
-        return self._is_file
-
-    def is_dir(self):
-        if self._is_dir is None:
-            self.refresh_status()
-        return self._is_dir
-
-    def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
-        if self.kind != DatabricksPathKind.VOLUME:
-            return None, None, None, None
-
-        catalog = self.parts[0] if len(self.parts) > 0 and self.parts[0] else None
-        schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
-        volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None
-
-        return catalog, schema, volume, self.parts[3:]
-
-    def refresh_status(self):
-        with self as connected:
-            sdk = connected.safe_workspace.sdk()
-
-            try:
-                if connected.kind == DatabricksPathKind.VOLUME:
-                    info = sdk.files.get_metadata(connected.as_files_api_path())
-
-                    connected._raw_status = info
-                    connected._is_file, connected._is_dir = True, False
-                elif connected.kind == DatabricksPathKind.WORKSPACE:
-                    info = sdk.workspace.get_status(connected.as_workspace_api_path())
-
-                    is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
-                    connected._raw_status = info
-                    connected._is_file, connected._is_dir = not is_dir, is_dir
-                else:
-                    info = sdk.dbfs.get_status(connected.as_dbfs_api_path())
-
-                    connected._raw_status = info
-                    connected._is_file, connected._is_dir = (not info.is_dir), info.is_dir
-
-                connected._raw_status_refresh_time = time.time()
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                found = next(connected.ls(fetch_size=1, recursive=False, raise_error=False), None)
-
-                if found is None:
-                    connected._is_file, connected._is_dir = False, False
-                else:
-                    connected._is_file, connected._is_dir = False, True
-
-            return connected
-
-    def clear_cache(self):
-        self._raw_status = None
-        self._raw_status_refresh_time = 0
-
-        self._is_file = None
-        self._is_dir = None
-
-    # ---- API path normalization helpers ----
-
-    def as_workspace_api_path(self) -> str:
-        """
-        Workspace API typically uses paths like /Users/... (not /Workspace/Users/...)
-        so we strip the leading /Workspace when present.
-        """
-        return "/Workspace/%s" % "/".join(self.parts) if self.parts else "/Workspace"
-
-    def as_dbfs_api_path(self) -> str:
-        """
-        DBFS REST wants absolute DBFS paths like /tmp/x.
-        If the user passes /dbfs/tmp/x (FUSE-style), strip the /dbfs prefix.
-        """
-        return "/dbfs/%s" % "/".join(self.parts) if self.parts else "/dbfs"
-
-    def as_files_api_path(self) -> str:
-        """
-        Files API takes absolute paths, e.g. /Volumes/<...>/file
-        """
-        return "/Volumes/%s" % "/".join(self.parts) if self.parts else "/Volumes"
-
-    def exists(self) -> bool:
-        if self.is_file():
-            return True
-        if self.is_dir():
-            return True
-        return False
-
-    def mkdir(self, parents=True, exist_ok=True):
-        """
-        Create a new directory at this given path.
-        """
-        with self as connected:
-            connected.clear_cache()
-
-            try:
-                if connected.kind == DatabricksPathKind.WORKSPACE:
-                    connected.safe_workspace.sdk().workspace.mkdirs(self.as_workspace_api_path())
-                elif connected.kind == DatabricksPathKind.VOLUME:
-                    return connected._create_volume_dir(parents=parents, exist_ok=exist_ok)
-                elif connected.kind == DatabricksPathKind.DBFS:
-                    connected.safe_workspace.sdk().dbfs.mkdirs(self.as_dbfs_api_path())
-
-                connected._is_file, connected._is_dir = False, True
-            except (NotFound, ResourceDoesNotExist):
-                if not parents or self.parent == self:
-                    raise
-
-                connected.parent.mkdir(parents=True, exist_ok=True)
-                connected.mkdir(parents=False, exist_ok=exist_ok)
-            except (AlreadyExists, ResourceAlreadyExists):
-                if not exist_ok:
-                    raise
-
-    def _ensure_volume(self, exist_ok: bool = True):
-        catalog_name, schema_name, volume_name, rel = self.volume_parts()
-        sdk = self.safe_workspace.sdk()
-
-        if catalog_name:
-            try:
-                sdk.catalogs.create(name=catalog_name)
-            except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
-                if not exist_ok:
-                    raise
-
-        if schema_name:
-            try:
-                sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
-            except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
-                if not exist_ok:
-                    raise
-
-        if volume_name:
-            try:
-                sdk.volumes.create(
-                    catalog_name=catalog_name,
-                    schema_name=schema_name,
-                    name=volume_name,
-                    volume_type=VolumeType.MANAGED,
-                )
-            except (AlreadyExists, ResourceAlreadyExists, BadRequest):
-                if not exist_ok:
-                    raise
-
-    def _create_volume_dir(self, parents=True, exist_ok=True):
-        path = self.as_files_api_path()
-        sdk = self.safe_workspace.sdk()
-
-        try:
-            sdk.files.create_directory(path)
-        except (BadRequest, NotFound, ResourceDoesNotExist) as e:
-            if not parents:
-                raise
-
-            message = str(e)
-
-            if "olume" in message and "not exist" in message:
-                self._ensure_volume()
-
-            sdk.files.create_directory(path)
-        except (AlreadyExists, ResourceAlreadyExists, BadRequest):
-            if not exist_ok:
-                raise
-
-        self.clear_cache()
-        self._is_file, self._is_dir = False, True
-
-    def remove(self, recursive: bool = True):
-        if self.is_file():
-            return self.rmfile()
-        else:
-            return self.rmdir(recursive=recursive)
-
-    def rmfile(self):
-        try:
-            if self.kind == DatabricksPathKind.VOLUME:
-                return self._remove_volume_file()
-            elif self.kind == DatabricksPathKind.WORKSPACE:
-                return self._remove_workspace_file()
-            elif self.kind == DatabricksPathKind.DBFS:
-                return self._remove_dbfs_file()
-        finally:
-            self.clear_cache()
-
-    def _remove_volume_file(self):
-        sdk = self.safe_workspace.sdk()
-
-        try:
-            sdk.files.delete(self.as_files_api_path())
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-    def _remove_workspace_file(self):
-        sdk = self.safe_workspace.sdk()
-
-        try:
-            sdk.workspace.delete(self.as_workspace_api_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-    def _remove_dbfs_file(self):
-        sdk = self.safe_workspace.sdk()
-
-        try:
-            sdk.dbfs.delete(self.as_dbfs_api_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-    def rmdir(self, recursive: bool = True):
-        with self as connected:
-            try:
-                if connected.kind == DatabricksPathKind.WORKSPACE:
-                    connected.safe_workspace.sdk().workspace.delete(
-                        self.as_workspace_api_path(),
-                        recursive=recursive,
-                    )
-                elif connected.kind == DatabricksPathKind.VOLUME:
-                    return self._remove_volume_dir(recursive=recursive)
-                else:
-                    connected.safe_workspace.sdk().dbfs.delete(
-                        self.as_dbfs_api_path(),
-                        recursive=recursive,
-                    )
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                pass
-            finally:
-                connected.clear_cache()
-
-    def _remove_volume_dir(self, recursive: bool = True):
-        root_path = self.as_files_api_path()
-        catalog_name, schema_name, volume_name, rel = self.volume_parts()
-
-        sdk = self.safe_workspace.sdk()
-
-        if rel:
-            try:
-                sdk.files.delete_directory(root_path)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
-                message = str(e)
-
-                if recursive and "directory is not empty" in message:
-                    for child_path in self.ls():
-                        child_path.remove(recursive=True)
-                    sdk.files.delete_directory(root_path)
-                else:
-                    pass
-        elif volume_name:
-            try:
-                sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                pass
-        elif schema_name:
-            try:
-                sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                pass
-
-        self.clear_cache()
-
-    def ls(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
-        if self.kind == DatabricksPathKind.VOLUME:
-            for _ in self._ls_volume(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
-                yield _
-        elif self.kind == DatabricksPathKind.WORKSPACE:
-            for _ in self._ls_workspace(recursive=recursive, raise_error=raise_error):
-                yield _
-        elif self.kind == DatabricksPathKind.DBFS:
-            for _ in self._ls_dbfs(recursive=recursive, raise_error=raise_error):
-                yield _
-
-    def _ls_volume(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
-        catalog_name, schema_name, volume_name, rel = self.volume_parts()
-        sdk = self.safe_workspace.sdk()
-
-        if rel is None:
-            if volume_name is None:
-                try:
-                    for info in sdk.volumes.list(
-                        catalog_name=catalog_name,
-                        schema_name=schema_name,
-                    ):
-                        base = DatabricksPath(
-                            kind=DatabricksPathKind.VOLUME,
-                            parts=[info.catalog_name, info.schema_name, info.name],
-                            workspace=self.safe_workspace,
-                            _is_file=False,
-                            _is_dir=True,
-                        )
-
-                        if recursive:
-                            for sub in base._ls_volume(recursive=recursive):
-                                yield sub
-                        else:
-                            yield base
-                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                    if raise_error:
-                        raise
-            elif schema_name is None:
-                try:
-                    for info in sdk.schemas.list(catalog_name=catalog_name):
-                        base = DatabricksPath(
-                            kind=DatabricksPathKind.VOLUME,
-                            parts=[info.catalog_name, info.name],
-                            workspace=self.safe_workspace,
-                            _is_file=False,
-                            _is_dir=True,
-                        )
-
-                        if recursive:
-                            for sub in base._ls_volume(recursive=recursive):
-                                yield sub
-                        else:
-                            yield base
-                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                    if raise_error:
-                        raise
-            else:
-                try:
-                    for info in sdk.catalogs.list():
-                        base = DatabricksPath(
-                            kind=DatabricksPathKind.VOLUME,
-                            parts=[info.name],
-                            workspace=self.safe_workspace,
-                            _is_file=False,
-                            _is_dir=True,
-                        )
-
-                        if recursive:
-                            for sub in base._ls_volume(recursive=recursive):
-                                yield sub
-                        else:
-                            yield base
-                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                    if raise_error:
-                        raise
-        else:
-            try:
-                for info in sdk.files.list_directory_contents(self.as_files_api_path(), page_size=fetch_size):
-                    base = DatabricksPath(
-                        kind=DatabricksPathKind.VOLUME,
-                        parts=info.path.split("/")[2:],
-                        workspace=self.safe_workspace,
-                        _is_file=not info.is_directory,
-                        _is_dir=info.is_directory,
-                    )
-
-                    if recursive and info.is_directory:
-                        for sub in base._ls_volume(recursive=recursive):
-                            yield sub
-                    else:
-                        yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                if raise_error:
-                    raise
-
-    def _ls_workspace(self, recursive: bool = True, raise_error: bool = True):
-        sdk = self.safe_workspace.sdk()
-
-        try:
-            for info in sdk.workspace.list(self.as_workspace_api_path(), recursive=recursive):
-                is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
-                base = DatabricksPath(
-                    kind=DatabricksPathKind.WORKSPACE,
-                    parts=info.path.split("/")[2:],
-                    workspace=self.safe_workspace,
-                    _is_file=not is_dir,
-                    _is_dir=is_dir,
-                )
-                yield base
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            if raise_error:
-                raise
-
-    def _ls_dbfs(self, recursive: bool = True, raise_error: bool = True):
-        sdk = self.safe_workspace.sdk()
-
-        try:
-            # FIX: DBFS listing should use DBFS-normalized path, not workspace path
-            p = self.as_dbfs_api_path()
-
-            for info in sdk.dbfs.list(p, recursive=recursive):
-                base = DatabricksPath(
-                    kind=DatabricksPathKind.DBFS,
-                    parts=info.path.split("/")[2:],
-                    workspace=self.safe_workspace,
-                    _is_file=not info.is_dir,
-                    _is_dir=info.is_dir,
-                )
-
-                yield base
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            if raise_error:
-                raise
-
-    @contextmanager
-    def open(
-        self,
-        mode="r",
-        encoding=None,
-    ) -> Iterator[Union[BinaryIO, io.TextIOBase]]:
-        """
-        Open this Databricks path using databricks-sdk's WorkspaceClient.
-
-        Supported:
-        - read: "rb", "r"
-        - write: "wb", "w" (buffered; uploads on close for WORKSPACE/VOLUME)
-        """
-        if mode not in {"rb", "r", "wb", "w"}:
-            raise ValueError(f"Unsupported mode {mode!r}. Use r/rb/w/wb.")
-
-        if encoding is None:
-            encoding = None if "b" in mode else "utf-8"
-        reading = "r" in mode
-
-        if reading:
-            with self.open_read(encoding=encoding) as f:
-                yield f
-        else:
-            with self.open_write(encoding=encoding) as f:
-                yield f
-
-    @contextmanager
-    def open_read(self, encoding: str | None = None):
-        with self as connected:
-            if connected.kind == DatabricksPathKind.VOLUME:
-                with connected._open_read_volume(encoding=encoding) as f:
-                    yield f
-            elif connected.kind == DatabricksPathKind.WORKSPACE:
-                with connected._open_read_workspace(encoding=encoding) as f:
-                    yield f
-            else:
-                with connected._open_read_dbfs(encoding=encoding) as f:
-                    yield f
-
-    @contextmanager
-    def _open_read_volume(self, encoding: str | None = None):
-        workspace_client = self.safe_workspace.sdk()
-        path = self.as_files_api_path()
-
-        resp = workspace_client.files.download(path)
-        raw = io.BytesIO(resp.contents.read())
-
-        if encoding is not None:
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-    @contextmanager
-    def _open_read_workspace(self, encoding: str | None = None):
-        workspace_client = self.safe_workspace.sdk()
-        path = self.as_workspace_api_path()
-
-        raw = workspace_client.workspace.download(path)  # returns BinaryIO
-
-        if encoding is not None:
-            raw = io.BytesIO(raw.read())
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-    @contextmanager
-    def _open_read_dbfs(self, encoding: str | None = None):
-        workspace_client = self.safe_workspace.sdk()
-        path = self.as_dbfs_api_path()
-
-        raw = workspace_client.dbfs.open(path, read=True)
-
-        if encoding is not None:
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-    @contextmanager
-    def open_write(self, encoding: str | None = None):
-        with self as connected:
-            if connected.kind == DatabricksPathKind.VOLUME:
-                with connected._open_write_volume(encoding=encoding) as f:
-                    yield f
-            elif connected.kind == DatabricksPathKind.WORKSPACE:
-                with connected._open_write_workspace(encoding=encoding) as f:
-                    yield f
-            else:
-                with connected._open_write_dbfs(encoding=encoding) as f:
-                    yield f
-
-    @contextmanager
-    def _open_write_volume(self, encoding: str | None = None, overwrite: bool = True):
-        workspace_client = self.safe_workspace.sdk()
-        path = self.as_files_api_path()
-
-        buf = io.BytesIO()
-
-        if encoding is not None:
-            tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
-            try:
-                yield tw
-            finally:
-                tw.flush()
-                buf.seek(0)
-
-                try:
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-                except (NotFound, ResourceDoesNotExist, BadRequest):
-                    self.parent.mkdir(parents=True, exist_ok=True)
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-
-                tw.detach()
-        else:
-            try:
-                yield buf
-            finally:
-                buf.seek(0)
-
-                try:
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-                except (NotFound, ResourceDoesNotExist, BadRequest):
-                    self.parent.mkdir(parents=True, exist_ok=True)
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-
-    @contextmanager
-    def _open_write_workspace(self, encoding: str | None = None, overwrite: bool = True):
-        workspace_client = self.safe_workspace.sdk()
-        path = self.as_workspace_api_path()
-
-        buf = io.BytesIO()
-
-        if encoding is not None:
-            tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
-            try:
-                yield tw
-            finally:
-                tw.flush()
-                buf.seek(0)
-
-                try:
-                    workspace_client.workspace.upload(
-                        path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                    )
-                except Exception as e:
-                    message = str(e)
-                    if "parent folder" in message and "does not exist" in message:
-                        self.parent.mkdir(parents=True)
-                        buf.seek(0)
-                        workspace_client.workspace.upload(
-                            path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                        )
-                    else:
-                        raise
-
-                tw.detach()
-        else:
-            try:
-                yield buf
-            finally:
-                buf.seek(0)
-
-                try:
-                    workspace_client.workspace.upload(
-                        path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                    )
-                except Exception as e:
-                    message = str(e)
-                    if "parent folder" in message and "does not exist" in message:
-                        self.parent.mkdir(parents=True)
-                        buf.seek(0)
-                        workspace_client.workspace.upload(
-                            path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                        )
-                    else:
-                        raise
-
-    @contextmanager
-    def _open_write_dbfs(self, encoding: str | None = None, overwrite: bool = True):
-        workspace_client = self.safe_workspace.sdk()
-        path = self.as_dbfs_api_path()
-
-        raw = workspace_client.dbfs.open(path, write=True, overwrite=overwrite)
-
-        if encoding is not None:
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-        self.clear_cache()
-        self._is_file, self._is_dir = True, False