ygg 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/METADATA +1 -1
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/RECORD +23 -20
- yggdrasil/databricks/compute/cluster.py +41 -21
- yggdrasil/databricks/compute/execution_context.py +9 -10
- yggdrasil/databricks/compute/remote.py +10 -6
- yggdrasil/databricks/jobs/config.py +2 -30
- yggdrasil/databricks/sql/engine.py +4 -2
- yggdrasil/databricks/sql/statement_result.py +18 -3
- yggdrasil/databricks/sql/types.py +16 -0
- yggdrasil/databricks/workspaces/__init__.py +3 -1
- yggdrasil/databricks/workspaces/filesytem.py +161 -0
- yggdrasil/databricks/workspaces/io.py +745 -0
- yggdrasil/databricks/workspaces/path.py +1120 -0
- yggdrasil/databricks/workspaces/path_kind.py +10 -0
- yggdrasil/databricks/workspaces/workspace.py +146 -562
- yggdrasil/pyutils/callable_serde.py +1 -0
- yggdrasil/pyutils/modules.py +1 -1
- yggdrasil/pyutils/python_env.py +81 -264
- yggdrasil/types/cast/arrow_cast.py +9 -0
- yggdrasil/databricks/workspaces/databricks_path.py +0 -875
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/WHEEL +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/top_level.txt +0 -0
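
The headline change is the removal of `yggdrasil/databricks/workspaces/databricks_path.py` (875 lines), whose responsibilities appear to move into the new `path.py`, `path_kind.py`, `io.py`, and `filesytem.py` modules. Before the full diff of the removed file below, here is a minimal standalone sketch, re-implemented for illustration and not taken from either version, of the prefix rules its `DatabricksPathKind.parse` used to classify raw paths (the real method additionally expands `/Users/me` and threads a `Workspace` through):

```python
# Standalone sketch (not the package's code): the prefix dispatch that the
# removed DatabricksPathKind.parse applied when classifying a raw path.
from enum import Enum
import urllib.parse as urlparse


class Kind(str, Enum):
    WORKSPACE = "workspace"
    VOLUME = "volume"
    DBFS = "dbfs"


def classify(path: str) -> tuple[Kind, str]:
    # /Workspace, /Users, /Shared -> workspace tree
    if path.startswith(("/Workspace", "/Users", "/Shared")):
        return Kind.WORKSPACE, path
    # /Volumes/<catalog>/<schema>/<volume>/... -> Unity Catalog volume
    if path.startswith("/Volumes"):
        return Kind.VOLUME, path
    # dbfs://host/inner -> recurse on the URL's path component
    if path.startswith("dbfs://"):
        return classify(urlparse.urlparse(path).path)
    # everything else is treated as a DBFS path
    return Kind.DBFS, path


assert classify("/Users/someone@example.com/nb")[0] is Kind.WORKSPACE
assert classify("/Volumes/main/default/raw/x.csv")[0] is Kind.VOLUME
assert classify("dbfs://host.example.com/tmp/x")[0] is Kind.DBFS
assert classify("/tmp/x")[0] is Kind.DBFS
```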
--- a/yggdrasil/databricks/workspaces/databricks_path.py
+++ /dev/null
@@ -1,875 +0,0 @@
-# src/yggdrasil/databricks/workspaces/databricks_path.py
-from __future__ import annotations
-
-import io
-import time
-import urllib.parse as urlparse
-from contextlib import contextmanager
-from enum import Enum
-from pathlib import PurePosixPath, Path as SysPath
-from typing import Any, BinaryIO, Iterator, Optional, Tuple, Union, TYPE_CHECKING
-
-from databricks.sdk.service.catalog import VolumeType
-
-from ...libs.databrickslib import databricks
-
-if databricks is not None:
-    from databricks.sdk.service.workspace import ImportFormat, ObjectType
-    from databricks.sdk.errors.platform import (
-        NotFound,
-        ResourceDoesNotExist,
-        BadRequest,
-        PermissionDenied,
-        AlreadyExists,
-        ResourceAlreadyExists,
-    )
-
-    NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
-    ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest
-
-if TYPE_CHECKING:
-    from .workspace import Workspace
-
-
-__all__ = [
-    "DatabricksPathKind",
-    "DatabricksPath",
-]
-
-
-def _seg_to_str(s) -> str:
-    # Handles DatabricksPath, PurePosixPath, Windows Path, etc.
-    if isinstance(s, SysPath):
-        return s.as_posix()
-    return str(s)
-
-
-class DatabricksPathKind(str, Enum):
-    WORKSPACE = "workspace"
-    VOLUME = "volume"
-    DBFS = "dbfs"
-
-    @classmethod
-    def parse(
-        cls,
-        path: str,
-        workspace: Optional["Workspace"] = None,
-    ) -> Tuple["DatabricksPathKind", Optional["Workspace"], str]:
-        from .workspace import Workspace
-
-        if path.startswith("/Workspace") or path.startswith("/Users") or path.startswith("/Shared"):
-            if path.startswith("/Users/me"):
-                workspace = Workspace() if workspace is None else workspace
-                path = path.replace("/Users/me", "/Users/%s" % workspace.current_user.user_name)
-
-            return cls.WORKSPACE, workspace, path
-
-        if path.startswith("/Volumes"):
-            return cls.VOLUME, workspace, path
-
-        if path.startswith("dbfs://"):
-            parsed = urlparse.urlparse(path)
-
-            # inner path is the URL path (e.g. /tmp/x or /Volumes/...)
-            kind, _, inner_path = cls.parse(parsed.path, workspace=workspace)
-
-            # hostname can be None for malformed/dbfs:// variants; fall back to default Workspace()
-            if workspace is None:
-                workspace = Workspace(host=parsed.hostname) if parsed.hostname else Workspace()
-
-            return kind, workspace, inner_path
-
-        return cls.DBFS, workspace, path
-
-
-class DatabricksPath(SysPath, PurePosixPath):
-    _kind: "DatabricksPathKind"
-    _workspace: Optional["Workspace"]
-
-    _is_file: Optional[bool]
-    _is_dir: Optional[bool]
-
-    _raw_status: Optional[dict]
-    _raw_status_refresh_time: float
-
-    @staticmethod
-    def _join_segments(pathsegments: tuple[Any, ...]) -> str:
-        if not pathsegments:
-            return ""
-
-        first = _seg_to_str(pathsegments[0])
-
-        # Keep dbfs:// URL-ish paths URL-ish (don't let PurePosixPath normalize it)
-        if first.startswith("dbfs://"):
-            rest = (_seg_to_str(s).lstrip("/") for s in pathsegments[1:])
-            first = first.rstrip("/")
-            tail = "/".join(rest)
-            return f"{first}/{tail}" if tail else first
-
-        return str(PurePosixPath(*(_seg_to_str(s) for s in pathsegments)))
-
-    def __new__(
-        cls,
-        *pathsegments: Any,
-        workspace: Optional["Workspace"] = None,
-        is_file: Optional[bool] = None,
-        is_dir: Optional[bool] = None,
-        raw_status: Optional[dict] = None,
-        raw_status_refresh_time: float = 0.0,
-    ) -> "DatabricksPath":
-        joined = cls._join_segments(pathsegments)
-        kind, parsed_ws, pure_path = DatabricksPathKind.parse(joined, workspace=workspace)
-
-        self = cls._from_parts([pure_path])  # pathlib-style construction (calls _init)
-
-        # Override with constructor-provided metadata
-        self._kind = kind
-        self._workspace = parsed_ws if workspace is None else workspace
-        self._is_file = is_file
-        self._is_dir = is_dir
-        self._raw_status = raw_status
-        self._raw_status_refresh_time = float(raw_status_refresh_time)
-
-        return self
-
-    def __init__(
-        self,
-        *pathsegments: Any,
-        workspace: Optional["Workspace"] = None,
-        is_file: Optional[bool] = None,
-        is_dir: Optional[bool] = None,
-        raw_status: Optional[dict] = None,
-        raw_status_refresh_time: float = 0.0,
-    ) -> None:
-        # pathlib paths are effectively immutable; all init happens in __new__ / _init
-        pass
-
-    def __truediv__(self, other):
-        if not other:
-            return self
-
-        built = super().__truediv__(other)
-
-        built._kind = self._kind
-        built._workspace = self._workspace
-
-        built._is_file = None
-        built._is_dir = None
-        built._raw_status = None
-        built._raw_status_refresh_time = 0.0
-
-        return built
-
-    def __enter__(self):
-        self.workspace.__enter__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        return self.workspace.__exit__(exc_type, exc_val, exc_tb)
-
-    def _clone_meta_from(self, template: "DatabricksPath") -> None:
-        """
-        Copy *connection/meta* state, but never copy caches.
-        Centralizes the logic so every creation path stays consistent.
-        """
-        # Keep workspace threading; kind should match the NEW path string.
-        kind, ws, _ = DatabricksPathKind.parse(str(self), workspace=getattr(template, "_workspace", None))
-        self._kind = kind
-        self._workspace = ws if ws is not None else getattr(template, "_workspace", None)
-
-        # Reset caches
-        self._is_file = None
-        self._is_dir = None
-        self._raw_status = None
-        self._raw_status_refresh_time = 0.0
-
-    @property
-    def parent(self):
-        built = super().parent
-
-        built._clone_meta_from(self)
-
-        return built
-
-    @classmethod
-    def _from_parsed_parts(cls, drv, root, parts):
-        """
-        pathlib internal factory. It may pass a template in some Python versions,
-        but if not, we still return a valid DatabricksPath with initialized state.
-        """
-        built = super()._from_parsed_parts(drv, root, parts)  # type: ignore[misc]
-
-        # Best effort: if pathlib gave us a template on the object, use it.
-        # Otherwise ensure we at least have valid defaults.
-        if isinstance(built, DatabricksPath) and isinstance(getattr(built, "_workspace", None), object):
-            # If the object already has workspace/kind via _init, don't stomp it.
-            # But if it's missing _kind (common failure), derive it.
-            if not hasattr(built, "_kind"):
-                kind, ws, _ = DatabricksPathKind.parse(str(built), workspace=getattr(built, "_workspace", None))
-                built._kind = kind
-                built._workspace = ws if ws is not None else getattr(built, "_workspace", None)
-
-            # Always reset caches (derived path => cache invalid)
-            built._is_file = None
-            built._is_dir = None
-            built._raw_status = None
-            built._raw_status_refresh_time = 0.0
-        else:
-            # Safety defaults (should be rare)
-            kind, ws, _ = DatabricksPathKind.parse(str(built))
-            built._kind = kind
-            built._workspace = ws
-            built._is_file = None
-            built._is_dir = None
-            built._raw_status = None
-            built._raw_status_refresh_time = 0.0
-
-        return built
-
-    def _make_child(self, args):
-        built = super()._make_child(args)  # type: ignore[misc]
-
-        # Ensure type + meta carryover
-        if isinstance(built, DatabricksPath):
-            built._clone_meta_from(self)
-        else:
-            # if for some reason super didn't return our type, try to coerce
-            built = type(self)(built, workspace=getattr(self, "_workspace", None))
-
-        return built
-
-    @property
-    def workspace(self):
-        if self._workspace is None:
-            from .workspace import Workspace
-
-            self._workspace = Workspace()
-        return self._workspace
-
-    @workspace.setter
-    def workspace(self, value):
-        self._workspace = value
-
-    @property
-    def kind(self):
-        return self._kind
-
-    @kind.setter
-    def kind(self, value: DatabricksPathKind):
-        self._kind = value
-
-    def is_file(self, *, follow_symlinks=True):
-        if self._is_file is None:
-            self.refresh_status()
-        return self._is_file
-
-    def is_dir(self, *, follow_symlinks=True):
-        if self._is_dir is None:
-            self.refresh_status()
-        return self._is_dir
-
-    def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
-        if self.kind != DatabricksPathKind.VOLUME:
-            return None, None, None, None
-
-        s = str(self)
-        segs = s.split("/")  # ['', 'Volumes', catalog?, schema?, volume?, ...]
-
-        # still keep the basic sanity check
-        if len(segs) < 2 or segs[1] != "Volumes":
-            raise ValueError(f"Invalid volume path: {s!r}")
-
-        catalog = segs[2] if len(segs) > 2 and segs[2] else None
-        schema = segs[3] if len(segs) > 3 and segs[3] else None
-        volume = segs[4] if len(segs) > 4 and segs[4] else None
-
-        # rel path only makes sense after /Volumes/<catalog>/<schema>/<volume>
-        if len(segs) > 5:
-            rel = "/".join(segs[5:])
-            rel_path = PurePosixPath(rel) if rel else PurePosixPath(".")
-        else:
-            rel_path = None
-
-        return catalog, schema, volume, rel_path
-
-    def refresh_status(self):
-        with self as connected:
-            sdk = connected.workspace.sdk()
-
-            try:
-                if connected.kind == DatabricksPathKind.VOLUME:
-                    info = sdk.files.get_metadata(connected.as_files_api_path())
-
-                    connected._raw_status = info
-                    connected._is_file, connected._is_dir = True, False
-                elif connected.kind == DatabricksPathKind.WORKSPACE:
-                    info = sdk.workspace.get_status(connected.as_workspace_api_path())
-
-                    is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
-                    connected._raw_status = info
-                    connected._is_file, connected._is_dir = not is_dir, is_dir
-                else:
-                    info = sdk.dbfs.get_status(connected.as_dbfs_api_path())
-
-                    connected._raw_status = info
-                    connected._is_file, connected._is_dir = (not info.is_dir), info.is_dir
-
-                connected._raw_status_refresh_time = time.time()
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                found = next(connected.ls(fetch_size=1, recursive=False, raise_error=False), None)
-
-                if found is None:
-                    connected._is_file, connected._is_dir = False, False
-                else:
-                    connected._is_file, connected._is_dir = False, True
-
-        return connected
-
-    def clear_cache(self):
-        self._raw_status = None
-        self._raw_status_refresh_time = 0
-
-        self._is_file = None
-        self._is_dir = None
-
-    # ---- API path normalization helpers ----
-
-    def as_workspace_api_path(self) -> str:
-        """
-        Workspace API typically uses paths like /Users/... (not /Workspace/Users/...)
-        so we strip the leading /Workspace when present.
-        """
-        s = str(self)
-        return s[len("/Workspace") :] if s.startswith("/Workspace") else s
-
-    def as_dbfs_api_path(self) -> str:
-        """
-        DBFS REST wants absolute DBFS paths like /tmp/x.
-        If the user passes /dbfs/tmp/x (FUSE-style), strip the /dbfs prefix.
-        """
-        s = str(self)
-        return s[len("/dbfs") :] if s.startswith("/dbfs") else s
-
-    def as_files_api_path(self) -> str:
-        """
-        Files API takes absolute paths, e.g. /Volumes/<...>/file
-        """
-        return str(self)
-
-    def with_segments(self, *pathsegments):
-        """Construct a new path object from any number of path-like objects.
-        Subclasses may override this method to customize how new path objects
-        are created from methods like `iterdir()`.
-        """
-        return type(self)(*pathsegments, workspace=self._workspace)
-
-    def exists(self, *, follow_symlinks=True) -> bool:
-        if self.is_file():
-            return True
-        if self.is_dir():
-            return True
-        return False
-
-    def mkdir(self, mode=0o777, parents=True, exist_ok=True):
-        """
-        Create a new directory at this given path.
-        """
-        with self as connected:
-            connected.clear_cache()
-
-            try:
-                if connected.kind == DatabricksPathKind.WORKSPACE:
-                    connected.workspace.sdk().workspace.mkdirs(self.as_workspace_api_path())
-                elif connected.kind == DatabricksPathKind.VOLUME:
-                    return connected._create_volume_dir(mode=mode, parents=parents, exist_ok=exist_ok)
-                elif connected._kind == DatabricksPathKind.DBFS:
-                    connected.workspace.sdk().dbfs.mkdirs(self.as_dbfs_api_path())
-
-                connected._is_file, connected._is_dir = False, True
-            except (NotFound, ResourceDoesNotExist):
-                if not parents or self.parent == self:
-                    raise
-
-                connected.parent.mkdir(parents=True, exist_ok=True)
-                connected.mkdir(mode, parents=False, exist_ok=exist_ok)
-            except (AlreadyExists, ResourceAlreadyExists):
-                if not exist_ok:
-                    raise
-
-    def _ensure_volume(self, exist_ok: bool = True):
-        catalog_name, schema_name, volume_name, rel = self.volume_parts()
-        sdk = self.workspace.sdk()
-
-        if catalog_name:
-            try:
-                sdk.catalogs.create(name=catalog_name)
-            except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
-                if not exist_ok:
-                    raise
-
-        if schema_name:
-            try:
-                sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
-            except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
-                if not exist_ok:
-                    raise
-
-        if volume_name:
-            try:
-                sdk.volumes.create(
-                    catalog_name=catalog_name,
-                    schema_name=schema_name,
-                    name=volume_name,
-                    volume_type=VolumeType.MANAGED,
-                )
-            except (AlreadyExists, ResourceAlreadyExists, BadRequest):
-                if not exist_ok:
-                    raise
-
-    def _create_volume_dir(self, mode=0o777, parents=True, exist_ok=True):
-        path = self.as_files_api_path()
-        sdk = self.workspace.sdk()
-
-        try:
-            sdk.files.create_directory(path)
-        except (BadRequest, NotFound, ResourceDoesNotExist) as e:
-            if not parents:
-                raise
-
-            message = str(e)
-
-            if "not exist" in message:
-                self._ensure_volume()
-
-                sdk.files.create_directory(path)
-        except (AlreadyExists, ResourceAlreadyExists, BadRequest):
-            if not exist_ok:
-                raise
-
-        self.clear_cache()
-        self._is_file, self._is_dir = False, True
-
-    def remove(self, recursive: bool = True):
-        if self.is_file():
-            return self.rmfile()
-        else:
-            return self.rmdir(recursive=recursive)
-
-    def rmfile(self):
-        try:
-            if self.kind == DatabricksPathKind.VOLUME:
-                return self._remove_volume_file()
-            elif self.kind == DatabricksPathKind.WORKSPACE:
-                return self._remove_workspace_file()
-            elif self.kind == DatabricksPathKind.DBFS:
-                return self._remove_dbfs_file()
-        finally:
-            self.clear_cache()
-
-    def _remove_volume_file(self):
-        sdk = self.workspace.sdk()
-
-        try:
-            sdk.files.delete(self.as_files_api_path())
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-    def _remove_workspace_file(self):
-        sdk = self.workspace.sdk()
-
-        try:
-            sdk.workspace.delete(self.as_workspace_api_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-    def _remove_dbfs_file(self):
-        sdk = self.workspace.sdk()
-
-        try:
-            sdk.dbfs.delete(self.as_dbfs_api_path(), recursive=True)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-    def rmdir(self, recursive: bool = True):
-        with self as connected:
-            try:
-                if connected.kind == DatabricksPathKind.WORKSPACE:
-                    connected.workspace.sdk().workspace.delete(
-                        self.as_workspace_api_path(),
-                        recursive=recursive,
-                    )
-                elif connected.kind == DatabricksPathKind.VOLUME:
-                    return self._remove_volume_dir(recursive=recursive)
-                else:
-                    connected.workspace.sdk().dbfs.delete(
-                        self.as_dbfs_api_path(),
-                        recursive=recursive,
-                    )
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                pass
-            finally:
-                connected.clear_cache()
-
-    def _remove_volume_dir(self, recursive: bool = True):
-        root_path = self.as_files_api_path()
-        catalog_name, schema_name, volume_name, rel = self.volume_parts()
-
-        sdk = self.workspace.sdk()
-
-        if rel is None:
-            try:
-                sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                pass
-        elif volume_name is None:
-            try:
-                sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                pass
-        else:
-            try:
-                sdk.files.delete_directory(root_path)
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
-                message = str(e)
-
-                if recursive and "directory is not empty" in message:
-                    for child_path in self.ls():
-                        child_path.remove(recursive=True)
-                    sdk.files.delete_directory(root_path)
-                else:
-                    pass
-
-        self.clear_cache()
-
-    def ls(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
-        if self.kind == DatabricksPathKind.VOLUME:
-            for _ in self._ls_volume(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
-                yield _
-        elif self.kind == DatabricksPathKind.WORKSPACE:
-            for _ in self._ls_workspace(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
-                yield _
-        elif self.kind == DatabricksPathKind.DBFS:
-            for _ in self._ls_dbfs(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
-                yield _
-
-    def _ls_volume(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
-        catalog_name, schema_name, volume_name, rel = self.volume_parts()
-        sdk = self.workspace.sdk()
-
-        if rel is None:
-            if volume_name is None:
-                try:
-                    for info in sdk.volumes.list(
-                        catalog_name=catalog_name,
-                        schema_name=schema_name,
-                    ):
-                        base = DatabricksPath(
-                            f"/Volumes/{info.catalog_name}/{info.schema_name}/{info.name}",
-                            workspace=self.workspace,
-                            is_file=False,
-                            is_dir=True,
-                        )
-
-                        if recursive:
-                            for sub in base._ls_volume(recursive=recursive):
-                                yield sub
-                        else:
-                            yield base
-                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                    if raise_error:
-                        raise
-            elif schema_name is None:
-                try:
-                    for info in sdk.schemas.list(catalog_name=catalog_name):
-                        base = DatabricksPath(
-                            f"/Volumes/{info.catalog_name}/{info.name}",
-                            workspace=self.workspace,
-                            is_file=False,
-                            is_dir=True,
-                        )
-
-                        if recursive:
-                            for sub in base._ls_volume(recursive=recursive):
-                                yield sub
-                        else:
-                            yield base
-                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                    if raise_error:
-                        raise
-            else:
-                try:
-                    for info in sdk.catalogs.list():
-                        base = DatabricksPath(
-                            f"/Volumes/{info.name}",
-                            workspace=self.workspace,
-                            is_file=False,
-                            is_dir=True,
-                        )
-
-                        if recursive:
-                            for sub in base._ls_volume(recursive=recursive):
-                                yield sub
-                        else:
-                            yield base
-                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                    if raise_error:
-                        raise
-        else:
-            try:
-                for info in sdk.files.list_directory_contents(self.as_files_api_path(), page_size=fetch_size):
-                    base = DatabricksPath(
-                        info.path,
-                        workspace=self.workspace,
-                        is_file=not info.is_directory,
-                        is_dir=info.is_directory,
-                    )
-
-                    if recursive and info.is_directory:
-                        for sub in base._ls_volume(recursive=recursive):
-                            yield sub
-                    else:
-                        yield base
-            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                if raise_error:
-                    raise
-
-    def _ls_workspace(self, recursive: bool = True, fetch_size: int = None, raise_error: bool = True):
-        sdk = self.workspace.sdk()
-
-        try:
-            for info in sdk.workspace.list(self.as_workspace_api_path(), recursive=recursive):
-                is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
-                base = DatabricksPath(
-                    info.path,
-                    workspace=self.workspace,
-                    is_file=not is_dir,
-                    is_dir=is_dir,
-                )
-                yield base
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            if raise_error:
-                raise
-
-    def _ls_dbfs(self, recursive: bool = True, fetch_size: int = None, raise_error: bool = True):
-        sdk = self.workspace.sdk()
-
-        try:
-            # FIX: DBFS listing should use DBFS-normalized path, not workspace path
-            p = "/dbfs/" + self.as_dbfs_api_path() + "/"
-            for info in sdk.dbfs.list(p, recursive=recursive):
-                base = DatabricksPath(
-                    info.path,
-                    workspace=self.workspace,
-                    is_file=not info.is_dir,
-                    is_dir=info.is_dir,
-                )
-                yield base
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            if raise_error:
-                raise
-
-    @contextmanager
-    def open(
-        self,
-        mode="r",
-        buffering=-1,
-        encoding=None,
-        errors=None,
-        newline=None,
-        *,
-        workspace: Optional["Workspace"] = None,
-        overwrite: bool = True,
-    ) -> Iterator[Union[BinaryIO, io.TextIOBase]]:
-        """
-        Open this Databricks path using databricks-sdk's WorkspaceClient.
-
-        Supported:
-        - read: "rb", "r"
-        - write: "wb", "w" (buffered; uploads on close for WORKSPACE/VOLUME)
-        """
-        if mode not in {"rb", "r", "wb", "w"}:
-            raise ValueError(f"Unsupported mode {mode!r}. Use r/rb/w/wb.")
-
-        if encoding is None:
-            encoding = None if "b" in mode else "utf-8"
-        reading = "r" in mode
-
-        if reading:
-            with self.open_read(encoding=encoding) as f:
-                yield f
-        else:
-            with self.open_write(encoding=encoding) as f:
-                yield f
-
-    @contextmanager
-    def open_read(self, encoding: str | None = None):
-        with self as connected:
-            if connected.kind == DatabricksPathKind.VOLUME:
-                with connected._open_read_volume(encoding=encoding) as f:
-                    yield f
-            elif connected.kind == DatabricksPathKind.WORKSPACE:
-                with connected._open_read_workspace(encoding=encoding) as f:
-                    yield f
-            else:
-                with connected._open_read_dbfs(encoding=encoding) as f:
-                    yield f
-
-    @contextmanager
-    def _open_read_volume(self, encoding: str | None = None):
-        workspace_client = self.workspace.sdk()
-        path = self.as_files_api_path()
-
-        resp = workspace_client.files.download(path)
-        raw = io.BytesIO(resp.contents.read())
-
-        if encoding is not None:
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-    @contextmanager
-    def _open_read_workspace(self, encoding: str | None = None):
-        workspace_client = self.workspace.sdk()
-        path = self.as_workspace_api_path()
-
-        raw = workspace_client.workspace.download(path)  # returns BinaryIO
-
-        if encoding is not None:
-            raw = io.BytesIO(raw.read())
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-    @contextmanager
-    def _open_read_dbfs(self, encoding: str | None = None):
-        workspace_client = self.workspace.sdk()
-        path = self.as_dbfs_api_path()
-
-        raw = workspace_client.dbfs.open(path, read=True)
-
-        if encoding is not None:
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-    @contextmanager
-    def open_write(self, encoding: str | None = None):
-        with self as connected:
-            if connected.kind == DatabricksPathKind.VOLUME:
-                with connected._open_write_volume(encoding=encoding) as f:
-                    yield f
-            elif connected.kind == DatabricksPathKind.WORKSPACE:
-                with connected._open_write_workspace(encoding=encoding) as f:
-                    yield f
-            else:
-                with connected._open_write_dbfs(encoding=encoding) as f:
-                    yield f
-
-    @contextmanager
-    def _open_write_volume(self, encoding: str | None = None, overwrite: bool = True):
-        workspace_client = self.workspace.sdk()
-        path = self.as_files_api_path()
-
-        buf = io.BytesIO()
-
-        if encoding is not None:
-            tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
-            try:
-                yield tw
-            finally:
-                tw.flush()
-                buf.seek(0)
-
-                try:
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-                except (NotFound, ResourceDoesNotExist, BadRequest):
-                    self.parent.mkdir(parents=True, exist_ok=True)
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-
-                tw.detach()
-        else:
-            try:
-                yield buf
-            finally:
-                buf.seek(0)
-
-                try:
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-                except (NotFound, ResourceDoesNotExist, BadRequest):
-                    self.parent.mkdir(parents=True, exist_ok=True)
-                    workspace_client.files.upload(path, buf, overwrite=overwrite)
-
-    @contextmanager
-    def _open_write_workspace(self, encoding: str | None = None, overwrite: bool = True):
-        workspace_client = self.workspace.sdk()
-        path = self.as_workspace_api_path()
-
-        buf = io.BytesIO()
-
-        if encoding is not None:
-            tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
-            try:
-                yield tw
-            finally:
-                tw.flush()
-                buf.seek(0)
-
-                try:
-                    workspace_client.workspace.upload(
-                        path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                    )
-                except Exception as e:
-                    message = str(e)
-                    if "parent folder" in message and "does not exist" in message:
-                        self.parent.mkdir(parents=True)
-                        buf.seek(0)
-                        workspace_client.workspace.upload(
-                            path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                        )
-                    else:
-                        raise
-
-                tw.detach()
-        else:
-            try:
-                yield buf
-            finally:
-                buf.seek(0)
-
-                try:
-                    workspace_client.workspace.upload(
-                        path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                    )
-                except Exception as e:
-                    message = str(e)
-                    if "parent folder" in message and "does not exist" in message:
-                        self.parent.mkdir(parents=True)
-                        buf.seek(0)
-                        workspace_client.workspace.upload(
-                            path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                        )
-                    else:
-                        raise
-
-    @contextmanager
-    def _open_write_dbfs(self, encoding: str | None = None, overwrite: bool = True):
-        workspace_client = self.workspace.sdk()
-        path = self.as_dbfs_api_path()
-
-        raw = workspace_client.dbfs.open(path, write=True, overwrite=overwrite)
-
-        if encoding is not None:
-            with io.TextIOWrapper(raw, encoding=encoding) as f:
-                yield f
-        else:
-            with raw as f:
-                yield f
-
-        self.clear_cache()
-        self._is_file, self._is_dir = True, False