ygg 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,875 +0,0 @@
1
- # src/yggdrasil/databricks/workspaces/databricks_path.py
2
- from __future__ import annotations
3
-
4
- import io
5
- import time
6
- import urllib.parse as urlparse
7
- from contextlib import contextmanager
8
- from enum import Enum
9
- from pathlib import PurePosixPath, Path as SysPath
10
- from typing import Any, BinaryIO, Iterator, Optional, Tuple, Union, TYPE_CHECKING
11
-
12
- from databricks.sdk.service.catalog import VolumeType
13
-
14
- from ...libs.databrickslib import databricks
15
-
16
- if databricks is not None:
17
- from databricks.sdk.service.workspace import ImportFormat, ObjectType
18
- from databricks.sdk.errors.platform import (
19
- NotFound,
20
- ResourceDoesNotExist,
21
- BadRequest,
22
- PermissionDenied,
23
- AlreadyExists,
24
- ResourceAlreadyExists,
25
- )
26
-
27
- NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
28
- ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest
29
-
30
- if TYPE_CHECKING:
31
- from .workspace import Workspace
32
-
33
-
34
- __all__ = [
35
- "DatabricksPathKind",
36
- "DatabricksPath",
37
- ]
38
-
39
-
40
- def _seg_to_str(s) -> str:
41
- # Handles DatabricksPath, PurePosixPath, Windows Path, etc.
42
- if isinstance(s, SysPath):
43
- return s.as_posix()
44
- return str(s)
45
-
46
-
47
- class DatabricksPathKind(str, Enum):
48
- WORKSPACE = "workspace"
49
- VOLUME = "volume"
50
- DBFS = "dbfs"
51
-
52
- @classmethod
53
- def parse(
54
- cls,
55
- path: str,
56
- workspace: Optional["Workspace"] = None,
57
- ) -> Tuple["DatabricksPathKind", Optional["Workspace"], str]:
58
- from .workspace import Workspace
59
-
60
- if path.startswith("/Workspace") or path.startswith("/Users") or path.startswith("/Shared"):
61
- if path.startswith("/Users/me"):
62
- workspace = Workspace() if workspace is None else workspace
63
- path = path.replace("/Users/me", "/Users/%s" % workspace.current_user.user_name)
64
-
65
- return cls.WORKSPACE, workspace, path
66
-
67
- if path.startswith("/Volumes"):
68
- return cls.VOLUME, workspace, path
69
-
70
- if path.startswith("dbfs://"):
71
- parsed = urlparse.urlparse(path)
72
-
73
- # inner path is the URL path (e.g. /tmp/x or /Volumes/...)
74
- kind, _, inner_path = cls.parse(parsed.path, workspace=workspace)
75
-
76
- # hostname can be None for malformed/dbfs:// variants; fall back to default Workspace()
77
- if workspace is None:
78
- workspace = Workspace(host=parsed.hostname) if parsed.hostname else Workspace()
79
-
80
- return kind, workspace, inner_path
81
-
82
- return cls.DBFS, workspace, path
83
-
84
-
85
- class DatabricksPath(SysPath, PurePosixPath):
86
- _kind: "DatabricksPathKind"
87
- _workspace: Optional["Workspace"]
88
-
89
- _is_file: Optional[bool]
90
- _is_dir: Optional[bool]
91
-
92
- _raw_status: Optional[dict]
93
- _raw_status_refresh_time: float
94
-
95
- @staticmethod
96
- def _join_segments(pathsegments: tuple[Any, ...]) -> str:
97
- if not pathsegments:
98
- return ""
99
-
100
- first = _seg_to_str(pathsegments[0])
101
-
102
- # Keep dbfs:// URL-ish paths URL-ish (don't let PurePosixPath normalize it)
103
- if first.startswith("dbfs://"):
104
- rest = (_seg_to_str(s).lstrip("/") for s in pathsegments[1:])
105
- first = first.rstrip("/")
106
- tail = "/".join(rest)
107
- return f"{first}/{tail}" if tail else first
108
-
109
- return str(PurePosixPath(*(_seg_to_str(s) for s in pathsegments)))
110
-
111
- def __new__(
112
- cls,
113
- *pathsegments: Any,
114
- workspace: Optional["Workspace"] = None,
115
- is_file: Optional[bool] = None,
116
- is_dir: Optional[bool] = None,
117
- raw_status: Optional[dict] = None,
118
- raw_status_refresh_time: float = 0.0,
119
- ) -> "DatabricksPath":
120
- joined = cls._join_segments(pathsegments)
121
- kind, parsed_ws, pure_path = DatabricksPathKind.parse(joined, workspace=workspace)
122
-
123
- self = cls._from_parts([pure_path]) # pathlib-style construction (calls _init)
124
-
125
- # Override with constructor-provided metadata
126
- self._kind = kind
127
- self._workspace = parsed_ws if workspace is None else workspace
128
- self._is_file = is_file
129
- self._is_dir = is_dir
130
- self._raw_status = raw_status
131
- self._raw_status_refresh_time = float(raw_status_refresh_time)
132
-
133
- return self
134
-
135
- def __init__(
136
- self,
137
- *pathsegments: Any,
138
- workspace: Optional["Workspace"] = None,
139
- is_file: Optional[bool] = None,
140
- is_dir: Optional[bool] = None,
141
- raw_status: Optional[dict] = None,
142
- raw_status_refresh_time: float = 0.0,
143
- ) -> None:
144
- # pathlib paths are effectively immutable; all init happens in __new__ / _init
145
- pass
146
-
147
- def __truediv__(self, other):
148
- if not other:
149
- return self
150
-
151
- built = super().__truediv__(other)
152
-
153
- built._kind = self._kind
154
- built._workspace = self._workspace
155
-
156
- built._is_file = None
157
- built._is_dir = None
158
- built._raw_status = None
159
- built._raw_status_refresh_time = 0.0
160
-
161
- return built
162
-
163
- def __enter__(self):
164
- self.workspace.__enter__()
165
- return self
166
-
167
- def __exit__(self, exc_type, exc_val, exc_tb):
168
- return self.workspace.__exit__(exc_type, exc_val, exc_tb)
169
-
170
- def _clone_meta_from(self, template: "DatabricksPath") -> None:
171
- """
172
- Copy *connection/meta* state, but never copy caches.
173
- Centralizes the logic so every creation path stays consistent.
174
- """
175
- # Keep workspace threading; kind should match the NEW path string.
176
- kind, ws, _ = DatabricksPathKind.parse(str(self), workspace=getattr(template, "_workspace", None))
177
- self._kind = kind
178
- self._workspace = ws if ws is not None else getattr(template, "_workspace", None)
179
-
180
- # Reset caches
181
- self._is_file = None
182
- self._is_dir = None
183
- self._raw_status = None
184
- self._raw_status_refresh_time = 0.0
185
-
186
- @property
187
- def parent(self):
188
- built = super().parent
189
-
190
- built._clone_meta_from(self)
191
-
192
- return built
193
-
194
- @classmethod
195
- def _from_parsed_parts(cls, drv, root, parts):
196
- """
197
- pathlib internal factory. It may pass a template in some Python versions,
198
- but if not, we still return a valid DatabricksPath with initialized state.
199
- """
200
- built = super()._from_parsed_parts(drv, root, parts) # type: ignore[misc]
201
-
202
- # Best effort: if pathlib gave us a template on the object, use it.
203
- # Otherwise ensure we at least have valid defaults.
204
- if isinstance(built, DatabricksPath) and isinstance(getattr(built, "_workspace", None), object):
205
- # If the object already has workspace/kind via _init, don't stomp it.
206
- # But if it's missing _kind (common failure), derive it.
207
- if not hasattr(built, "_kind"):
208
- kind, ws, _ = DatabricksPathKind.parse(str(built), workspace=getattr(built, "_workspace", None))
209
- built._kind = kind
210
- built._workspace = ws if ws is not None else getattr(built, "_workspace", None)
211
-
212
- # Always reset caches (derived path => cache invalid)
213
- built._is_file = None
214
- built._is_dir = None
215
- built._raw_status = None
216
- built._raw_status_refresh_time = 0.0
217
- else:
218
- # Safety defaults (should be rare)
219
- kind, ws, _ = DatabricksPathKind.parse(str(built))
220
- built._kind = kind
221
- built._workspace = ws
222
- built._is_file = None
223
- built._is_dir = None
224
- built._raw_status = None
225
- built._raw_status_refresh_time = 0.0
226
-
227
- return built
228
-
229
- def _make_child(self, args):
230
- built = super()._make_child(args) # type: ignore[misc]
231
-
232
- # Ensure type + meta carryover
233
- if isinstance(built, DatabricksPath):
234
- built._clone_meta_from(self)
235
- else:
236
- # if for some reason super didn't return our type, try to coerce
237
- built = type(self)(built, workspace=getattr(self, "_workspace", None))
238
-
239
- return built
240
-
241
- @property
242
- def workspace(self):
243
- if self._workspace is None:
244
- from .workspace import Workspace
245
-
246
- self._workspace = Workspace()
247
- return self._workspace
248
-
249
- @workspace.setter
250
- def workspace(self, value):
251
- self._workspace = value
252
-
253
- @property
254
- def kind(self):
255
- return self._kind
256
-
257
- @kind.setter
258
- def kind(self, value: DatabricksPathKind):
259
- self._kind = value
260
-
261
- def is_file(self, *, follow_symlinks=True):
262
- if self._is_file is None:
263
- self.refresh_status()
264
- return self._is_file
265
-
266
- def is_dir(self, *, follow_symlinks=True):
267
- if self._is_dir is None:
268
- self.refresh_status()
269
- return self._is_dir
270
-
271
- def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
272
- if self.kind != DatabricksPathKind.VOLUME:
273
- return None, None, None, None
274
-
275
- s = str(self)
276
- segs = s.split("/") # ['', 'Volumes', catalog?, schema?, volume?, ...]
277
-
278
- # still keep the basic sanity check
279
- if len(segs) < 2 or segs[1] != "Volumes":
280
- raise ValueError(f"Invalid volume path: {s!r}")
281
-
282
- catalog = segs[2] if len(segs) > 2 and segs[2] else None
283
- schema = segs[3] if len(segs) > 3 and segs[3] else None
284
- volume = segs[4] if len(segs) > 4 and segs[4] else None
285
-
286
- # rel path only makes sense after /Volumes/<catalog>/<schema>/<volume>
287
- if len(segs) > 5:
288
- rel = "/".join(segs[5:])
289
- rel_path = PurePosixPath(rel) if rel else PurePosixPath(".")
290
- else:
291
- rel_path = None
292
-
293
- return catalog, schema, volume, rel_path
294
-
295
- def refresh_status(self):
296
- with self as connected:
297
- sdk = connected.workspace.sdk()
298
-
299
- try:
300
- if connected.kind == DatabricksPathKind.VOLUME:
301
- info = sdk.files.get_metadata(connected.as_files_api_path())
302
-
303
- connected._raw_status = info
304
- connected._is_file, connected._is_dir = True, False
305
- elif connected.kind == DatabricksPathKind.WORKSPACE:
306
- info = sdk.workspace.get_status(connected.as_workspace_api_path())
307
-
308
- is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
309
- connected._raw_status = info
310
- connected._is_file, connected._is_dir = not is_dir, is_dir
311
- else:
312
- info = sdk.dbfs.get_status(connected.as_dbfs_api_path())
313
-
314
- connected._raw_status = info
315
- connected._is_file, connected._is_dir = (not info.is_dir), info.is_dir
316
-
317
- connected._raw_status_refresh_time = time.time()
318
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
319
- found = next(connected.ls(fetch_size=1, recursive=False, raise_error=False), None)
320
-
321
- if found is None:
322
- connected._is_file, connected._is_dir = False, False
323
- else:
324
- connected._is_file, connected._is_dir = False, True
325
-
326
- return connected
327
-
328
- def clear_cache(self):
329
- self._raw_status = None
330
- self._raw_status_refresh_time = 0
331
-
332
- self._is_file = None
333
- self._is_dir = None
334
-
335
- # ---- API path normalization helpers ----
336
-
337
- def as_workspace_api_path(self) -> str:
338
- """
339
- Workspace API typically uses paths like /Users/... (not /Workspace/Users/...)
340
- so we strip the leading /Workspace when present.
341
- """
342
- s = str(self)
343
- return s[len("/Workspace") :] if s.startswith("/Workspace") else s
344
-
345
- def as_dbfs_api_path(self) -> str:
346
- """
347
- DBFS REST wants absolute DBFS paths like /tmp/x.
348
- If the user passes /dbfs/tmp/x (FUSE-style), strip the /dbfs prefix.
349
- """
350
- s = str(self)
351
- return s[len("/dbfs") :] if s.startswith("/dbfs") else s
352
-
353
- def as_files_api_path(self) -> str:
354
- """
355
- Files API takes absolute paths, e.g. /Volumes/<...>/file
356
- """
357
- return str(self)
358
-
359
- def with_segments(self, *pathsegments):
360
- """Construct a new path object from any number of path-like objects.
361
- Subclasses may override this method to customize how new path objects
362
- are created from methods like `iterdir()`.
363
- """
364
- return type(self)(*pathsegments, workspace=self._workspace)
365
-
366
- def exists(self, *, follow_symlinks=True) -> bool:
367
- if self.is_file():
368
- return True
369
- if self.is_dir():
370
- return True
371
- return False
372
-
373
- def mkdir(self, mode=0o777, parents=True, exist_ok=True):
374
- """
375
- Create a new directory at this given path.
376
- """
377
- with self as connected:
378
- connected.clear_cache()
379
-
380
- try:
381
- if connected.kind == DatabricksPathKind.WORKSPACE:
382
- connected.workspace.sdk().workspace.mkdirs(self.as_workspace_api_path())
383
- elif connected.kind == DatabricksPathKind.VOLUME:
384
- return connected._create_volume_dir(mode=mode, parents=parents, exist_ok=exist_ok)
385
- elif connected._kind == DatabricksPathKind.DBFS:
386
- connected.workspace.sdk().dbfs.mkdirs(self.as_dbfs_api_path())
387
-
388
- connected._is_file, connected._is_dir = False, True
389
- except (NotFound, ResourceDoesNotExist):
390
- if not parents or self.parent == self:
391
- raise
392
-
393
- connected.parent.mkdir(parents=True, exist_ok=True)
394
- connected.mkdir(mode, parents=False, exist_ok=exist_ok)
395
- except (AlreadyExists, ResourceAlreadyExists):
396
- if not exist_ok:
397
- raise
398
-
399
- def _ensure_volume(self, exist_ok: bool = True):
400
- catalog_name, schema_name, volume_name, rel = self.volume_parts()
401
- sdk = self.workspace.sdk()
402
-
403
- if catalog_name:
404
- try:
405
- sdk.catalogs.create(name=catalog_name)
406
- except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
407
- if not exist_ok:
408
- raise
409
-
410
- if schema_name:
411
- try:
412
- sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
413
- except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
414
- if not exist_ok:
415
- raise
416
-
417
- if volume_name:
418
- try:
419
- sdk.volumes.create(
420
- catalog_name=catalog_name,
421
- schema_name=schema_name,
422
- name=volume_name,
423
- volume_type=VolumeType.MANAGED,
424
- )
425
- except (AlreadyExists, ResourceAlreadyExists, BadRequest):
426
- if not exist_ok:
427
- raise
428
-
429
- def _create_volume_dir(self, mode=0o777, parents=True, exist_ok=True):
430
- path = self.as_files_api_path()
431
- sdk = self.workspace.sdk()
432
-
433
- try:
434
- sdk.files.create_directory(path)
435
- except (BadRequest, NotFound, ResourceDoesNotExist) as e:
436
- if not parents:
437
- raise
438
-
439
- message = str(e)
440
-
441
- if "not exist" in message:
442
- self._ensure_volume()
443
-
444
- sdk.files.create_directory(path)
445
- except (AlreadyExists, ResourceAlreadyExists, BadRequest):
446
- if not exist_ok:
447
- raise
448
-
449
- self.clear_cache()
450
- self._is_file, self._is_dir = False, True
451
-
452
- def remove(self, recursive: bool = True):
453
- if self.is_file():
454
- return self.rmfile()
455
- else:
456
- return self.rmdir(recursive=recursive)
457
-
458
- def rmfile(self):
459
- try:
460
- if self.kind == DatabricksPathKind.VOLUME:
461
- return self._remove_volume_file()
462
- elif self.kind == DatabricksPathKind.WORKSPACE:
463
- return self._remove_workspace_file()
464
- elif self.kind == DatabricksPathKind.DBFS:
465
- return self._remove_dbfs_file()
466
- finally:
467
- self.clear_cache()
468
-
469
- def _remove_volume_file(self):
470
- sdk = self.workspace.sdk()
471
-
472
- try:
473
- sdk.files.delete(self.as_files_api_path())
474
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
475
- pass
476
-
477
- def _remove_workspace_file(self):
478
- sdk = self.workspace.sdk()
479
-
480
- try:
481
- sdk.workspace.delete(self.as_workspace_api_path(), recursive=True)
482
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
483
- pass
484
-
485
- def _remove_dbfs_file(self):
486
- sdk = self.workspace.sdk()
487
-
488
- try:
489
- sdk.dbfs.delete(self.as_dbfs_api_path(), recursive=True)
490
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
491
- pass
492
-
493
- def rmdir(self, recursive: bool = True):
494
- with self as connected:
495
- try:
496
- if connected.kind == DatabricksPathKind.WORKSPACE:
497
- connected.workspace.sdk().workspace.delete(
498
- self.as_workspace_api_path(),
499
- recursive=recursive,
500
- )
501
- elif connected.kind == DatabricksPathKind.VOLUME:
502
- return self._remove_volume_dir(recursive=recursive)
503
- else:
504
- connected.workspace.sdk().dbfs.delete(
505
- self.as_dbfs_api_path(),
506
- recursive=recursive,
507
- )
508
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
509
- pass
510
- finally:
511
- connected.clear_cache()
512
-
513
- def _remove_volume_dir(self, recursive: bool = True):
514
- root_path = self.as_files_api_path()
515
- catalog_name, schema_name, volume_name, rel = self.volume_parts()
516
-
517
- sdk = self.workspace.sdk()
518
-
519
- if rel is None:
520
- try:
521
- sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
522
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
523
- pass
524
- elif volume_name is None:
525
- try:
526
- sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
527
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
528
- pass
529
- else:
530
- try:
531
- sdk.files.delete_directory(root_path)
532
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
533
- message = str(e)
534
-
535
- if recursive and "directory is not empty" in message:
536
- for child_path in self.ls():
537
- child_path.remove(recursive=True)
538
- sdk.files.delete_directory(root_path)
539
- else:
540
- pass
541
-
542
- self.clear_cache()
543
-
544
- def ls(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
545
- if self.kind == DatabricksPathKind.VOLUME:
546
- for _ in self._ls_volume(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
547
- yield _
548
- elif self.kind == DatabricksPathKind.WORKSPACE:
549
- for _ in self._ls_workspace(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
550
- yield _
551
- elif self.kind == DatabricksPathKind.DBFS:
552
- for _ in self._ls_dbfs(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
553
- yield _
554
-
555
- def _ls_volume(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
556
- catalog_name, schema_name, volume_name, rel = self.volume_parts()
557
- sdk = self.workspace.sdk()
558
-
559
- if rel is None:
560
- if volume_name is None:
561
- try:
562
- for info in sdk.volumes.list(
563
- catalog_name=catalog_name,
564
- schema_name=schema_name,
565
- ):
566
- base = DatabricksPath(
567
- f"/Volumes/{info.catalog_name}/{info.schema_name}/{info.name}",
568
- workspace=self.workspace,
569
- is_file=False,
570
- is_dir=True,
571
- )
572
-
573
- if recursive:
574
- for sub in base._ls_volume(recursive=recursive):
575
- yield sub
576
- else:
577
- yield base
578
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
579
- if raise_error:
580
- raise
581
- elif schema_name is None:
582
- try:
583
- for info in sdk.schemas.list(catalog_name=catalog_name):
584
- base = DatabricksPath(
585
- f"/Volumes/{info.catalog_name}/{info.name}",
586
- workspace=self.workspace,
587
- is_file=False,
588
- is_dir=True,
589
- )
590
-
591
- if recursive:
592
- for sub in base._ls_volume(recursive=recursive):
593
- yield sub
594
- else:
595
- yield base
596
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
597
- if raise_error:
598
- raise
599
- else:
600
- try:
601
- for info in sdk.catalogs.list():
602
- base = DatabricksPath(
603
- f"/Volumes/{info.name}",
604
- workspace=self.workspace,
605
- is_file=False,
606
- is_dir=True,
607
- )
608
-
609
- if recursive:
610
- for sub in base._ls_volume(recursive=recursive):
611
- yield sub
612
- else:
613
- yield base
614
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
615
- if raise_error:
616
- raise
617
- else:
618
- try:
619
- for info in sdk.files.list_directory_contents(self.as_files_api_path(), page_size=fetch_size):
620
- base = DatabricksPath(
621
- info.path,
622
- workspace=self.workspace,
623
- is_file=not info.is_directory,
624
- is_dir=info.is_directory,
625
- )
626
-
627
- if recursive and info.is_directory:
628
- for sub in base._ls_volume(recursive=recursive):
629
- yield sub
630
- else:
631
- yield base
632
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
633
- if raise_error:
634
- raise
635
-
636
- def _ls_workspace(self, recursive: bool = True, fetch_size: int = None, raise_error: bool = True):
637
- sdk = self.workspace.sdk()
638
-
639
- try:
640
- for info in sdk.workspace.list(self.as_workspace_api_path(), recursive=recursive):
641
- is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
642
- base = DatabricksPath(
643
- info.path,
644
- workspace=self.workspace,
645
- is_file=not is_dir,
646
- is_dir=is_dir,
647
- )
648
- yield base
649
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
650
- if raise_error:
651
- raise
652
-
653
- def _ls_dbfs(self, recursive: bool = True, fetch_size: int = None, raise_error: bool = True):
654
- sdk = self.workspace.sdk()
655
-
656
- try:
657
- # FIX: DBFS listing should use DBFS-normalized path, not workspace path
658
- p = "/dbfs/" + self.as_dbfs_api_path() + "/"
659
- for info in sdk.dbfs.list(p, recursive=recursive):
660
- base = DatabricksPath(
661
- info.path,
662
- workspace=self.workspace,
663
- is_file=not info.is_dir,
664
- is_dir=info.is_dir,
665
- )
666
- yield base
667
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
668
- if raise_error:
669
- raise
670
-
671
- @contextmanager
672
- def open(
673
- self,
674
- mode="r",
675
- buffering=-1,
676
- encoding=None,
677
- errors=None,
678
- newline=None,
679
- *,
680
- workspace: Optional["Workspace"] = None,
681
- overwrite: bool = True,
682
- ) -> Iterator[Union[BinaryIO, io.TextIOBase]]:
683
- """
684
- Open this Databricks path using databricks-sdk's WorkspaceClient.
685
-
686
- Supported:
687
- - read: "rb", "r"
688
- - write: "wb", "w" (buffered; uploads on close for WORKSPACE/VOLUME)
689
- """
690
- if mode not in {"rb", "r", "wb", "w"}:
691
- raise ValueError(f"Unsupported mode {mode!r}. Use r/rb/w/wb.")
692
-
693
- if encoding is None:
694
- encoding = None if "b" in mode else "utf-8"
695
- reading = "r" in mode
696
-
697
- if reading:
698
- with self.open_read(encoding=encoding) as f:
699
- yield f
700
- else:
701
- with self.open_write(encoding=encoding) as f:
702
- yield f
703
-
704
- @contextmanager
705
- def open_read(self, encoding: str | None = None):
706
- with self as connected:
707
- if connected.kind == DatabricksPathKind.VOLUME:
708
- with connected._open_read_volume(encoding=encoding) as f:
709
- yield f
710
- elif connected.kind == DatabricksPathKind.WORKSPACE:
711
- with connected._open_read_workspace(encoding=encoding) as f:
712
- yield f
713
- else:
714
- with connected._open_read_dbfs(encoding=encoding) as f:
715
- yield f
716
-
717
- @contextmanager
718
- def _open_read_volume(self, encoding: str | None = None):
719
- workspace_client = self.workspace.sdk()
720
- path = self.as_files_api_path()
721
-
722
- resp = workspace_client.files.download(path)
723
- raw = io.BytesIO(resp.contents.read())
724
-
725
- if encoding is not None:
726
- with io.TextIOWrapper(raw, encoding=encoding) as f:
727
- yield f
728
- else:
729
- with raw as f:
730
- yield f
731
-
732
- @contextmanager
733
- def _open_read_workspace(self, encoding: str | None = None):
734
- workspace_client = self.workspace.sdk()
735
- path = self.as_workspace_api_path()
736
-
737
- raw = workspace_client.workspace.download(path) # returns BinaryIO
738
-
739
- if encoding is not None:
740
- raw = io.BytesIO(raw.read())
741
- with io.TextIOWrapper(raw, encoding=encoding) as f:
742
- yield f
743
- else:
744
- with raw as f:
745
- yield f
746
-
747
- @contextmanager
748
- def _open_read_dbfs(self, encoding: str | None = None):
749
- workspace_client = self.workspace.sdk()
750
- path = self.as_dbfs_api_path()
751
-
752
- raw = workspace_client.dbfs.open(path, read=True)
753
-
754
- if encoding is not None:
755
- with io.TextIOWrapper(raw, encoding=encoding) as f:
756
- yield f
757
- else:
758
- with raw as f:
759
- yield f
760
-
761
- @contextmanager
762
- def open_write(self, encoding: str | None = None):
763
- with self as connected:
764
- if connected.kind == DatabricksPathKind.VOLUME:
765
- with connected._open_write_volume(encoding=encoding) as f:
766
- yield f
767
- elif connected.kind == DatabricksPathKind.WORKSPACE:
768
- with connected._open_write_workspace(encoding=encoding) as f:
769
- yield f
770
- else:
771
- with connected._open_write_dbfs(encoding=encoding) as f:
772
- yield f
773
-
774
- @contextmanager
775
- def _open_write_volume(self, encoding: str | None = None, overwrite: bool = True):
776
- workspace_client = self.workspace.sdk()
777
- path = self.as_files_api_path()
778
-
779
- buf = io.BytesIO()
780
-
781
- if encoding is not None:
782
- tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
783
- try:
784
- yield tw
785
- finally:
786
- tw.flush()
787
- buf.seek(0)
788
-
789
- try:
790
- workspace_client.files.upload(path, buf, overwrite=overwrite)
791
- except (NotFound, ResourceDoesNotExist, BadRequest):
792
- self.parent.mkdir(parents=True, exist_ok=True)
793
- workspace_client.files.upload(path, buf, overwrite=overwrite)
794
-
795
- tw.detach()
796
- else:
797
- try:
798
- yield buf
799
- finally:
800
- buf.seek(0)
801
-
802
- try:
803
- workspace_client.files.upload(path, buf, overwrite=overwrite)
804
- except (NotFound, ResourceDoesNotExist, BadRequest):
805
- self.parent.mkdir(parents=True, exist_ok=True)
806
- workspace_client.files.upload(path, buf, overwrite=overwrite)
807
-
808
- @contextmanager
809
- def _open_write_workspace(self, encoding: str | None = None, overwrite: bool = True):
810
- workspace_client = self.workspace.sdk()
811
- path = self.as_workspace_api_path()
812
-
813
- buf = io.BytesIO()
814
-
815
- if encoding is not None:
816
- tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
817
- try:
818
- yield tw
819
- finally:
820
- tw.flush()
821
- buf.seek(0)
822
-
823
- try:
824
- workspace_client.workspace.upload(
825
- path, buf, format=ImportFormat.AUTO, overwrite=overwrite
826
- )
827
- except Exception as e:
828
- message = str(e)
829
- if "parent folder" in message and "does not exist" in message:
830
- self.parent.mkdir(parents=True)
831
- buf.seek(0)
832
- workspace_client.workspace.upload(
833
- path, buf, format=ImportFormat.AUTO, overwrite=overwrite
834
- )
835
- else:
836
- raise
837
-
838
- tw.detach()
839
- else:
840
- try:
841
- yield buf
842
- finally:
843
- buf.seek(0)
844
-
845
- try:
846
- workspace_client.workspace.upload(
847
- path, buf, format=ImportFormat.AUTO, overwrite=overwrite
848
- )
849
- except Exception as e:
850
- message = str(e)
851
- if "parent folder" in message and "does not exist" in message:
852
- self.parent.mkdir(parents=True)
853
- buf.seek(0)
854
- workspace_client.workspace.upload(
855
- path, buf, format=ImportFormat.AUTO, overwrite=overwrite
856
- )
857
- else:
858
- raise
859
-
860
- @contextmanager
861
- def _open_write_dbfs(self, encoding: str | None = None, overwrite: bool = True):
862
- workspace_client = self.workspace.sdk()
863
- path = self.as_dbfs_api_path()
864
-
865
- raw = workspace_client.dbfs.open(path, write=True, overwrite=overwrite)
866
-
867
- if encoding is not None:
868
- with io.TextIOWrapper(raw, encoding=encoding) as f:
869
- yield f
870
- else:
871
- with raw as f:
872
- yield f
873
-
874
- self.clear_cache()
875
- self._is_file, self._is_dir = True, False
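
For reference, a minimal usage sketch of the DatabricksPath class as it existed in the removed 0.1.29 module above. This is illustrative only: it assumes yggdrasil 0.1.29 is installed, a reachable Databricks workspace with default SDK authentication, and the /Volumes path below is made up.

    # Sketch based on the removed module shown in this diff; the volume path is hypothetical.
    from yggdrasil.databricks.workspaces.databricks_path import DatabricksPath

    # Kind is inferred from the path prefix (/Workspace..., /Volumes..., dbfs://..., else DBFS).
    p = DatabricksPath("/Volumes/main/default/my_volume/reports/summary.txt")

    # For volume paths this goes through the Files API and, with parents=True,
    # falls back to creating the catalog/schema/volume when they are missing.
    p.parent.mkdir(parents=True, exist_ok=True)

    # Text mode buffers to an in-memory BytesIO and uploads via files.upload on close.
    with p.open("w") as f:
        f.write("hello from ygg\n")

    # exists() resolves via files.get_metadata, with a one-entry ls() fallback.
    if p.exists():
        with p.open("r") as f:
            print(f.read())

    # ls() yields DatabricksPath children with is_file/is_dir pre-populated.
    for child in p.parent.ls():
        print(child, child.is_file())

Kind detection happens once in DatabricksPathKind.parse, so the same pathlib-style class fronts the Workspace, Files (Unity Catalog volumes), and DBFS APIs behind a single interface.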