ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
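
The headline change in this release is the removal of yggdrasil/databricks/workspaces/databricks_path.py (784 lines), whose responsibilities appear to move to the new workspaces/path.py, workspaces/io.py, workspaces/filesytem.py, and workspaces/path_kind.py modules. The full removed module is reproduced below. As a migration reference, here is a minimal sketch of how the removed DatabricksPath.parse resolved path kinds, based only on the code shown in the diff; it assumes ygg 0.1.30 and databricks-sdk are installed, and the example paths are hypothetical:

    from yggdrasil.databricks.workspaces.databricks_path import (
        DatabricksPath,
        DatabricksPathKind,
    )

    # parse() splits parts on "/" and keys the kind off the first non-empty component.
    p = DatabricksPath.parse("/Volumes/main/default/landing/data.csv")
    assert p.kind is DatabricksPathKind.VOLUME
    assert p.parts == ["main", "default", "landing", "data.csv"]
    assert str(p) == "/Volumes/main/default/landing/data.csv"

    # "Workspace" and "dbfs" prefixes select the other two kinds.
    nb = DatabricksPath.parse("/Workspace/Users/someone@example.com/etl")
    assert nb.kind is DatabricksPathKind.WORKSPACE

    # The "/" operator joins and re-flattens parts without changing the kind.
    child = nb / "configs/dev.json"
    assert child.parts[-2:] == ["configs", "dev.json"]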
@@ -1,784 +0,0 @@
- # src/yggdrasil/databricks/workspaces/databricks_path.py
- from __future__ import annotations
-
- import dataclasses
- import io
- import time
- from contextlib import contextmanager
- from enum import Enum
- from pathlib import PurePosixPath
- from typing import BinaryIO, Iterator, Optional, Tuple, Union, TYPE_CHECKING, List
-
- from databricks.sdk.service.catalog import VolumeType
-
- from ...libs.databrickslib import databricks
-
- if databricks is not None:
-     from databricks.sdk.service.workspace import ImportFormat, ObjectType
-     from databricks.sdk.errors.platform import (
-         NotFound,
-         ResourceDoesNotExist,
-         BadRequest,
-         PermissionDenied,
-         AlreadyExists,
-         ResourceAlreadyExists,
-     )
-
-     NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
-     ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest
-
- if TYPE_CHECKING:
-     from .workspace import Workspace
-
-
- __all__ = [
-     "DatabricksPathKind",
-     "DatabricksPath",
- ]
-
-
- def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
-     if isinstance(parts, str):
-         parts = [parts]
-
-     if any("/" in part for part in parts):
-         # flatten parts with slashes
-         new_parts = []
-         for part in parts:
-             split_parts = part.split("/")
-             new_parts.extend(split_parts)
-         parts = new_parts
-
-     return parts
-
-
- class DatabricksPathKind(str, Enum):
-     WORKSPACE = "workspace"
-     VOLUME = "volume"
-     DBFS = "dbfs"
-
-
- @dataclasses.dataclass
- class DatabricksPath:
-     kind: "DatabricksPathKind"
-     parts: List[str]
-     workspace: Optional["Workspace"] = None
-
-     _is_file: Optional[bool] = None
-     _is_dir: Optional[bool] = None
-
-     _raw_status: Optional[dict] = None
-     _raw_status_refresh_time: float = 0.0
-
-     @classmethod
-     def parse(
-         cls,
-         parts: Union[List[str], str],
-         workspace: Optional["Workspace"] = None,
-     ) -> "DatabricksPath":
-         if not parts:
-             return DatabricksPath(
-                 kind=DatabricksPathKind.DBFS,
-                 parts=[],
-                 workspace=workspace,
-             )
-
-         parts = _flatten_parts(parts)
-
-         if not parts[0]:
-             parts = parts[1:]
-
-         if not parts:
-             return DatabricksPath(
-                 kind=DatabricksPathKind.DBFS,
-                 parts=[],
-                 workspace=workspace,
-             )
-
-         head, *tail = parts
-
-         if head == "dbfs":
-             kind = DatabricksPathKind.DBFS
-         elif head == "Workspace":
-             kind = DatabricksPathKind.WORKSPACE
-         elif head == "Volumes":
-             kind = DatabricksPathKind.VOLUME
-         else:
-             raise ValueError(f"Invalid DatabricksPath prefix: {parts!r}")
-
-         return DatabricksPath(
-             kind=kind,
-             parts=tail,
-             workspace=workspace,
-         )
-
-     def __hash__(self):
-         return hash((self.kind, tuple(self.parts)))
-
-     def __eq__(self, other):
-         if not isinstance(other, DatabricksPath):
-             if isinstance(other, str):
-                 return str(self) == other
-             return False
-         return self.kind == other.kind and self.parts == other.parts
-
-     def __truediv__(self, other):
-         if not other:
-             return self
-
-         other_parts = _flatten_parts(other)
-
-         built = DatabricksPath(
-             kind=self.kind,
-             parts=self.parts + other_parts,
-             workspace=self.workspace,
-         )
-
-         return built
-
-     def __enter__(self):
-         self.safe_workspace.__enter__()
-         return self
-
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         return self.safe_workspace.__exit__(exc_type, exc_val, exc_tb)
-
-     def __str__(self):
-         if self.kind == DatabricksPathKind.DBFS:
-             return self.as_dbfs_api_path()
-         elif self.kind == DatabricksPathKind.WORKSPACE:
-             return self.as_workspace_api_path()
-         elif self.kind == DatabricksPathKind.VOLUME:
-             return self.as_files_api_path()
-         else:
-             raise ValueError(f"Unknown DatabricksPath kind: {self.kind!r}")
-
-     def __repr__(self):
-         return "dbfs://%s" % self.__str__()
-
-     @property
-     def parent(self):
-         if not self.parts:
-             return self
-
-         if self._is_file is not None or self._is_dir is not None:
-             _is_file, _is_dir = False, True
-         else:
-             _is_file, _is_dir = None, None
-
-         built = DatabricksPath(
-             kind=self.kind,
-             parts=self.parts[:-1],
-             workspace=self.workspace,
-             _is_file=_is_file,
-             _is_dir=_is_dir,
-         )
-
-         return built
-
-     @property
-     def safe_workspace(self):
-         if self.workspace is None:
-             from .workspace import Workspace
-
-             self.workspace = Workspace()
-         return self.workspace
-
-     @safe_workspace.setter
-     def safe_workspace(self, value):
-         self.workspace = value
-
-     def is_file(self):
-         if self._is_file is None:
-             self.refresh_status()
-         return self._is_file
-
-     def is_dir(self):
-         if self._is_dir is None:
-             self.refresh_status()
-         return self._is_dir
-
-     def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
-         if self.kind != DatabricksPathKind.VOLUME:
-             return None, None, None, None
-
-         catalog = self.parts[0] if len(self.parts) > 0 and self.parts[0] else None
-         schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
-         volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None
-
-         return catalog, schema, volume, self.parts[3:]
-
-     def refresh_status(self):
-         with self as connected:
-             sdk = connected.safe_workspace.sdk()
-
-             try:
-                 if connected.kind == DatabricksPathKind.VOLUME:
-                     info = sdk.files.get_metadata(connected.as_files_api_path())
-
-                     connected._raw_status = info
-                     connected._is_file, connected._is_dir = True, False
-                 elif connected.kind == DatabricksPathKind.WORKSPACE:
-                     info = sdk.workspace.get_status(connected.as_workspace_api_path())
-
-                     is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
-                     connected._raw_status = info
-                     connected._is_file, connected._is_dir = not is_dir, is_dir
-                 else:
-                     info = sdk.dbfs.get_status(connected.as_dbfs_api_path())
-
-                     connected._raw_status = info
-                     connected._is_file, connected._is_dir = (not info.is_dir), info.is_dir
-
-                 connected._raw_status_refresh_time = time.time()
-             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                 found = next(connected.ls(fetch_size=1, recursive=False, raise_error=False), None)
-
-                 if found is None:
-                     connected._is_file, connected._is_dir = False, False
-                 else:
-                     connected._is_file, connected._is_dir = False, True
-
-         return connected
-
-     def clear_cache(self):
-         self._raw_status = None
-         self._raw_status_refresh_time = 0
-
-         self._is_file = None
-         self._is_dir = None
-
-     # ---- API path normalization helpers ----
-
-     def as_workspace_api_path(self) -> str:
-         """
-         Workspace API typically uses paths like /Users/... (not /Workspace/Users/...)
-         so we strip the leading /Workspace when present.
-         """
-         return "/Workspace/%s" % "/".join(self.parts) if self.parts else "/Workspace"
-
-     def as_dbfs_api_path(self) -> str:
-         """
-         DBFS REST wants absolute DBFS paths like /tmp/x.
-         If the user passes /dbfs/tmp/x (FUSE-style), strip the /dbfs prefix.
-         """
-         return "/dbfs/%s" % "/".join(self.parts) if self.parts else "/dbfs"
-
-     def as_files_api_path(self) -> str:
-         """
-         Files API takes absolute paths, e.g. /Volumes/<...>/file
-         """
-         return "/Volumes/%s" % "/".join(self.parts) if self.parts else "/Volumes"
-
-     def exists(self) -> bool:
-         if self.is_file():
-             return True
-         if self.is_dir():
-             return True
-         return False
-
-     def mkdir(self, parents=True, exist_ok=True):
-         """
-         Create a new directory at this given path.
-         """
-         with self as connected:
-             connected.clear_cache()
-
-             try:
-                 if connected.kind == DatabricksPathKind.WORKSPACE:
-                     connected.safe_workspace.sdk().workspace.mkdirs(self.as_workspace_api_path())
-                 elif connected.kind == DatabricksPathKind.VOLUME:
-                     return connected._create_volume_dir(parents=parents, exist_ok=exist_ok)
-                 elif connected.kind == DatabricksPathKind.DBFS:
-                     connected.safe_workspace.sdk().dbfs.mkdirs(self.as_dbfs_api_path())
-
-                 connected._is_file, connected._is_dir = False, True
-             except (NotFound, ResourceDoesNotExist):
-                 if not parents or self.parent == self:
-                     raise
-
-                 connected.parent.mkdir(parents=True, exist_ok=True)
-                 connected.mkdir(parents=False, exist_ok=exist_ok)
-             except (AlreadyExists, ResourceAlreadyExists):
-                 if not exist_ok:
-                     raise
-
-     def _ensure_volume(self, exist_ok: bool = True):
-         catalog_name, schema_name, volume_name, rel = self.volume_parts()
-         sdk = self.safe_workspace.sdk()
-
-         if catalog_name:
-             try:
-                 sdk.catalogs.create(name=catalog_name)
-             except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
-                 if not exist_ok:
-                     raise
-
-         if schema_name:
-             try:
-                 sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
-             except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
-                 if not exist_ok:
-                     raise
-
-         if volume_name:
-             try:
-                 sdk.volumes.create(
-                     catalog_name=catalog_name,
-                     schema_name=schema_name,
-                     name=volume_name,
-                     volume_type=VolumeType.MANAGED,
-                 )
-             except (AlreadyExists, ResourceAlreadyExists, BadRequest):
-                 if not exist_ok:
-                     raise
-
-     def _create_volume_dir(self, parents=True, exist_ok=True):
-         path = self.as_files_api_path()
-         sdk = self.safe_workspace.sdk()
-
-         try:
-             sdk.files.create_directory(path)
-         except (BadRequest, NotFound, ResourceDoesNotExist) as e:
-             if not parents:
-                 raise
-
-             message = str(e)
-
-             if "olume" in message and "not exist" in message:
-                 self._ensure_volume()
-
-                 sdk.files.create_directory(path)
-         except (AlreadyExists, ResourceAlreadyExists, BadRequest):
-             if not exist_ok:
-                 raise
-
-         self.clear_cache()
-         self._is_file, self._is_dir = False, True
-
-     def remove(self, recursive: bool = True):
-         if self.is_file():
-             return self.rmfile()
-         else:
-             return self.rmdir(recursive=recursive)
-
-     def rmfile(self):
-         try:
-             if self.kind == DatabricksPathKind.VOLUME:
-                 return self._remove_volume_file()
-             elif self.kind == DatabricksPathKind.WORKSPACE:
-                 return self._remove_workspace_file()
-             elif self.kind == DatabricksPathKind.DBFS:
-                 return self._remove_dbfs_file()
-         finally:
-             self.clear_cache()
-
-     def _remove_volume_file(self):
-         sdk = self.safe_workspace.sdk()
-
-         try:
-             sdk.files.delete(self.as_files_api_path())
-         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-             pass
-
-     def _remove_workspace_file(self):
-         sdk = self.safe_workspace.sdk()
-
-         try:
-             sdk.workspace.delete(self.as_workspace_api_path(), recursive=True)
-         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-             pass
-
-     def _remove_dbfs_file(self):
-         sdk = self.safe_workspace.sdk()
-
-         try:
-             sdk.dbfs.delete(self.as_dbfs_api_path(), recursive=True)
-         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-             pass
-
-     def rmdir(self, recursive: bool = True):
-         with self as connected:
-             try:
-                 if connected.kind == DatabricksPathKind.WORKSPACE:
-                     connected.safe_workspace.sdk().workspace.delete(
-                         self.as_workspace_api_path(),
-                         recursive=recursive,
-                     )
-                 elif connected.kind == DatabricksPathKind.VOLUME:
-                     return self._remove_volume_dir(recursive=recursive)
-                 else:
-                     connected.safe_workspace.sdk().dbfs.delete(
-                         self.as_dbfs_api_path(),
-                         recursive=recursive,
-                     )
-             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                 pass
-             finally:
-                 connected.clear_cache()
-
-     def _remove_volume_dir(self, recursive: bool = True):
-         root_path = self.as_files_api_path()
-         catalog_name, schema_name, volume_name, rel = self.volume_parts()
-
-         sdk = self.safe_workspace.sdk()
-
-         if rel:
-             try:
-                 sdk.files.delete_directory(root_path)
-             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
-                 message = str(e)
-
-                 if recursive and "directory is not empty" in message:
-                     for child_path in self.ls():
-                         child_path.remove(recursive=True)
-                     sdk.files.delete_directory(root_path)
-                 else:
-                     pass
-         elif volume_name:
-             try:
-                 sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
-             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                 pass
-         elif schema_name:
-             try:
-                 sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
-             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                 pass
-
-         self.clear_cache()
-
-     def ls(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
-         if self.kind == DatabricksPathKind.VOLUME:
-             for _ in self._ls_volume(recursive=recursive, fetch_size=fetch_size, raise_error=raise_error):
-                 yield _
-         elif self.kind == DatabricksPathKind.WORKSPACE:
-             for _ in self._ls_workspace(recursive=recursive, raise_error=raise_error):
-                 yield _
-         elif self.kind == DatabricksPathKind.DBFS:
-             for _ in self._ls_dbfs(recursive=recursive, raise_error=raise_error):
-                 yield _
-
-     def _ls_volume(self, recursive: bool = False, fetch_size: int = None, raise_error: bool = True):
-         catalog_name, schema_name, volume_name, rel = self.volume_parts()
-         sdk = self.safe_workspace.sdk()
-
-         if rel is None:
-             if volume_name is None:
-                 try:
-                     for info in sdk.volumes.list(
-                         catalog_name=catalog_name,
-                         schema_name=schema_name,
-                     ):
-                         base = DatabricksPath(
-                             kind=DatabricksPathKind.VOLUME,
-                             parts = [info.catalog_name, info.schema_name, info.name],
-                             workspace=self.safe_workspace,
-                             _is_file=False,
-                             _is_dir=True,
-                         )
-
-                         if recursive:
-                             for sub in base._ls_volume(recursive=recursive):
-                                 yield sub
-                         else:
-                             yield base
-                 except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                     if raise_error:
-                         raise
-             elif schema_name is None:
-                 try:
-                     for info in sdk.schemas.list(catalog_name=catalog_name):
-                         base = DatabricksPath(
-                             kind=DatabricksPathKind.VOLUME,
-                             parts=[info.catalog_name, info.name],
-                             workspace=self.safe_workspace,
-                             _is_file=False,
-                             _is_dir=True,
-                         )
-
-                         if recursive:
-                             for sub in base._ls_volume(recursive=recursive):
-                                 yield sub
-                         else:
-                             yield base
-                 except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                     if raise_error:
-                         raise
-             else:
-                 try:
-                     for info in sdk.catalogs.list():
-                         base = DatabricksPath(
-                             kind=DatabricksPathKind.VOLUME,
-                             parts=[info.name],
-                             workspace=self.safe_workspace,
-                             _is_file=False,
-                             _is_dir=True,
-                         )
-
-                         if recursive:
-                             for sub in base._ls_volume(recursive=recursive):
-                                 yield sub
-                         else:
-                             yield base
-                 except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                     if raise_error:
-                         raise
-         else:
-             try:
-                 for info in sdk.files.list_directory_contents(self.as_files_api_path(), page_size=fetch_size):
-                     base = DatabricksPath(
-                         kind=DatabricksPathKind.VOLUME,
-                         parts=info.path.split("/")[2:],
-                         workspace=self.safe_workspace,
-                         _is_file=not info.is_directory,
-                         _is_dir=info.is_directory,
-                     )
-
-                     if recursive and info.is_directory:
-                         for sub in base._ls_volume(recursive=recursive):
-                             yield sub
-                     else:
-                         yield base
-             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-                 if raise_error:
-                     raise
-
-     def _ls_workspace(self, recursive: bool = True, raise_error: bool = True):
-         sdk = self.safe_workspace.sdk()
-
-         try:
-             for info in sdk.workspace.list(self.as_workspace_api_path(), recursive=recursive):
-                 is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
-                 base = DatabricksPath(
-                     kind=DatabricksPathKind.WORKSPACE,
-                     parts=info.path.split("/")[2:],
-                     workspace=self.safe_workspace,
-                     _is_file=not is_dir,
-                     _is_dir=is_dir,
-                 )
-                 yield base
-         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-             if raise_error:
-                 raise
-
-     def _ls_dbfs(self, recursive: bool = True, raise_error: bool = True):
-         sdk = self.safe_workspace.sdk()
-
-         try:
-             # FIX: DBFS listing should use DBFS-normalized path, not workspace path
-             p = self.as_dbfs_api_path()
-
-             for info in sdk.dbfs.list(p, recursive=recursive):
-                 base = DatabricksPath(
-                     kind=DatabricksPathKind.DBFS,
-                     parts=info.path.split("/")[2:],
-                     workspace=self.safe_workspace,
-                     _is_file=not info.is_dir,
-                     _is_dir=info.is_dir,
-                 )
-
-                 yield base
-         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-             if raise_error:
-                 raise
-
-     @contextmanager
-     def open(
-         self,
-         mode="r",
-         encoding=None,
-     ) -> Iterator[Union[BinaryIO, io.TextIOBase]]:
-         """
-         Open this Databricks path using databricks-sdk's WorkspaceClient.
-
-         Supported:
-         - read: "rb", "r"
-         - write: "wb", "w" (buffered; uploads on close for WORKSPACE/VOLUME)
-         """
-         if mode not in {"rb", "r", "wb", "w"}:
-             raise ValueError(f"Unsupported mode {mode!r}. Use r/rb/w/wb.")
-
-         if encoding is None:
-             encoding = None if "b" in mode else "utf-8"
-         reading = "r" in mode
-
-         if reading:
-             with self.open_read(encoding=encoding) as f:
-                 yield f
-         else:
-             with self.open_write(encoding=encoding) as f:
-                 yield f
-
-     @contextmanager
-     def open_read(self, encoding: str | None = None):
-         with self as connected:
-             if connected.kind == DatabricksPathKind.VOLUME:
-                 with connected._open_read_volume(encoding=encoding) as f:
-                     yield f
-             elif connected.kind == DatabricksPathKind.WORKSPACE:
-                 with connected._open_read_workspace(encoding=encoding) as f:
-                     yield f
-             else:
-                 with connected._open_read_dbfs(encoding=encoding) as f:
-                     yield f
-
-     @contextmanager
-     def _open_read_volume(self, encoding: str | None = None):
-         workspace_client = self.safe_workspace.sdk()
-         path = self.as_files_api_path()
-
-         resp = workspace_client.files.download(path)
-         raw = io.BytesIO(resp.contents.read())
-
-         if encoding is not None:
-             with io.TextIOWrapper(raw, encoding=encoding) as f:
-                 yield f
-         else:
-             with raw as f:
-                 yield f
-
-     @contextmanager
-     def _open_read_workspace(self, encoding: str | None = None):
-         workspace_client = self.safe_workspace.sdk()
-         path = self.as_workspace_api_path()
-
-         raw = workspace_client.workspace.download(path)  # returns BinaryIO
-
-         if encoding is not None:
-             raw = io.BytesIO(raw.read())
-             with io.TextIOWrapper(raw, encoding=encoding) as f:
-                 yield f
-         else:
-             with raw as f:
-                 yield f
-
-     @contextmanager
-     def _open_read_dbfs(self, encoding: str | None = None):
-         workspace_client = self.safe_workspace.sdk()
-         path = self.as_dbfs_api_path()
-
-         raw = workspace_client.dbfs.open(path, read=True)
-
-         if encoding is not None:
-             with io.TextIOWrapper(raw, encoding=encoding) as f:
-                 yield f
-         else:
-             with raw as f:
-                 yield f
-
-     @contextmanager
-     def open_write(self, encoding: str | None = None):
-         with self as connected:
-             if connected.kind == DatabricksPathKind.VOLUME:
-                 with connected._open_write_volume(encoding=encoding) as f:
-                     yield f
-             elif connected.kind == DatabricksPathKind.WORKSPACE:
-                 with connected._open_write_workspace(encoding=encoding) as f:
-                     yield f
-             else:
-                 with connected._open_write_dbfs(encoding=encoding) as f:
-                     yield f
-
-     @contextmanager
-     def _open_write_volume(self, encoding: str | None = None, overwrite: bool = True):
-         workspace_client = self.safe_workspace.sdk()
-         path = self.as_files_api_path()
-
-         buf = io.BytesIO()
-
-         if encoding is not None:
-             tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
-             try:
-                 yield tw
-             finally:
-                 tw.flush()
-                 buf.seek(0)
-
-                 try:
-                     workspace_client.files.upload(path, buf, overwrite=overwrite)
-                 except (NotFound, ResourceDoesNotExist, BadRequest):
-                     self.parent.mkdir(parents=True, exist_ok=True)
-                     workspace_client.files.upload(path, buf, overwrite=overwrite)
-
-                 tw.detach()
-         else:
-             try:
-                 yield buf
-             finally:
-                 buf.seek(0)
-
-                 try:
-                     workspace_client.files.upload(path, buf, overwrite=overwrite)
-                 except (NotFound, ResourceDoesNotExist, BadRequest):
-                     self.parent.mkdir(parents=True, exist_ok=True)
-                     workspace_client.files.upload(path, buf, overwrite=overwrite)
-
-     @contextmanager
-     def _open_write_workspace(self, encoding: str | None = None, overwrite: bool = True):
-         workspace_client = self.safe_workspace.sdk()
-         path = self.as_workspace_api_path()
-
-         buf = io.BytesIO()
-
-         if encoding is not None:
-             tw = io.TextIOWrapper(buf, encoding=encoding, write_through=True)
-             try:
-                 yield tw
-             finally:
-                 tw.flush()
-                 buf.seek(0)
-
-                 try:
-                     workspace_client.workspace.upload(
-                         path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                     )
-                 except Exception as e:
-                     message = str(e)
-                     if "parent folder" in message and "does not exist" in message:
-                         self.parent.mkdir(parents=True)
-                         buf.seek(0)
-                         workspace_client.workspace.upload(
-                             path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                         )
-                     else:
-                         raise
-
-                 tw.detach()
-         else:
-             try:
-                 yield buf
-             finally:
-                 buf.seek(0)
-
-                 try:
-                     workspace_client.workspace.upload(
-                         path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                     )
-                 except Exception as e:
-                     message = str(e)
-                     if "parent folder" in message and "does not exist" in message:
-                         self.parent.mkdir(parents=True)
-                         buf.seek(0)
-                         workspace_client.workspace.upload(
-                             path, buf, format=ImportFormat.AUTO, overwrite=overwrite
-                         )
-                     else:
-                         raise
-
-     @contextmanager
-     def _open_write_dbfs(self, encoding: str | None = None, overwrite: bool = True):
-         workspace_client = self.safe_workspace.sdk()
-         path = self.as_dbfs_api_path()
-
-         raw = workspace_client.dbfs.open(path, write=True, overwrite=overwrite)
-
-         if encoding is not None:
-             with io.TextIOWrapper(raw, encoding=encoding) as f:
-                 yield f
-         else:
-             with raw as f:
-                 yield f
-
-         self.clear_cache()
-         self._is_file, self._is_dir = True, False
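
For completeness, a usage sketch of the removed I/O API as it behaved in 0.1.30, based only on the code above; it assumes an authenticated Databricks workspace, and the volume path is hypothetical. Writes were buffered in memory and uploaded when the context exited, with one retry after creating missing parent directories:

    from yggdrasil.databricks.workspaces.databricks_path import DatabricksPath

    report = DatabricksPath.parse("/Volumes/main/default/landing/report.txt")

    # open("w") wraps an in-memory BytesIO in a TextIOWrapper;
    # files.upload() runs when the context manager exits.
    with report.open("w") as f:
        f.write("hello from ygg 0.1.30")

    # open("r") downloads the full file, then wraps it for text decoding.
    with report.open("r") as f:
        print(f.read())

    report.remove()  # files.delete() for volume files; not-found errors are swallowed

The equivalent functionality presumably lives in the new workspaces/path.py and workspaces/io.py modules added in 0.1.32.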