ygg 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1120 @@
1
+ # src/yggdrasil/databricks/workspaces/databricks_path.py
2
+ from __future__ import annotations
3
+
4
+ import dataclasses
5
+ import datetime as dt
6
+ import random
7
+ import string
8
+ import time
9
+ from pathlib import PurePosixPath
10
+ from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable
11
+
12
+ import pyarrow as pa
13
+ from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
14
+ from pyarrow.fs import FileInfo, FileType, FileSystem
15
+ import pyarrow.dataset as ds
16
+
17
+ from .io import DatabricksIO
18
+ from .path_kind import DatabricksPathKind
19
+ from ...libs.databrickslib import databricks
20
+ from ...types import cast_arrow_tabular, cast_polars_dataframe
21
+ from ...types.cast.cast_options import CastOptions
22
+ from ...types.cast.polars_cast import polars_converter
23
+ from ...types.cast.polars_pandas_cast import PolarsDataFrame
24
+ from ...types.cast.registry import convert, register_converter
25
+
26
+ if databricks is not None:
27
+ from databricks.sdk.service.catalog import VolumeType
28
+ from databricks.sdk.service.workspace import ObjectType
29
+ from databricks.sdk.errors.platform import (
30
+ NotFound,
31
+ ResourceDoesNotExist,
32
+ BadRequest,
33
+ PermissionDenied,
34
+ AlreadyExists,
35
+ ResourceAlreadyExists,
36
+ )
37
+
38
+ NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
39
+ ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest
40
+
41
+ if TYPE_CHECKING:
42
+ from .workspace import Workspace
43
+
44
+
45
+ __all__ = [
46
+ "DatabricksPathKind",
47
+ "DatabricksPath",
48
+ ]
49
+
50
+
51
+ def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
52
+ if isinstance(parts, str):
53
+ parts = [parts]
54
+
55
+ if any("/" in part for part in parts):
56
+ new_parts: list[str] = []
57
+
58
+ for part in parts:
59
+ new_parts.extend(_ for _ in part.split("/") if _)
60
+
61
+ parts = new_parts
62
+
63
+ return parts
64
+
65
+
66
+ def _rand_str(n: int) -> str:
67
+ alphabet = string.ascii_letters + string.digits
68
+ return "".join(random.choices(alphabet, k=n))
69
+
70
+
71
+ @dataclasses.dataclass
72
+ class DatabricksPath:
73
+ kind: DatabricksPathKind
74
+ parts: List[str]
75
+
76
+ _workspace: Optional["Workspace"] = None
77
+
78
+ _is_file: Optional[bool] = None
79
+ _is_dir: Optional[bool] = None
80
+ _size: Optional[int] = None
81
+ _mtime: Optional[float] = None
82
+
83
+ def clone_instance(
84
+ self,
85
+ *,
86
+ kind: Optional["DatabricksPathKind"] = None,
87
+ parts: Optional[List[str]] = None,
88
+ workspace: Optional["Workspace"] = dataclasses.MISSING,
89
+ is_file: Optional[bool] = dataclasses.MISSING,
90
+ is_dir: Optional[bool] = dataclasses.MISSING,
91
+ size: Optional[int] = dataclasses.MISSING,
92
+ mtime: Optional[float] = dataclasses.MISSING,
93
+ ) -> "DatabricksPath":
94
+ """
95
+ Return a copy of this DatabricksPath, optionally overriding fields.
96
+ Uses dataclasses.replace semantics but lets you intentionally override
97
+ cached metadata (or keep it as-is by default).
98
+ """
99
+ return dataclasses.replace(
100
+ self,
101
+ kind=self.kind if kind is None else kind,
102
+ parts=list(self.parts) if parts is None else list(parts),
103
+ _workspace=self._workspace if workspace is dataclasses.MISSING else workspace,
104
+ _is_file=self._is_file if is_file is dataclasses.MISSING else is_file,
105
+ _is_dir=self._is_dir if is_dir is dataclasses.MISSING else is_dir,
106
+ _size=self._size if size is dataclasses.MISSING else size,
107
+ _mtime=self._mtime if mtime is dataclasses.MISSING else mtime,
108
+ )
109
+
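A minimal usage sketch of clone_instance, assuming an existing DatabricksPath instance named path and a second Workspace handle other_ws (both hypothetical, not part of this diff):

    # Copy the path but drop all cached metadata (an explicit None overrides the cache;
    # omitted fields keep their current values thanks to the MISSING sentinel defaults).
    fresh = path.clone_instance(is_file=None, is_dir=None, size=None, mtime=None)

    # Same location, rebound to a different workspace handle.
    rebound = path.clone_instance(workspace=other_ws)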
110
+ @classmethod
111
+ def parse(
112
+ cls,
113
+ obj: Union["DatabricksPath", str, List[str]],
114
+ workspace: Optional["Workspace"] = None,
115
+ ) -> "DatabricksPath":
116
+ if not obj:
117
+ return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
118
+
119
+ if not isinstance(obj, (str, list)):
120
+ if isinstance(obj, DatabricksPath):
121
+ if workspace is not None and obj._workspace is None:
122
+ obj._workspace = workspace
123
+ return obj
124
+
125
+ from .io import DatabricksIO
126
+
127
+ if isinstance(obj, DatabricksIO):
128
+ return obj.path
129
+
130
+ if not isinstance(obj, Iterable):
131
+ obj = str(obj)
132
+
133
+ obj = _flatten_parts(obj)
134
+
135
+ if obj and not obj[0]:
136
+ obj = obj[1:]
137
+
138
+ if not obj:
139
+ return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
140
+
141
+ head, *tail = obj
142
+ head = head.casefold()
143
+
144
+ if head == "dbfs":
145
+ kind = DatabricksPathKind.DBFS
146
+ elif head == "workspace":
147
+ kind = DatabricksPathKind.WORKSPACE
148
+ elif head == "volumes":
149
+ kind = DatabricksPathKind.VOLUME
150
+ else:
151
+ raise ValueError(f"Invalid DatabricksPath head {head!r} from {obj!r}, must be in ['dbfs', 'workspace', 'volumes']")
152
+
153
+ return DatabricksPath(kind=kind, parts=tail, _workspace=workspace)
154
+
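A sketch of the parsing rules above. The import paths are taken from the file header comment and the relative imports; the default-constructed Workspace assumes credentials are resolved from the environment:

    from yggdrasil.databricks.workspaces.databricks_path import DatabricksPath, DatabricksPathKind
    from yggdrasil.databricks.workspaces.workspace import Workspace  # module inferred from ".workspace" import above

    ws = Workspace()  # assumption: configuration comes from the environment

    p = DatabricksPath.parse("/Volumes/main/bronze/events", workspace=ws)
    assert p.kind == DatabricksPathKind.VOLUME
    assert p.parts == ["main", "bronze", "events"]

    # Joining re-splits on "/" and drops empty segments.
    child = p / "2024/01"
    assert child.full_path() == "/Volumes/main/bronze/events/2024/01"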
155
+ def __hash__(self):
156
+ return hash(self.full_path())
157
+
158
+ def __eq__(self, other):
159
+ if not isinstance(other, DatabricksPath):
160
+ if isinstance(other, str):
161
+ return str(self) == other
162
+ return False
163
+ return self.kind == other.kind and self.parts == other.parts
164
+
165
+ def __truediv__(self, other):
166
+ if not other:
167
+ return self
168
+
169
+ other_parts = _flatten_parts(other)
170
+
171
+ return DatabricksPath(
172
+ kind=self.kind,
173
+ parts=self.parts + other_parts,
174
+ _workspace=self._workspace,
175
+ )
176
+
177
+ def __enter__(self):
178
+ return self.connect(clone=False)
179
+
180
+ def __exit__(self, exc_type, exc_val, exc_tb):
181
+ if self._workspace is not None:
182
+ self._workspace.__exit__(exc_type, exc_val, exc_tb)
183
+
184
+ def __str__(self):
185
+ return self.full_path()
186
+
187
+ def __repr__(self):
188
+ return self.url()
189
+
190
+ def __fspath__(self):
191
+ return self.full_path()
192
+
193
+ def url(self):
194
+ return "dbfs://%s" % self.full_path()
195
+
196
+ def full_path(self) -> str:
197
+ if self.kind == DatabricksPathKind.DBFS:
198
+ return self.dbfs_full_path()
199
+ elif self.kind == DatabricksPathKind.WORKSPACE:
200
+ return self.workspace_full_path()
201
+ elif self.kind == DatabricksPathKind.VOLUME:
202
+ return self.files_full_path()
203
+ else:
204
+ raise ValueError(f"Unknown DatabricksPath kind: {self.kind!r}")
205
+
206
+ def filesystem(self, workspace: Optional["Workspace"] = None):
207
+ return self.workspace.filesytem(workspace=workspace)
208
+
209
+ @property
210
+ def parent(self):
211
+ if not self.parts:
212
+ return self
213
+
214
+ if self._is_file is not None or self._is_dir is not None:
215
+ _is_file, _is_dir = False, True
216
+ else:
217
+ _is_file, _is_dir = None, None
218
+
219
+ return DatabricksPath(
220
+ kind=self.kind,
221
+ parts=self.parts[:-1],
222
+ _workspace=self._workspace,
223
+ _is_file=_is_file,
224
+ _is_dir=_is_dir,
225
+ )
226
+
227
+ @property
228
+ def workspace(self):
229
+ if self._workspace is None:
230
+ from .workspace import Workspace
231
+
232
+ return Workspace()
233
+ return self._workspace
234
+
235
+ @workspace.setter
236
+ def workspace(self, value):
237
+ self._workspace = value
238
+
239
+ @property
240
+ def name(self) -> str:
241
+ if not self.parts:
242
+ return ""
243
+
244
+ if len(self.parts) == 1:
245
+ return self.parts[-1]
246
+
247
+ return self.parts[-1] if self.parts[-1] else self.parts[-2]
248
+
249
+ @property
250
+ def extension(self) -> str:
251
+ name = self.name
252
+ if "." in name:
253
+ return name.split(".")[-1]
254
+ return ""
255
+
256
+ @property
257
+ def file_format(self) -> FileFormat:
258
+ ext = self.extension
259
+
260
+ if ext == "parquet":
261
+ return ParquetFileFormat()
262
+ elif ext == "csv":
263
+ return CsvFileFormat()
264
+ elif ext == "json":
265
+ return JsonFileFormat()
266
+ else:
267
+ raise ValueError(
268
+ "Cannot get file format from extension %s" % ext
269
+ )
270
+
271
+ @property
272
+ def content_length(self):
273
+ if self._size is None:
274
+ self.refresh_status()
275
+ return self._size
276
+
277
+ @content_length.setter
278
+ def content_length(self, value: int):
279
+ self._size = value
280
+
281
+ @property
282
+ def mtime(self) -> Optional[float]:
283
+ if self._mtime is None:
284
+ self.refresh_status()
285
+ return self._mtime
286
+
287
+ @mtime.setter
288
+ def mtime(self, value: float):
289
+ if not isinstance(value, float):
290
+ if isinstance(value, dt.datetime):
291
+ value = value.timestamp()
292
+ elif isinstance(value, str):
293
+ value = dt.datetime.fromisoformat(value).timestamp()
294
+ else:
295
+ value = float(value)
296
+ self._mtime = value
297
+
298
+ @property
299
+ def file_type(self):
300
+ if self.is_file():
301
+ return FileType.File
302
+ elif self.is_dir():
303
+ return FileType.Directory
304
+ else:
305
+ return FileType.NotFound
306
+
307
+ @property
308
+ def file_info(self):
309
+ return FileInfo(
310
+ path=self.full_path(),
311
+ type=self.file_type,
312
+ mtime=self.mtime,
313
+ size=self.content_length,
314
+ )
315
+
316
+ def is_file(self):
317
+ if self._is_file is None:
318
+ self.refresh_status()
319
+ return self._is_file
320
+
321
+ def is_dir(self):
322
+ if self._is_dir is None:
323
+ self.refresh_status()
324
+ return self._is_dir
325
+
326
+ def is_dir_sink(self):
327
+ return self.is_dir() or (self.parts and self.parts[-1] == "")
328
+
329
+ @property
330
+ def connected(self) -> bool:
331
+ return self._workspace is not None and self._workspace.connected
332
+
333
+ def connect(self, clone: bool = False) -> "DatabricksPath":
334
+ workspace = self.workspace.connect(clone=clone)
335
+
336
+ if clone:
337
+ return self.clone_instance(
338
+ workspace=workspace
339
+ )
340
+
341
+ self._workspace = workspace
342
+
343
+ return self
344
+
345
+ def close(self):
346
+ pass
347
+
348
+ def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
349
+ if self.kind != DatabricksPathKind.VOLUME:
350
+ return None, None, None, None
351
+
352
+ catalog = self.parts[0] if len(self.parts) > 0 and self.parts[0] else None
353
+ schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
354
+ volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None
355
+
356
+ # NOTE: rel is used as a true/false “has relative path” indicator in this file.
357
+ # The runtime value is a list[str] (not PurePosixPath). Keeping it that way to avoid behavior changes.
358
+ return catalog, schema, volume, self.parts[3:] # type: ignore[return-value]
359
+
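An illustration of the split performed by volume_parts (ws is the configured Workspace from the parse sketch above; paths are invented for the example):

    catalog, schema, volume, rel = DatabricksPath.parse(
        "/Volumes/main/bronze/events/part-0.parquet", workspace=ws
    ).volume_parts()
    # -> ('main', 'bronze', 'events', ['part-0.parquet'])

    # Non-volume paths yield all Nones.
    assert DatabricksPath.parse("/dbfs/tmp", workspace=ws).volume_parts() == (None, None, None, None)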
360
+ def refresh_status(self) -> "DatabricksPath":
361
+ if self.kind == DatabricksPathKind.VOLUME:
362
+ self._refresh_volume_status()
363
+ elif self.kind == DatabricksPathKind.WORKSPACE:
364
+ self._refresh_workspace_status()
365
+ elif self.kind == DatabricksPathKind.DBFS:
366
+ self._refresh_dbfs_status()
367
+ return self
368
+
369
+ def _refresh_volume_status(self):
370
+ full_path = self.files_full_path()
371
+ sdk = self.workspace.sdk()
372
+
373
+ try:
374
+ info = sdk.files.get_metadata(full_path)
375
+
376
+ mtime = (
377
+ dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
378
+ if info.last_modified
379
+ else None
380
+ )
381
+
382
+ return self.reset_metadata(is_file=True, is_dir=False, size=info.content_length, mtime=mtime)
383
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
384
+ pass
385
+
386
+ try:
387
+ info = sdk.files.get_directory_metadata(full_path)
388
+ mtime = (
389
+ dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
390
+ if info.last_modified
391
+ else None
392
+ )
393
+
394
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=mtime)
395
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
396
+ pass
397
+
398
+ return self
399
+
400
+ def _refresh_workspace_status(self):
401
+ sdk = self.workspace.sdk()
402
+
403
+ try:
404
+ info = sdk.workspace.get_status(self.workspace_full_path())
405
+ is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
406
+ is_file = not is_dir
407
+ size = info.size
408
+ mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
409
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
410
+ found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
411
+ size = 0
412
+ mtime = found.mtime if found is not None else None
413
+
414
+ if found is None:
415
+ is_file, is_dir = None, None
416
+ else:
417
+ is_file, is_dir = False, True
418
+
419
+ return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
420
+
421
+ def _refresh_dbfs_status(self):
422
+ sdk = self.workspace.sdk()
423
+
424
+ try:
425
+ info = sdk.dbfs.get_status(self.dbfs_full_path())
426
+ is_file, is_dir = not info.is_dir, info.is_dir
427
+ size = info.file_size
428
+ mtime = info.modification_time / 1000.0 if info.modification_time else None
429
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
430
+ found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
431
+ size = 0
432
+ mtime = found.mtime if found is not None else None
433
+
434
+ if found is None:
435
+ is_file, is_dir = None, None
436
+ else:
437
+ is_file, is_dir = False, True
438
+
439
+ return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
440
+
441
+ def reset_metadata(
442
+ self,
443
+ is_file: Optional[bool] = None,
444
+ is_dir: Optional[bool] = None,
445
+ size: Optional[int] = None,
446
+ mtime: Optional[float] = None,
447
+ ):
448
+ self._is_file = is_file
449
+ self._is_dir = is_dir
450
+ self._size = size
451
+ self._mtime = mtime
452
+
453
+ return self
454
+
455
+ # ---- API path normalization helpers ----
456
+
457
+ def workspace_full_path(self) -> str:
458
+ if not self.parts:
459
+ return "/Workspace"
460
+
461
+ parts = self.parts if self.parts[-1] else self.parts[:-1]
462
+
463
+ return "/Workspace/%s" % "/".join(parts)
464
+
465
+ def dbfs_full_path(self) -> str:
466
+ if not self.parts:
467
+ return "/dbfs"
468
+
469
+ parts = self.parts if self.parts[-1] else self.parts[:-1]
470
+
471
+ return "/dbfs/%s" % "/".join(parts)
472
+
473
+ def files_full_path(self) -> str:
474
+ if not self.parts:
475
+ return "/Volumes"
476
+
477
+ parts = self.parts if self.parts[-1] else self.parts[:-1]
478
+
479
+ return "/Volumes/%s" % "/".join(parts)
480
+
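The three *_full_path helpers map the same parts onto different roots; a quick sketch with invented segments:

    parts = ["tmp", "exports", "run-1"]
    DatabricksPath(kind=DatabricksPathKind.DBFS, parts=parts).full_path()       # "/dbfs/tmp/exports/run-1"
    DatabricksPath(kind=DatabricksPathKind.WORKSPACE, parts=parts).full_path()  # "/Workspace/tmp/exports/run-1"
    DatabricksPath(kind=DatabricksPathKind.VOLUME, parts=parts).full_path()     # "/Volumes/tmp/exports/run-1"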
481
+ def exists(self, *, follow_symlinks=True) -> bool:
482
+ return bool(self.is_file() or self.is_dir())
483
+
484
+ def mkdir(self, mode=None, parents=True, exist_ok=True):
485
+ try:
486
+ if self.kind == DatabricksPathKind.WORKSPACE:
487
+ self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
488
+ elif self.kind == DatabricksPathKind.VOLUME:
489
+ self.make_volume_dir(parents=parents, exist_ok=exist_ok)
490
+ elif self.kind == DatabricksPathKind.DBFS:
491
+ self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
492
+ except (NotFound, ResourceDoesNotExist):
493
+ if not parents or self.parent == self:
494
+ raise
495
+
496
+ self.parent.mkdir(parents=True, exist_ok=True)
497
+ self.mkdir(parents=False, exist_ok=exist_ok)
498
+ except (AlreadyExists, ResourceAlreadyExists):
499
+ if not exist_ok:
500
+ raise
501
+
502
+ return self
503
+
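A sketch of mkdir on a Unity Catalog volume path; when the catalog, schema, or volume is missing, the retry path goes through _ensure_volume before creating the directory (ws is an assumed, already-configured Workspace):

    DatabricksPath.parse(
        "/Volumes/main/bronze/landing/2024", workspace=ws
    ).mkdir(parents=True, exist_ok=True)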
504
+ def _ensure_volume(self, exist_ok: bool = True, sdk=None):
505
+ catalog_name, schema_name, volume_name, rel = self.volume_parts()
506
+ sdk = self.workspace.sdk() if sdk is None else sdk
507
+
508
+ if catalog_name:
509
+ try:
510
+ sdk.catalogs.create(name=catalog_name)
511
+ except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
512
+ if not exist_ok:
513
+ raise
514
+
515
+ if schema_name:
516
+ try:
517
+ sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
518
+ except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
519
+ if not exist_ok:
520
+ raise
521
+
522
+ if volume_name:
523
+ try:
524
+ sdk.volumes.create(
525
+ catalog_name=catalog_name,
526
+ schema_name=schema_name,
527
+ name=volume_name,
528
+ volume_type=VolumeType.MANAGED,
529
+ )
530
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
531
+ if not exist_ok:
532
+ raise
533
+
534
+ def make_volume_dir(self, parents=True, exist_ok=True):
535
+ path = self.files_full_path()
536
+ sdk = self.workspace.sdk()
537
+
538
+ try:
539
+ sdk.files.create_directory(path)
540
+ except (BadRequest, NotFound, ResourceDoesNotExist) as e:
541
+ if not parents:
542
+ raise
543
+
544
+ message = str(e)
545
+ if "not exist" in message:
546
+ self._ensure_volume(sdk=sdk)
547
+
548
+ sdk.files.create_directory(path)
549
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
550
+ if not exist_ok:
551
+ raise
552
+
553
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
554
+
555
+ def make_workspace_dir(self, parents=True, exist_ok=True):
556
+ path = self.workspace_full_path()
557
+ sdk = self.workspace.sdk()
558
+
559
+ try:
560
+ sdk.workspace.mkdirs(path)
561
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
562
+ if not exist_ok:
563
+ raise
564
+
565
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
566
+
567
+ def make_dbfs_dir(self, parents=True, exist_ok=True):
568
+ path = self.dbfs_full_path()
569
+ sdk = self.workspace.sdk()
570
+
571
+ try:
572
+ sdk.dbfs.mkdirs(path)
573
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
574
+ if not exist_ok:
575
+ raise
576
+
577
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
578
+
579
+ def remove(self, recursive: bool = True):
580
+ if self.kind == DatabricksPathKind.VOLUME:
581
+ return self._remove_volume_obj(recursive=recursive)
582
+ elif self.kind == DatabricksPathKind.WORKSPACE:
583
+ return self._remove_workspace_obj(recursive=recursive)
584
+ elif self.kind == DatabricksPathKind.DBFS:
585
+ return self._remove_dbfs_obj(recursive=recursive)
586
+
587
+ def _remove_volume_obj(self, recursive: bool = True):
588
+ if self.is_file():
589
+ return self._remove_volume_file()
590
+ return self._remove_volume_dir(recursive=recursive)
591
+
592
+ def _remove_workspace_obj(self, recursive: bool = True):
593
+ if self.is_file():
594
+ return self._remove_workspace_file()
595
+ return self._remove_workspace_dir(recursive=recursive)
596
+
597
+ def _remove_dbfs_obj(self, recursive: bool = True):
598
+ if self.is_file():
599
+ return self._remove_dbfs_file()
600
+ return self._remove_dbfs_dir(recursive=recursive)
601
+
602
+ def rmfile(self):
603
+ try:
604
+ if self.kind == DatabricksPathKind.VOLUME:
605
+ return self._remove_volume_file()
606
+ elif self.kind == DatabricksPathKind.WORKSPACE:
607
+ return self._remove_workspace_file()
608
+ elif self.kind == DatabricksPathKind.DBFS:
609
+ return self._remove_dbfs_file()
610
+ finally:
611
+ self.reset_metadata()
612
+ return self
613
+
614
+ def _remove_volume_file(self):
615
+ sdk = self.workspace.sdk()
616
+ try:
617
+ sdk.files.delete(self.files_full_path())
618
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
619
+ pass
620
+ return self
621
+
622
+ def _remove_workspace_file(self):
623
+ sdk = self.workspace.sdk()
624
+ try:
625
+ sdk.workspace.delete(self.workspace_full_path(), recursive=True)
626
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
627
+ pass
628
+ return self
629
+
630
+ def _remove_dbfs_file(self):
631
+ sdk = self.workspace.sdk()
632
+ try:
633
+ sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
634
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
635
+ pass
636
+ return self
637
+
638
+ def rmdir(self, recursive: bool = True):
639
+ if self.kind == DatabricksPathKind.VOLUME:
640
+ return self._remove_volume_dir(recursive=recursive)
641
+ elif self.kind == DatabricksPathKind.WORKSPACE:
642
+ return self._remove_workspace_dir(recursive=recursive)
643
+ elif self.kind == DatabricksPathKind.DBFS:
644
+ return self._remove_dbfs_dir(recursive=recursive)
645
+
646
+ def _remove_workspace_dir(self, recursive: bool = True):
647
+ sdk = self.workspace.sdk()
648
+ try:
649
+ sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
650
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
651
+ pass
652
+ self.reset_metadata()
653
+ return self
654
+
655
+ def _remove_dbfs_dir(self, recursive: bool = True):
656
+ sdk = self.workspace.sdk()
657
+ try:
658
+ sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
659
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
660
+ pass
661
+ self.reset_metadata()
662
+ return self
663
+
664
+ def _remove_volume_dir(self, recursive: bool = True):
665
+ root_path = self.files_full_path()
666
+ catalog_name, schema_name, volume_name, rel = self.volume_parts()
667
+ sdk = self.workspace.sdk()
668
+
669
+ if rel:
670
+ try:
671
+ sdk.files.delete_directory(root_path)
672
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
673
+ message = str(e)
674
+ if recursive and "directory is not empty" in message:
675
+ for child_path in self.ls():
676
+ child_path._remove_volume_obj(recursive=True)
677
+ sdk.files.delete_directory(root_path)
678
+ else:
679
+ pass
680
+ elif volume_name:
681
+ try:
682
+ sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
683
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
684
+ pass
685
+ elif schema_name:
686
+ try:
687
+ sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
688
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
689
+ pass
690
+
691
+ return self.reset_metadata()
692
+
693
+ def ls(self, recursive: bool = False, fetch_size: Optional[int] = None, allow_not_found: bool = True):
694
+ if self.kind == DatabricksPathKind.VOLUME:
695
+ yield from self._ls_volume(recursive=recursive, fetch_size=fetch_size, allow_not_found=allow_not_found)
696
+ elif self.kind == DatabricksPathKind.WORKSPACE:
697
+ yield from self._ls_workspace(recursive=recursive, allow_not_found=allow_not_found)
698
+ elif self.kind == DatabricksPathKind.DBFS:
699
+ yield from self._ls_dbfs(recursive=recursive, allow_not_found=allow_not_found)
700
+
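Listing returns a generator, and the yielded paths carry the file/dir flags and sizes reported by the listing APIs where available; a sketch (ws assumed as above):

    root = DatabricksPath.parse("/Volumes/main/bronze", workspace=ws)
    for child in root.ls(recursive=True):
        print(child.full_path(), child.is_file())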
701
+ def _ls_volume(self, recursive: bool = False, fetch_size: Optional[int] = None, allow_not_found: bool = True):
702
+ catalog_name, schema_name, volume_name, rel = self.volume_parts()
703
+ sdk = self.workspace.sdk()
704
+
705
+ if rel is None:
706
+ if volume_name is None:
707
+ try:
708
+ for info in sdk.volumes.list(catalog_name=catalog_name, schema_name=schema_name):
709
+ base = DatabricksPath(
710
+ kind=DatabricksPathKind.VOLUME,
711
+ parts=[info.catalog_name, info.schema_name, info.name],
712
+ _workspace=self.workspace,
713
+ _is_file=False,
714
+ _is_dir=True,
715
+ _size=0,
716
+ )
717
+ if recursive:
718
+ yield from base._ls_volume(recursive=recursive)
719
+ else:
720
+ yield base
721
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
722
+ if not allow_not_found:
723
+ raise
724
+ elif schema_name is None:
725
+ try:
726
+ for info in sdk.schemas.list(catalog_name=catalog_name):
727
+ base = DatabricksPath(
728
+ kind=DatabricksPathKind.VOLUME,
729
+ parts=[info.catalog_name, info.name],
730
+ _workspace=self.workspace,
731
+ _is_file=False,
732
+ _is_dir=True,
733
+ _size=0,
734
+ )
735
+ if recursive:
736
+ yield from base._ls_volume(recursive=recursive)
737
+ else:
738
+ yield base
739
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
740
+ if not allow_not_found:
741
+ raise
742
+ else:
743
+ try:
744
+ for info in sdk.catalogs.list():
745
+ base = DatabricksPath(
746
+ kind=DatabricksPathKind.VOLUME,
747
+ parts=[info.name],
748
+ _workspace=self.workspace,
749
+ _is_file=False,
750
+ _is_dir=True,
751
+ _size=0,
752
+ )
753
+ if recursive:
754
+ yield from base._ls_volume(recursive=recursive)
755
+ else:
756
+ yield base
757
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
758
+ if not allow_not_found:
759
+ raise
760
+ else:
761
+ full_path = self.files_full_path()
762
+
763
+ try:
764
+ for info in sdk.files.list_directory_contents(full_path, page_size=fetch_size):
765
+ base = DatabricksPath(
766
+ kind=DatabricksPathKind.VOLUME,
767
+ parts=info.path.split("/")[2:],
768
+ _workspace=self.workspace,
769
+ _is_file=not info.is_directory,
770
+ _is_dir=info.is_directory,
771
+ _size=info.file_size,
772
+ )
773
+
774
+ if recursive and info.is_directory:
775
+ yield from base._ls_volume(recursive=recursive)
776
+ else:
777
+ yield base
778
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
779
+ if not allow_not_found:
780
+ raise
781
+
782
+ def _ls_workspace(self, recursive: bool = True, allow_not_found: bool = True):
783
+ sdk = self.workspace.sdk()
784
+ full_path = self.workspace_full_path()
785
+
786
+ try:
787
+ for info in sdk.workspace.list(full_path, recursive=recursive):
788
+ is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
789
+ yield DatabricksPath(
790
+ kind=DatabricksPathKind.WORKSPACE,
791
+ parts=info.path.split("/")[2:],
792
+ _workspace=self.workspace,
793
+ _is_file=not is_dir,
794
+ _is_dir=is_dir,
795
+ _size=info.size,
796
+ )
797
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
798
+ if not allow_not_found:
799
+ raise
800
+
801
+ def _ls_dbfs(self, recursive: bool = True, allow_not_found: bool = True):
802
+ sdk = self.workspace.sdk()
803
+ full_path = self.dbfs_full_path()
804
+
805
+ try:
806
+ for info in sdk.dbfs.list(full_path, recursive=recursive):
807
+ yield DatabricksPath(
808
+ kind=DatabricksPathKind.DBFS,
809
+ parts=info.path.split("/")[2:],
810
+ _workspace=self.workspace,
811
+ _is_file=not info.is_dir,
812
+ _is_dir=info.is_dir,
813
+ _size=info.file_size,
814
+ )
815
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
816
+ if not allow_not_found:
817
+ raise
818
+
819
+ def open(
820
+ self,
821
+ mode="rb",
822
+ encoding=None,
823
+ clone: bool = False,
824
+ ) -> DatabricksIO:
825
+ path = self.connect(clone=clone)
826
+
827
+ return (
828
+ DatabricksIO
829
+ .create_instance(path=path, mode=mode, encoding=encoding)
830
+ .connect(clone=False)
831
+ )
832
+
833
+ def copy_to(
834
+ self,
835
+ dest: Union["DatabricksIO", "DatabricksPath", str],
836
+ allow_not_found: bool = True,
837
+ ) -> None:
838
+ if self.is_file() and dest.is_file():
839
+ with self.open(mode="rb") as src:
840
+ src.copy_to(dest=dest)
841
+
842
+ elif self.is_dir():
843
+ dest_base = self.parse(obj=dest, workspace=self.workspace)
844
+ dest_base.mkdir(parents=True, exist_ok=True)
845
+
846
+ skip_base_parts = len(self.parts)
847
+
848
+ for src_child in self.ls(recursive=True, allow_not_found=True):
849
+ src_child: DatabricksPath = src_child
850
+ dest_child_parts = dest_base.parts + src_child.parts[skip_base_parts:]
851
+
852
+ src_child.copy_to(
853
+ dest=dest_base.clone_instance(parts=dest_child_parts),
854
+ allow_not_found=allow_not_found
855
+ )
856
+
857
+ elif allow_not_found:
858
+ return None
859
+
860
+ else:
861
+ raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
862
+
863
+ # -------------------------
864
+ # Data ops (Arrow / Pandas / Polars)
865
+ # -------------------------
866
+ def arrow_dataset(
867
+ self,
868
+ workspace: Optional["Workspace"] = None,
869
+ filesystem: Optional[FileSystem] = None,
870
+ **kwargs
871
+ ):
872
+ filesystem = self.filesystem(workspace=workspace) if filesystem is None else filesystem
873
+
874
+ return ds.dataset(
875
+ source=self.full_path(),
876
+ filesystem=filesystem,
877
+ **kwargs
878
+ )
879
+
880
+ def read_arrow_table(
881
+ self,
882
+ batch_size: Optional[int] = None,
883
+ concat: bool = True,
884
+ **kwargs
885
+ ) -> pa.Table:
886
+ if self.is_file():
887
+ with self.open("rb") as f:
888
+ return f.read_arrow_table(batch_size=batch_size, **kwargs)
889
+
890
+ if self.is_dir():
891
+ tables: list[pa.Table] = []
892
+ for child in self.ls(recursive=True):
893
+ if child.is_file():
894
+ with child.open("rb") as f:
895
+ tables.append(f.read_arrow_table(batch_size=batch_size, **kwargs))
896
+
897
+ if not tables:
898
+ return pa.Table.from_batches([], schema=pa.schema([]))
899
+
900
+ if not concat:
901
+ # NOTE: returns a list of pa.Table when concat=False
902
+ return tables  # type: ignore[return-value]  # caller asked for raw list
903
+
904
+ try:
905
+ return pa.concat_tables(tables)
906
+ except Exception:
907
+ # Fallback: concat via polars (diagonal relaxed) then back to Arrow
908
+ from polars import CompatLevel
909
+
910
+ return self.read_polars(
911
+ batch_size=batch_size,
912
+ how="diagonal_relaxed",
913
+ rechunk=True,
914
+ concat=True,
915
+ **kwargs,
916
+ ).to_arrow(compat_level=CompatLevel.newest())
917
+
918
+ raise FileNotFoundError(f"Path does not exist: {self}")
919
+
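A read sketch: a single file is read directly, while a directory is walked recursively and its files concatenated into one table (ws and the path are assumptions for illustration):

    import pyarrow as pa

    tbl = DatabricksPath.parse("/Volumes/main/bronze/events", workspace=ws).read_arrow_table()
    assert isinstance(tbl, pa.Table)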
920
+ def write_arrow(
921
+ self,
922
+ table: Union[pa.Table, pa.RecordBatch],
923
+ batch_size: Optional[int] = None,
924
+ **kwargs
925
+ ):
926
+ if not isinstance(table, pa.Table):
927
+ table = convert(table, pa.Table)
928
+
929
+ return self.write_arrow_table(
930
+ table=table,
931
+ batch_size=batch_size,
932
+ **kwargs
933
+ )
934
+
935
+ def write_arrow_table(
936
+ self,
937
+ table: pa.Table,
938
+ batch_size: Optional[int] = None,
939
+ **kwargs
940
+ ):
941
+ with self.connect(clone=False) as connected:
942
+ if connected.is_dir_sink():
943
+ seed = int(time.time() * 1000)
944
+
945
+ for i, batch in enumerate(table.to_batches(max_chunksize=batch_size)):
946
+ part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
947
+
948
+ with part_path.open(mode="wb") as f:
949
+ f.write_arrow_batch(batch)
950
+
951
+ return connected
952
+
953
+ connected.open(mode="wb", clone=False).write_arrow_table(
954
+ table,
955
+ batch_size=batch_size,
956
+ **kwargs
957
+ )
958
+
959
+ return self
960
+
961
+ def read_pandas(
962
+ self,
963
+ batch_size: Optional[int] = None,
964
+ concat: bool = True,
965
+ **kwargs
966
+ ):
967
+ if concat:
968
+ return self.read_arrow_table(batch_size=batch_size, concat=True, **kwargs).to_pandas()
969
+
970
+ tables = self.read_arrow_table(batch_size=batch_size, concat=False, **kwargs)
971
+ return [t.to_pandas() for t in tables] # type: ignore[arg-type]
972
+
973
+ def write_pandas(
974
+ self,
975
+ df,
976
+ batch_size: Optional[int] = None,
977
+ **kwargs
978
+ ):
979
+ return self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
980
+
981
+ def read_polars(
982
+ self,
983
+ batch_size: Optional[int] = None,
984
+ how: str = "diagonal_relaxed",
985
+ rechunk: bool = False,
986
+ concat: bool = True,
987
+ **kwargs
988
+ ):
989
+ import polars as pl
990
+
991
+ if self.is_file():
992
+ with self.open("rb") as f:
993
+ return f.read_polars(batch_size=batch_size, **kwargs)
994
+
995
+ if self.is_dir():
996
+ dfs = []
997
+ for child in self.ls(recursive=True):
998
+ if child.is_file():
999
+ with child.open("rb") as f:
1000
+ dfs.append(f.read_polars(batch_size=batch_size, **kwargs))
1001
+
1002
+ if not dfs:
1003
+ return pl.DataFrame()
1004
+
1005
+ if concat:
1006
+ return pl.concat(dfs, how=how, rechunk=rechunk)
1007
+ return dfs # type: ignore[return-value]
1008
+
1009
+ raise FileNotFoundError(f"Path does not exist: {self}")
1010
+
1011
+ def write_polars(
1012
+ self,
1013
+ df,
1014
+ batch_size: Optional[int] = None,
1015
+ **kwargs
1016
+ ):
1017
+ """
1018
+ Write Polars to a DatabricksPath.
1019
+
1020
+ Behavior:
1021
+ - If path is a directory (or ends with a trailing "/"): shard to parquet parts.
1022
+ `batch_size` = rows per part (default 1_000_000).
1023
+ - If path is a file: write using DatabricksIO.write_polars which is extension-driven
1024
+ (parquet/csv/ipc/json/ndjson etc.).
1025
+
1026
+ Notes:
1027
+ - If `df` is a LazyFrame, it is collected eagerly before writing.
1028
+ """
1029
+ import polars as pl
1030
+
1031
+ if isinstance(df, pl.LazyFrame):
1032
+ df = df.collect()
1033
+
1034
+ if not isinstance(df, pl.DataFrame):
1035
+ raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")
1036
+
1037
+ with self.connect() as connected:
1038
+ if connected.is_dir_sink():
1039
+ seed = int(time.time() * 1000)
1040
+ rows_per_part = batch_size or 1_000_000
1041
+
1042
+ # Always parquet for directory sinks (lake layout standard)
1043
+ for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
1044
+ part_path = connected / f"part-{i:05d}-{seed}-{_rand_str(4)}.parquet"
1045
+
1046
+ part_path.write_polars(chunk, **kwargs)
1047
+
1048
+ return connected
1049
+
1050
+ # Single file write: format/extension is handled in DatabricksIO.write_polars
1051
+ connected.open(mode="wb", clone=False).write_polars(df, **kwargs)
1052
+
1053
+ return connected
1054
+
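A write sketch under the same assumptions (a configured Workspace ws): directory targets are sharded into parquet parts, while file targets are written through DatabricksIO based on the extension:

    import polars as pl

    df = pl.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # Directory sink: the target must already resolve as a directory.
    dest = DatabricksPath.parse("/Volumes/main/bronze/events", workspace=ws)
    dest.mkdir()
    dest.write_polars(df, batch_size=2)  # -> part-00000-*.parquet, part-00001-*.parquet

    # Single-file sink: format follows the ".parquet" extension.
    DatabricksPath.parse("/Volumes/main/bronze/events.parquet", workspace=ws).write_polars(df)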
1055
+ def sql(
1056
+ self,
1057
+ query: str,
1058
+ engine: str = "auto"
1059
+ ):
1060
+ if engine == "auto":
1061
+ try:
1062
+ import duckdb
1063
+ engine = "duckdb"
1064
+ except ImportError:
1065
+ engine = "polars"
1066
+
1067
+ from_table = "dbfs.`%s`" % self.full_path()
1068
+
1069
+ if from_table not in query:
1070
+ raise ValueError(
1071
+ "SQL query must contain %s to execute query:\n%s" % (
1072
+ from_table,
1073
+ query
1074
+ )
1075
+ )
1076
+
1077
+ if engine == "duckdb":
1078
+ import duckdb
1079
+
1080
+ __arrow_table__ = self.read_arrow_table()
1081
+
1082
+ return (
1083
+ duckdb.connect()
1084
+ .execute(query=query.replace(from_table, "__arrow_table__"))
1085
+ .fetch_arrow_table()
1086
+ )
1087
+ elif engine == "polars":
1088
+ from polars import CompatLevel
1089
+
1090
+ return (
1091
+ self.read_polars()
1092
+ .sql(query=query.replace(from_table, "self"))
1093
+ .to_arrow(compat_level=CompatLevel.newest())
1094
+ )
1095
+ else:
1096
+ raise ValueError(
1097
+ "Invalid engine %s, must be in duckdb, polars" % engine
1098
+ )
1099
+
1100
+
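The sql helper requires the query to reference the path as dbfs.`<full_path>`; that reference is swapped for the loaded table before execution. A sketch with assumed names:

    p = DatabricksPath.parse("/Volumes/main/bronze/events", workspace=ws)
    ref = "dbfs.`%s`" % p.full_path()

    counts = p.sql("SELECT name, count(*) AS n FROM %s GROUP BY name" % ref)  # returns a pyarrow.Table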
1101
+ @register_converter(DatabricksPath, pa.Table)
1102
+ def databricks_path_to_arrow_table(
1103
+ data: DatabricksPath,
1104
+ options: Optional[CastOptions] = None,
1105
+ ) -> pa.Table:
1106
+ return cast_arrow_tabular(
1107
+ data.read_arrow_table(),
1108
+ options
1109
+ )
1110
+
1111
+
1112
+ @polars_converter(DatabricksPath, PolarsDataFrame)
1113
+ def databricks_path_to_polars(
1114
+ data: DatabricksPath,
1115
+ options: Optional[CastOptions] = None,
1116
+ ) -> PolarsDataFrame:
1117
+ return cast_polars_dataframe(
1118
+ data.read_polars(),
1119
+ options
1120
+ )
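The two registered converters plug DatabricksPath into the generic cast registry, so a path can be handed straight to convert; a sketch (absolute import paths inferred from the relative imports at the top of the file, ws assumed as before):

    import pyarrow as pa
    from yggdrasil.types.cast.registry import convert
    from yggdrasil.databricks.workspaces.databricks_path import DatabricksPath

    table = convert(DatabricksPath.parse("/Volumes/main/bronze/events", workspace=ws), pa.Table)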