ygg 0.1.51__py3-none-any.whl → 0.1.53__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,15 +9,17 @@ import random
9
9
  import string
10
10
  import time
11
11
  from pathlib import PurePosixPath
12
- from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable
12
+ from typing import Optional, Tuple, Union, TYPE_CHECKING, List
13
13
 
14
14
  import pyarrow as pa
15
15
  import pyarrow.dataset as ds
16
+ from pyarrow import ArrowInvalid
16
17
  from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
17
18
  from pyarrow.fs import FileInfo, FileType, FileSystem
18
19
 
19
20
  from .io import DatabricksIO
20
21
  from .path_kind import DatabricksPathKind
22
+ from .volumes_path import get_volume_status, get_volume_metadata
21
23
  from ...libs.databrickslib import databricks
22
24
  from ...libs.pandaslib import PandasDataFrame
23
25
  from ...libs.polarslib import polars, PolarsDataFrame
@@ -25,9 +27,10 @@ from ...types.cast.arrow_cast import cast_arrow_tabular
25
27
  from ...types.cast.cast_options import CastOptions
26
28
  from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
27
29
  from ...types.cast.registry import convert, register_converter
30
+ from ...types.file_format import ExcelFileFormat
28
31
 
29
32
  if databricks is not None:
30
- from databricks.sdk.service.catalog import VolumeType
33
+ from databricks.sdk.service.catalog import VolumeType, PathOperation, VolumeInfo
31
34
  from databricks.sdk.service.workspace import ObjectType
32
35
  from databricks.sdk.errors.platform import (
33
36
  NotFound,
@@ -51,7 +54,9 @@ __all__ = [
51
54
  ]
52
55
 
53
56
 
54
- def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
57
+ def _flatten_parts(
58
+ parts: Union["DatabricksPath", List[str], str],
59
+ ) -> List[str]:
55
60
  """Normalize path parts by splitting on '/' and removing empties.
56
61
 
57
62
  Args:
@@ -60,8 +65,13 @@ def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
60
65
  Returns:
61
66
  A flattened list of path components.
62
67
  """
63
- if isinstance(parts, str):
64
- parts = [parts]
68
+ if not isinstance(parts, list):
69
+ if isinstance(parts, DatabricksPath):
70
+ return parts.parts
71
+ elif isinstance(parts, (set, tuple)):
72
+ parts = list(parts)
73
+ else:
74
+ parts = [str(parts).replace("\\", "/")]
65
75
 
66
76
  if any("/" in part for part in parts):
67
77
  new_parts: list[str] = []
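A standalone sketch of the normalization the widened _flatten_parts now performs, reimplemented here so it runs outside the package (the real function also short-circuits DatabricksPath inputs by returning their .parts, a branch omitted below to keep the sketch self-contained):

    from typing import List, Union

    def flatten_parts_sketch(parts: Union[List[str], str]) -> List[str]:
        if not isinstance(parts, list):
            if isinstance(parts, (set, tuple)):
                parts = list(parts)
            else:
                # New in this diff: scalar inputs get Windows separators normalized.
                parts = [str(parts).replace("\\", "/")]

        # Split any part containing "/" and drop empty components (per the docstring).
        if any("/" in part for part in parts):
            new_parts: List[str] = []
            for part in parts:
                new_parts.extend(p for p in part.split("/") if p)
            return new_parts
        return parts

    assert flatten_parts_sketch("Volumes\\main\\bronze") == ["Volumes", "main", "bronze"]
    assert flatten_parts_sketch(["Volumes", "main/bronze"]) == ["Volumes", "main", "bronze"]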
@@ -92,13 +102,16 @@ class DatabricksPath:
92
102
  """Path wrapper for Databricks workspace, volumes, and DBFS objects."""
93
103
  kind: DatabricksPathKind
94
104
  parts: List[str]
105
+ temporary: bool = False
106
+
107
+ _is_file: Optional[bool] = dataclasses.field(repr=False, hash=False, default=None)
108
+ _is_dir: Optional[bool] = dataclasses.field(repr=False, hash=False, default=None)
109
+ _size: Optional[int] = dataclasses.field(repr=False, hash=False, default=None)
110
+ _mtime: Optional[float] = dataclasses.field(repr=False, hash=False, default=None)
95
111
 
96
- _workspace: Optional["Workspace"] = None
112
+ _workspace: Optional["Workspace"] = dataclasses.field(repr=False, hash=False, default=None)
97
113
 
98
- _is_file: Optional[bool] = None
99
- _is_dir: Optional[bool] = None
100
- _size: Optional[int] = None
101
- _mtime: Optional[float] = None
114
+ _volume_info: Optional["VolumeInfo"] = dataclasses.field(repr=False, hash=False, default=None)
102
115
 
103
116
  def clone_instance(
104
117
  self,
@@ -110,6 +123,7 @@ class DatabricksPath:
110
123
  is_dir: Optional[bool] = dataclasses.MISSING,
111
124
  size: Optional[int] = dataclasses.MISSING,
112
125
  mtime: Optional[float] = dataclasses.MISSING,
126
+ volume_info: Optional["VolumeInfo"] = dataclasses.MISSING,
113
127
  ) -> "DatabricksPath":
114
128
  """
115
129
  Return a copy of this DatabricksPath, optionally overriding fields.
@@ -125,6 +139,21 @@ class DatabricksPath:
125
139
  _is_dir=self._is_dir if is_dir is dataclasses.MISSING else is_dir,
126
140
  _size=self._size if size is dataclasses.MISSING else size,
127
141
  _mtime=self._mtime if mtime is dataclasses.MISSING else mtime,
142
+ _volume_info=self._volume_info if volume_info is dataclasses.MISSING else volume_info,
143
+ )
144
+
145
+ @classmethod
146
+ def empty_instance(cls, workspace: Optional["Workspace"] = None):
147
+ return DatabricksPath(
148
+ kind=DatabricksPathKind.DBFS,
149
+ parts=[],
150
+ temporary=False,
151
+ _workspace=workspace,
152
+ _is_file=False,
153
+ _is_dir=False,
154
+ _size=0,
155
+ _mtime=0.0,
156
+ _volume_info=None,
128
157
  )
129
158
 
130
159
  @classmethod
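empty_instance() pre-populates the metadata cache so an empty path can answer existence checks without a workspace round trip. Illustrative expectations, assuming DatabricksPath is importable from the package (the exact module path is not shown in this diff):

    empty = DatabricksPath.empty_instance()
    empty.kind      # DatabricksPathKind.DBFS
    empty.parts     # []
    empty.exists()  # False, answered from the cached _is_file / _is_dir flags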
@@ -132,18 +161,20 @@ class DatabricksPath:
132
161
  cls,
133
162
  obj: Union["DatabricksPath", str, List[str]],
134
163
  workspace: Optional["Workspace"] = None,
164
+ temporary: bool = False
135
165
  ) -> "DatabricksPath":
136
166
  """Parse input into a DatabricksPath instance.
137
167
 
138
168
  Args:
139
169
  obj: Input path, DatabricksPath, or path parts list.
140
170
  workspace: Optional Workspace to bind to the path.
171
+ temporary: Whether the path points to a temporary location that is removed when close() is called.
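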
141
172
 
142
173
  Returns:
143
174
  A DatabricksPath instance.
144
175
  """
145
176
  if not obj:
146
- return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
177
+ return cls.empty_instance(workspace=workspace)
147
178
 
148
179
  if not isinstance(obj, (str, list)):
149
180
  if isinstance(obj, DatabricksPath):
@@ -156,30 +187,35 @@ class DatabricksPath:
156
187
  if isinstance(obj, DatabricksIO):
157
188
  return obj.path
158
189
 
159
- if not isinstance(obj, Iterable):
190
+ else:
160
191
  obj = str(obj)
161
192
 
193
+
162
194
  obj = _flatten_parts(obj)
163
195
 
164
196
  if obj and not obj[0]:
165
197
  obj = obj[1:]
166
198
 
167
199
  if not obj:
168
- return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
200
+ return cls.empty_instance(workspace=workspace)
169
201
 
170
202
  head, *tail = obj
171
- head = head.casefold()
172
203
 
173
204
  if head == "dbfs":
174
205
  kind = DatabricksPathKind.DBFS
175
- elif head == "workspace":
206
+ elif head in {"Workspace", "workspace"}:
176
207
  kind = DatabricksPathKind.WORKSPACE
177
- elif head == "volumes":
208
+ elif head in {"Volumes", "volumes"}:
178
209
  kind = DatabricksPathKind.VOLUME
179
210
  else:
180
211
  raise ValueError(f"Invalid DatabricksPath head {head!r} from {obj!r}, must be in ['dbfs', 'workspace', 'volumes']")
181
212
 
182
- return DatabricksPath(kind=kind, parts=tail, _workspace=workspace)
213
+ return DatabricksPath(
214
+ kind=kind,
215
+ parts=tail,
216
+ temporary=temporary,
217
+ _workspace=workspace,
218
+ )
183
219
 
184
220
  def __hash__(self):
185
221
  return hash(self.full_path())
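Taken together, the two hunks above mean parse() now accepts the capitalized heads "Workspace" and "Volumes" verbatim (the casefold step is gone), forwards a temporary flag into the constructed path, and returns empty_instance() for empty input. A hedged usage sketch with hypothetical paths, again assuming DatabricksPath is importable:

    p = DatabricksPath.parse("/Volumes/main/bronze/raw/events.json")
    p.kind   # DatabricksPathKind.VOLUME
    p.parts  # ["main", "bronze", "raw", "events.json"]

    DatabricksPath.parse("/Workspace/Users/someone@example.com").kind
    # DatabricksPathKind.WORKSPACE

    # Scratch locations can be flagged for cleanup when close() is called.
    tmp = DatabricksPath.parse("/Volumes/main/bronze/_tmp/run-1", temporary=True)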
@@ -259,16 +295,19 @@ class DatabricksPath:
259
295
  return self
260
296
 
261
297
  if self._is_file is not None or self._is_dir is not None:
262
- _is_file, _is_dir = False, True
298
+ _is_file, _is_dir, _size = False, True, 0
263
299
  else:
264
- _is_file, _is_dir = None, None
300
+ _is_file, _is_dir, _size = None, None, None
265
301
 
266
302
  return DatabricksPath(
267
303
  kind=self.kind,
268
304
  parts=self.parts[:-1],
305
+ temporary=False,
269
306
  _workspace=self._workspace,
270
307
  _is_file=_is_file,
271
308
  _is_dir=_is_dir,
309
+ _size=_size,
310
+ _volume_info=self._volume_info
272
311
  )
273
312
 
274
313
  @property
@@ -281,7 +320,7 @@ class DatabricksPath:
281
320
  if self._workspace is None:
282
321
  from .workspace import Workspace
283
322
 
284
- return Workspace()
323
+ self._workspace = Workspace()
285
324
  return self._workspace
286
325
 
287
326
  @workspace.setter
@@ -330,13 +369,15 @@ class DatabricksPath:
330
369
  return CsvFileFormat()
331
370
  elif ext == "json":
332
371
  return JsonFileFormat()
372
+ elif ext in {"xlsx", "xlsm", "xls"}:
373
+ return ExcelFileFormat()
333
374
  else:
334
375
  raise ValueError(
335
376
  "Cannot get file format from extension %s" % ext
336
377
  )
337
378
 
338
379
  @property
339
- def content_length(self):
380
+ def content_length(self) -> int:
340
381
  """Return the size of the path in bytes if known.
341
382
 
342
383
  Returns:
@@ -344,10 +385,10 @@ class DatabricksPath:
344
385
  """
345
386
  if self._size is None:
346
387
  self.refresh_status()
347
- return self._size
388
+ return self._size or 0
348
389
 
349
390
  @content_length.setter
350
- def content_length(self, value: int):
391
+ def content_length(self, value: Optional[int]):
351
392
  self._size = value
352
393
 
353
394
  @property
@@ -390,6 +431,10 @@ class DatabricksPath:
390
431
  size=self.content_length,
391
432
  )
392
433
 
434
+ @property
435
+ def is_local(self):
436
+ return False
437
+
393
438
  def is_file(self):
394
439
  """Return True when the path is a file.
395
440
 
@@ -416,7 +461,16 @@ class DatabricksPath:
416
461
  Returns:
417
462
  True if the path represents a directory sink.
418
463
  """
419
- return self.is_dir() or (self.parts and self.parts[-1] == "")
464
+ if self.is_dir():
465
+ return True
466
+
467
+ if self.is_file():
468
+ return False
469
+
470
+ if self.parts and self.parts[-1] == "":
471
+ return True
472
+
473
+ return "." not in self.name
420
474
 
421
475
  @property
422
476
  def connected(self) -> bool:
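The rewritten is_dir_sink() follows a fixed decision order that ends in a filename heuristic. A standalone restatement of that order as a pure function (a sketch, not part of the package API):

    def is_dir_sink_sketch(is_dir: bool, is_file: bool, parts: list, name: str) -> bool:
        if is_dir:                     # 1. known directory
            return True
        if is_file:                    # 2. known file
            return False
        if parts and parts[-1] == "":  # 3. explicit trailing separator
            return True
        return "." not in name         # 4. no extension in the name: treat as a directory sink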
@@ -443,7 +497,33 @@ class DatabricksPath:
443
497
  return self
444
498
 
445
499
  def close(self):
446
- pass
500
+ if self.temporary:
501
+ self.remove(recursive=True)
502
+
503
+ def storage_location(self) -> str:
504
+ info = self.volume_info()
505
+
506
+ if info is None:
507
+ raise NotFound(
508
+ "Volume %s not found" % repr(self)
509
+ )
510
+
511
+ _, _, _, parts = self.volume_parts()
512
+
513
+ base = info.storage_location.rstrip("/") # avoid trailing slash
514
+ return f"{base}/{'/'.join(parts)}" if parts else base
515
+
516
+
517
+ def volume_info(self) -> Optional["VolumeInfo"]:
518
+ if self._volume_info is None and self.kind == DatabricksPathKind.VOLUME:
519
+ catalog, schema, volume, _ = self.volume_parts()
520
+
521
+ if catalog and schema and volume:
522
+ self._volume_info = get_volume_metadata(
523
+ sdk=self.workspace.sdk(),
524
+ full_name="%s.%s.%s" % (catalog, schema, volume)
525
+ )
526
+ return self._volume_info
447
527
 
448
528
  def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
449
529
  """Return (catalog, schema, volume, rel_path) for volume paths.
@@ -458,8 +538,6 @@ class DatabricksPath:
458
538
  schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
459
539
  volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None
460
540
 
461
- # NOTE: rel is used as a true/false “has relative path” indicator in this file.
462
- # The runtime value is a list[str] (not PurePosixPath). Keeping it that way to avoid behavior changes.
463
541
  return catalog, schema, volume, self.parts[3:] # type: ignore[return-value]
464
542
 
465
543
  def refresh_status(self) -> "DatabricksPath":
@@ -480,34 +558,20 @@ class DatabricksPath:
480
558
  full_path = self.files_full_path()
481
559
  sdk = self.workspace.sdk()
482
560
 
483
- try:
484
- info = sdk.files.get_metadata(full_path)
485
-
486
- mtime = (
487
- dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
488
- if info.last_modified
489
- else None
490
- )
491
-
492
- return self.reset_metadata(is_file=True, is_dir=False, size=info.content_length, mtime=mtime)
493
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
494
- pass
495
-
496
- try:
497
- info = sdk.files.get_directory_metadata(full_path)
498
-
499
- if info is None:
500
- mtime = dt.datetime.now(tz=dt.timezone.utc)
501
- else:
502
- mtime = (
503
- dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
504
- if info.last_modified
505
- else None
506
- )
561
+ is_file, is_dir, size, mtime = get_volume_status(
562
+ sdk=sdk,
563
+ full_path=full_path,
564
+ check_file_first="." in self.name,
565
+ raise_error=False
566
+ )
507
567
 
508
- return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=mtime)
509
- except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
510
- pass
568
+ self.reset_metadata(
569
+ is_file=is_file,
570
+ is_dir=is_dir,
571
+ size=size,
572
+ mtime=mtime,
573
+ volume_info=self._volume_info
574
+ )
511
575
 
512
576
  return self
513
577
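get_volume_status comes from the new .volumes_path module and is not part of this diff. From the call site above, its expected contract is an (is_file, is_dir, size, mtime) tuple, with raise_error=False yielding None values instead of an exception when the path cannot be resolved. A stub capturing that assumed signature:

    from typing import Optional, Tuple

    def get_volume_status_stub(
        sdk,
        full_path: str,
        check_file_first: bool = False,
        raise_error: bool = False,
    ) -> Tuple[Optional[bool], Optional[bool], Optional[int], Optional[float]]:
        """Assumed signature only; the real implementation lives in .volumes_path."""
        ...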
 
@@ -520,15 +584,18 @@ class DatabricksPath:
520
584
  is_file = not is_dir
521
585
  size = info.size
522
586
  mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
587
+
588
+ return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
523
589
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
524
- found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
525
- size = 0
526
- mtime = found.mtime if found is not None else None
590
+ pass
527
591
 
528
- if found is None:
529
- is_file, is_dir = None, None
530
- else:
531
- is_file, is_dir = False, True
592
+ found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
593
+ size = None
594
+
595
+ if found is None:
596
+ is_file, is_dir, mtime = None, None, None
597
+ else:
598
+ is_file, is_dir, mtime = False, True, found.mtime
532
599
 
533
600
  return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
534
601
 
@@ -540,17 +607,23 @@ class DatabricksPath:
540
607
  is_file, is_dir = not info.is_dir, info.is_dir
541
608
  size = info.file_size
542
609
  mtime = info.modification_time / 1000.0 if info.modification_time else None
610
+
611
+ return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
543
612
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
544
- found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
545
- size = 0
546
- mtime = found.mtime if found is not None else None
613
+ pass
547
614
 
548
- if found is None:
549
- is_file, is_dir = None, None
550
- else:
551
- is_file, is_dir = False, True
615
+ found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
616
+ size = None
617
+ mtime = found.mtime if found is not None else None
552
618
 
553
- return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
619
+ if found is None:
620
+ is_file, is_dir = None, None
621
+ else:
622
+ is_file, is_dir = False, True
623
+
624
+ return self.reset_metadata(
625
+ is_file=is_file, is_dir=is_dir, size=size, mtime=mtime
626
+ )
554
627
 
555
628
  def reset_metadata(
556
629
  self,
@@ -558,6 +631,7 @@ class DatabricksPath:
558
631
  is_dir: Optional[bool] = None,
559
632
  size: Optional[int] = None,
560
633
  mtime: Optional[float] = None,
634
+ volume_info: Optional["VolumeInfo"] = None
561
635
  ):
562
636
  """Update cached metadata fields.
563
637
 
@@ -566,6 +640,7 @@ class DatabricksPath:
566
640
  is_dir: Optional directory flag.
567
641
  size: Optional size in bytes.
568
642
  mtime: Optional modification time in seconds.
643
+ volume_info: Optional cached VolumeInfo metadata for volume paths.
569
644
 
570
645
  Returns:
571
646
  The DatabricksPath instance.
@@ -574,10 +649,13 @@ class DatabricksPath:
574
649
  self._is_dir = is_dir
575
650
  self._size = size
576
651
  self._mtime = mtime
652
+ self._volume_info = volume_info
577
653
 
578
654
  return self
579
655
 
580
656
  # ---- API path normalization helpers ----
657
+ def full_parts(self):
658
+ return self.parts if self.parts[-1] else self.parts[:-1]
581
659
 
582
660
  def workspace_full_path(self) -> str:
583
661
  """Return the full workspace path string.
@@ -585,12 +663,7 @@ class DatabricksPath:
585
663
  Returns:
586
664
  Workspace path string.
587
665
  """
588
- if not self.parts:
589
- return "/Workspace"
590
-
591
- parts = self.parts if self.parts[-1] else self.parts[:-1]
592
-
593
- return "/Workspace/%s" % "/".join(parts)
666
+ return "/Workspace/%s" % "/".join(self.full_parts())
594
667
 
595
668
  def dbfs_full_path(self) -> str:
596
669
  """Return the full DBFS path string.
@@ -598,12 +671,7 @@ class DatabricksPath:
598
671
  Returns:
599
672
  DBFS path string.
600
673
  """
601
- if not self.parts:
602
- return "/dbfs"
603
-
604
- parts = self.parts if self.parts[-1] else self.parts[:-1]
605
-
606
- return "/dbfs/%s" % "/".join(parts)
674
+ return "/dbfs/%s" % "/".join(self.full_parts())
607
675
 
608
676
  def files_full_path(self) -> str:
609
677
  """Return the full files (volume) path string.
@@ -611,12 +679,7 @@ class DatabricksPath:
611
679
  Returns:
612
680
  Volume path string.
613
681
  """
614
- if not self.parts:
615
- return "/Volumes"
616
-
617
- parts = self.parts if self.parts[-1] else self.parts[:-1]
618
-
619
- return "/Volumes/%s" % "/".join(parts)
682
+ return "/Volumes/%s" % "/".join(self.full_parts())
620
683
 
621
684
  def exists(self, *, follow_symlinks=True) -> bool:
622
685
  """Return True if the path exists.
@@ -627,7 +690,13 @@ class DatabricksPath:
627
690
  Returns:
628
691
  True if the path exists.
629
692
  """
630
- return bool(self.is_file() or self.is_dir())
693
+ if self.is_file():
694
+ return True
695
+
696
+ elif self.is_dir():
697
+ return True
698
+
699
+ return False
631
700
 
632
701
  def mkdir(self, mode=None, parents=True, exist_ok=True):
633
702
  """Create a directory for the path.
@@ -652,33 +721,46 @@ class DatabricksPath:
652
721
  def _ensure_volume(self, exist_ok: bool = True, sdk=None):
653
722
  catalog_name, schema_name, volume_name, rel = self.volume_parts()
654
723
  sdk = self.workspace.sdk() if sdk is None else sdk
724
+ default_tags = self.workspace.default_tags()
655
725
 
656
726
  if catalog_name:
657
727
  try:
658
- sdk.catalogs.create(name=catalog_name)
728
+ sdk.catalogs.create(
729
+ name=catalog_name,
730
+ properties=default_tags,
731
+ comment="Catalog auto generated by yggdrasil"
732
+ )
659
733
  except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
660
734
  if not exist_ok:
661
735
  raise
662
736
 
663
737
  if schema_name:
664
738
  try:
665
- sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
739
+ sdk.schemas.create(
740
+ catalog_name=catalog_name,
741
+ name=schema_name,
742
+ properties=default_tags,
743
+ comment="Schema auto generated by yggdrasil"
744
+ )
666
745
  except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
667
746
  if not exist_ok:
668
747
  raise
669
748
 
670
749
  if volume_name:
671
750
  try:
672
- sdk.volumes.create(
751
+ self._volume_info = sdk.volumes.create(
673
752
  catalog_name=catalog_name,
674
753
  schema_name=schema_name,
675
754
  name=volume_name,
676
755
  volume_type=VolumeType.MANAGED,
756
+ comment="Volume auto generated by yggdrasil"
677
757
  )
678
758
  except (AlreadyExists, ResourceAlreadyExists, BadRequest):
679
759
  if not exist_ok:
680
760
  raise
681
761
 
762
+ return self._volume_info
763
+
682
764
  def make_volume_dir(self, parents=True, exist_ok=True):
683
765
  path = self.files_full_path()
684
766
  sdk = self.workspace.sdk()
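_ensure_volume() now stamps auto-created catalogs and schemas with the workspace's default tags plus a comment, and caches the VolumeInfo returned by volumes.create. A hedged sketch of the effect for a missing volume (names hypothetical):

    vol = DatabricksPath.parse("/Volumes/main/bronze/landing/")
    info = vol._ensure_volume(exist_ok=True)
    # Creates, if absent: catalog "main", schema "main.bronze", and the managed
    # volume "main.bronze.landing", then caches the returned VolumeInfo, which
    # storage_location() reuses later.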
@@ -724,7 +806,10 @@ class DatabricksPath:
724
806
 
725
807
  return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
726
808
 
727
- def remove(self, recursive: bool = True):
809
+ def remove(
810
+ self,
811
+ recursive: bool = True
812
+ ):
728
813
  """Remove the path as a file or directory.
729
814
 
730
815
  Args:
@@ -755,122 +840,182 @@ class DatabricksPath:
755
840
  return self._remove_dbfs_file()
756
841
  return self._remove_dbfs_dir(recursive=recursive)
757
842
 
758
- def rmfile(self):
843
+ def rmfile(self, allow_not_found: bool = True):
759
844
  """Remove the path as a file.
760
845
 
761
846
  Returns:
762
847
  The DatabricksPath instance.
763
848
  """
764
849
  if self.kind == DatabricksPathKind.VOLUME:
765
- return self._remove_volume_file()
850
+ self._remove_volume_file(allow_not_found=allow_not_found)
766
851
  elif self.kind == DatabricksPathKind.WORKSPACE:
767
- return self._remove_workspace_file()
852
+ self._remove_workspace_file(allow_not_found=allow_not_found)
768
853
  elif self.kind == DatabricksPathKind.DBFS:
769
- return self._remove_dbfs_file()
854
+ self._remove_dbfs_file(allow_not_found=allow_not_found)
770
855
 
771
856
  return self
772
857
 
773
- def _remove_volume_file(self):
858
+ def _remove_volume_file(self, allow_not_found: bool = True):
774
859
  sdk = self.workspace.sdk()
775
860
  try:
776
861
  sdk.files.delete(self.files_full_path())
777
862
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
778
- pass
863
+ if not allow_not_found:
864
+ raise
779
865
  finally:
780
866
  self.reset_metadata()
781
867
 
782
868
  return self
783
869
 
784
- def _remove_workspace_file(self):
870
+ def _remove_workspace_file(self, allow_not_found: bool = True):
785
871
  sdk = self.workspace.sdk()
786
872
  try:
787
873
  sdk.workspace.delete(self.workspace_full_path(), recursive=True)
788
874
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
789
- pass
875
+ if not allow_not_found:
876
+ raise
790
877
  finally:
791
878
  self.reset_metadata()
792
879
 
793
880
  return self
794
881
 
795
- def _remove_dbfs_file(self):
882
+ def _remove_dbfs_file(self, allow_not_found: bool = True):
796
883
  sdk = self.workspace.sdk()
797
884
  try:
798
885
  sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
799
886
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
800
- pass
887
+ if not allow_not_found:
888
+ raise
801
889
  finally:
802
890
  self.reset_metadata()
803
891
 
804
892
  return self
805
893
 
806
- def rmdir(self, recursive: bool = True):
894
+ def rmdir(
895
+ self,
896
+ recursive: bool = True,
897
+ allow_not_found: bool = True,
898
+ with_root: bool = True
899
+ ):
807
900
  """Remove the path as a directory.
808
901
 
809
902
  Args:
810
903
  recursive: Whether to delete directories recursively.
904
+ allow_not_found: Do not raise if the path does not exist.
905
+ with_root: Also remove the directory object itself, not just its contents.
811
906
 
812
907
  Returns:
813
908
  The DatabricksPath instance.
814
909
  """
815
910
  if self.kind == DatabricksPathKind.VOLUME:
816
- return self._remove_volume_dir(recursive=recursive)
911
+ return self._remove_volume_dir(
912
+ recursive=recursive,
913
+ allow_not_found=allow_not_found,
914
+ with_root=with_root
915
+ )
817
916
  elif self.kind == DatabricksPathKind.WORKSPACE:
818
- return self._remove_workspace_dir(recursive=recursive)
917
+ return self._remove_workspace_dir(
918
+ recursive=recursive,
919
+ allow_not_found=allow_not_found,
920
+ with_root=with_root
921
+ )
819
922
  elif self.kind == DatabricksPathKind.DBFS:
820
- return self._remove_dbfs_dir(recursive=recursive)
923
+ return self._remove_dbfs_dir(
924
+ recursive=recursive,
925
+ allow_not_found=allow_not_found,
926
+ with_root=with_root
927
+ )
821
928
 
822
- def _remove_workspace_dir(self, recursive: bool = True):
929
+ def _remove_workspace_dir(
930
+ self,
931
+ recursive: bool = True,
932
+ allow_not_found: bool = True,
933
+ with_root: bool = True
934
+ ):
823
935
  sdk = self.workspace.sdk()
936
+ full_path = self.workspace_full_path()
937
+
824
938
  try:
825
- sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
939
+ sdk.workspace.delete(full_path, recursive=recursive)
940
+
941
+ if not with_root:
942
+ sdk.workspace.mkdirs(full_path)
826
943
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
827
- pass
944
+ if not allow_not_found:
945
+ raise
828
946
  finally:
829
947
  self.reset_metadata()
830
948
 
831
949
  return self
832
950
 
833
- def _remove_dbfs_dir(self, recursive: bool = True):
951
+ def _remove_dbfs_dir(
952
+ self,
953
+ recursive: bool = True,
954
+ allow_not_found: bool = True,
955
+ with_root: bool = True
956
+ ):
834
957
  sdk = self.workspace.sdk()
958
+ full_path = self.dbfs_full_path()
959
+
835
960
  try:
836
- sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
961
+ sdk.dbfs.delete(full_path, recursive=recursive)
962
+
963
+ if not with_root:
964
+ sdk.dbfs.mkdirs(full_path)
837
965
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
838
- pass
966
+ if not allow_not_found:
967
+ raise
839
968
  finally:
840
969
  self.reset_metadata()
841
970
 
842
971
  return self
843
972
 
844
- def _remove_volume_dir(self, recursive: bool = True):
845
- root_path = self.files_full_path()
973
+ def _remove_volume_dir(
974
+ self,
975
+ recursive: bool = True,
976
+ allow_not_found: bool = True,
977
+ with_root: bool = True
978
+ ):
979
+ full_path = self.files_full_path()
846
980
  catalog_name, schema_name, volume_name, rel = self.volume_parts()
847
981
  sdk = self.workspace.sdk()
848
982
 
849
983
  if rel:
850
984
  try:
851
- sdk.files.delete_directory(root_path)
985
+ sdk.files.delete_directory(full_path)
852
986
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
853
987
  message = str(e)
988
+
854
989
  if recursive and "directory is not empty" in message:
855
990
  for child_path in self.ls():
856
991
  child_path._remove_volume_obj(recursive=True)
857
- sdk.files.delete_directory(root_path)
858
- else:
859
- pass
992
+
993
+ if with_root:
994
+ sdk.files.delete_directory(full_path)
995
+
996
+ elif not allow_not_found:
997
+ raise
860
998
  elif volume_name:
861
999
  try:
862
1000
  sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
863
1001
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
864
- pass
1002
+ if not allow_not_found:
1003
+ raise
865
1004
  elif schema_name:
866
1005
  try:
867
1006
  sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
868
1007
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
869
- pass
1008
+ if not allow_not_found:
1009
+ raise
870
1010
 
871
1011
  return self.reset_metadata()
872
1012
 
873
- def ls(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
1013
+ def ls(
1014
+ self,
1015
+ recursive: bool = False,
1016
+ fetch_size: int = None,
1017
+ allow_not_found: bool = True
1018
+ ):
874
1019
  """List directory contents for the path.
875
1020
 
876
1021
  Args:
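The rm*/rmdir changes in this hunk add allow_not_found and with_root. For workspace and DBFS targets, with_root=False deletes the tree recursively and then recreates the directory, effectively emptying it; allow_not_found=False re-raises NotFound-style errors instead of swallowing them. A hedged usage sketch (the path is hypothetical):

    logs = DatabricksPath.parse("/Workspace/Users/someone@example.com/logs")

    # Empty the directory but keep the directory object itself.
    logs.rmdir(recursive=True, with_root=False)

    # Remove it entirely, and fail loudly if it was already gone.
    logs.rmdir(recursive=True, allow_not_found=False)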
@@ -882,11 +1027,21 @@ class DatabricksPath:
882
1027
  DatabricksPath entries.
883
1028
  """
884
1029
  if self.kind == DatabricksPathKind.VOLUME:
885
- yield from self._ls_volume(recursive=recursive, fetch_size=fetch_size, allow_not_found=allow_not_found)
1030
+ yield from self._ls_volume(
1031
+ recursive=recursive,
1032
+ fetch_size=fetch_size,
1033
+ allow_not_found=allow_not_found
1034
+ )
886
1035
  elif self.kind == DatabricksPathKind.WORKSPACE:
887
- yield from self._ls_workspace(recursive=recursive, allow_not_found=allow_not_found)
1036
+ yield from self._ls_workspace(
1037
+ recursive=recursive,
1038
+ allow_not_found=allow_not_found
1039
+ )
888
1040
  elif self.kind == DatabricksPathKind.DBFS:
889
- yield from self._ls_dbfs(recursive=recursive, allow_not_found=allow_not_found)
1041
+ yield from self._ls_dbfs(
1042
+ recursive=recursive,
1043
+ allow_not_found=allow_not_found
1044
+ )
890
1045
 
891
1046
  def _ls_volume(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
892
1047
  catalog_name, schema_name, volume_name, rel = self.volume_parts()
@@ -904,6 +1059,7 @@ class DatabricksPath:
904
1059
  _is_dir=True,
905
1060
  _size=0,
906
1061
  )
1062
+
907
1063
  if recursive:
908
1064
  yield from base._ls_volume(recursive=recursive)
909
1065
  else:
@@ -1076,6 +1232,22 @@ class DatabricksPath:
1076
1232
  with self.open("wb") as f:
1077
1233
  f.write_all_bytes(data=data)
1078
1234
 
1235
+ def temporary_credentials(
1236
+ self,
1237
+ operation: Optional["PathOperation"] = None
1238
+ ):
1239
+ if self.kind != DatabricksPathKind.VOLUME:
1240
+ raise ValueError(f"Cannot generate temporary credentials for {repr(self)}")
1241
+
1242
+ sdk = self.workspace.sdk()
1243
+ client = sdk.temporary_path_credentials
1244
+ url = self.storage_location()
1245
+
1246
+ return client.generate_temporary_path_credentials(
1247
+ url=url,
1248
+ operation=operation or PathOperation.PATH_READ,
1249
+ )
1250
+
1079
1251
  # -------------------------
1080
1252
  # Data ops (Arrow / Pandas / Polars)
1081
1253
  # -------------------------
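temporary_credentials() is only valid for volume paths: it resolves the volume's storage URL via storage_location() and asks the SDK's temporary path credentials API for scoped credentials. A hedged usage sketch (the path is hypothetical; PathOperation comes from databricks.sdk.service.catalog as imported at the top of this file):

    from databricks.sdk.service.catalog import PathOperation

    raw = DatabricksPath.parse("/Volumes/main/bronze/raw/")
    creds = raw.temporary_credentials(operation=PathOperation.PATH_READ)
    # creds carries short-lived cloud credentials scoped to raw.storage_location()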
@@ -1123,6 +1295,7 @@ class DatabricksPath:
1123
1295
  with self.open("rb") as f:
1124
1296
  data = f.read_arrow_table(batch_size=batch_size, **kwargs)
1125
1297
  return data
1298
+
1126
1299
  elif self.is_dir():
1127
1300
  tables: list[pa.Table] = []
1128
1301
  for child in self.ls(recursive=True):
@@ -1139,7 +1312,7 @@ class DatabricksPath:
1139
1312
 
1140
1313
  try:
1141
1314
  return pa.concat_tables(tables)
1142
- except Exception:
1315
+ except ArrowInvalid:
1143
1316
  # Fallback: concat via polars (diagonal relaxed) then back to Arrow
1144
1317
  from polars import CompatLevel
1145
1318
 
@@ -1208,12 +1381,14 @@ class DatabricksPath:
1208
1381
 
1209
1382
  return connected
1210
1383
 
1211
- connected.open(mode="wb", clone=False).write_arrow_table(
1212
- table,
1213
- file_format=file_format,
1214
- batch_size=batch_size,
1215
- **kwargs
1216
- )
1384
+ else:
1385
+ with connected.open(mode="wb", clone=False) as f:
1386
+ f.write_arrow_table(
1387
+ table,
1388
+ file_format=file_format,
1389
+ batch_size=batch_size,
1390
+ **kwargs
1391
+ )
1217
1392
 
1218
1393
  return self
1219
1394
 
@@ -1321,9 +1496,10 @@ class DatabricksPath:
1321
1496
  """
1322
1497
  if self.is_file():
1323
1498
  with self.open("rb") as f:
1324
- return f.read_polars(batch_size=batch_size, **kwargs)
1499
+ df = f.read_polars(batch_size=batch_size, **kwargs)
1500
+ return df
1325
1501
 
1326
- if self.is_dir():
1502
+ elif self.is_dir():
1327
1503
  dfs = []
1328
1504
  for child in self.ls(recursive=True):
1329
1505
  if child.is_file():
@@ -1337,11 +1513,13 @@ class DatabricksPath:
1337
1513
  return polars.concat(dfs, how=how, rechunk=rechunk)
1338
1514
  return dfs # type: ignore[return-value]
1339
1515
 
1340
- raise FileNotFoundError(f"Path does not exist: {self}")
1516
+ else:
1517
+ raise FileNotFoundError(f"Path does not exist: {self}")
1341
1518
 
1342
1519
  def write_polars(
1343
1520
  self,
1344
1521
  df,
1522
+ file_format: Optional[FileFormat] = None,
1345
1523
  batch_size: Optional[int] = None,
1346
1524
  **kwargs
1347
1525
  ):
@@ -1356,6 +1534,7 @@ class DatabricksPath:
1356
1534
 
1357
1535
  Args:
1358
1536
  df: polars DataFrame or LazyFrame to write.
1537
+ file_format: Optional file format override.
1359
1538
  batch_size: Optional rows per part for directory sinks.
1360
1539
  **kwargs: Format-specific options.
1361
1540
 
@@ -1368,9 +1547,6 @@ class DatabricksPath:
1368
1547
  if isinstance(df, polars.LazyFrame):
1369
1548
  df = df.collect()
1370
1549
 
1371
- if not isinstance(df, polars.DataFrame):
1372
- raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")
1373
-
1374
1550
  with self.connect() as connected:
1375
1551
  if connected.is_dir_sink():
1376
1552
  seed = int(time.time() * 1000)
@@ -1380,14 +1556,23 @@ class DatabricksPath:
1380
1556
  for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
1381
1557
  part_path = connected / f"part-{i:05d}-{seed}-{_rand_str(4)}.parquet"
1382
1558
 
1383
- part_path.write_polars(chunk, **kwargs)
1384
-
1385
- return connected
1386
-
1387
- # Single file write: format/extension is handled in DatabricksIO.write_polars
1388
- connected.write_polars(df, **kwargs)
1559
+ with part_path.open(mode="wb", clone=False) as f:
1560
+ f.write_polars(
1561
+ df,
1562
+ file_format=file_format,
1563
+ batch_size=batch_size,
1564
+ **kwargs
1565
+ )
1566
+ else:
1567
+ with connected.open(mode="wb", clone=False) as f:
1568
+ f.write_polars(
1569
+ df,
1570
+ file_format=file_format,
1571
+ batch_size=batch_size,
1572
+ **kwargs
1573
+ )
1389
1574
 
1390
- return connected
1575
+ return self
1391
1576
 
1392
1577
  def sql(
1393
1578
  self,
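write_polars() now routes both the directory-sink and single-file branches through DatabricksIO handles and returns self rather than the connected path. A hedged usage sketch (the target paths are hypothetical; per the comment removed above, format selection for single files stays in DatabricksIO.write_polars unless file_format is passed):

    import polars as pl

    df = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # Directory sink: the frame is sliced into part-*.parquet files under the prefix.
    DatabricksPath.parse("/Volumes/main/bronze/events/").write_polars(df, batch_size=2)

    # Single file: the extension drives the format unless file_format overrides it.
    DatabricksPath.parse("/Volumes/main/bronze/events.parquet").write_polars(df)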
@@ -1415,7 +1600,7 @@ class DatabricksPath:
1415
1600
  if from_table not in query:
1416
1601
  raise ValueError(
1417
1602
  "SQL query must contain %s to execute query:\n%s" % (
1418
- from_table,
1603
+ repr(from_table),
1419
1604
  query
1420
1605
  )
1421
1606
  )
@@ -1423,19 +1608,26 @@ class DatabricksPath:
1423
1608
  if engine == "duckdb":
1424
1609
  import duckdb
1425
1610
 
1426
- __arrow_table__ = self.read_arrow_table()
1611
+ __arrow_dataset__ = self.arrow_dataset()
1427
1612
 
1428
1613
  return (
1429
1614
  duckdb.connect()
1430
- .execute(query=query.replace(from_table, "__arrow_table__"))
1615
+ .execute(
1616
+ query=query.replace(from_table, "__arrow_dataset__")
1617
+ )
1431
1618
  .fetch_arrow_table()
1432
1619
  )
1433
1620
  elif engine == "polars":
1434
1621
  from polars import CompatLevel
1435
1622
 
1623
+ table_name = "__dbpath__"
1624
+
1436
1625
  return (
1437
1626
  self.read_polars()
1438
- .sql(query=query.replace(from_table, "self"))
1627
+ .sql(
1628
+ query=query.replace(from_table, table_name),
1629
+ table_name=table_name
1630
+ )
1439
1631
  .to_arrow(compat_level=CompatLevel.newest())
1440
1632
  )
1441
1633
  else:
@@ -1444,23 +1636,32 @@ class DatabricksPath:
1444
1636
  )
1445
1637
 
1446
1638
 
1447
- @register_converter(DatabricksPath, pa.Table)
1448
- def databricks_path_to_arrow_table(
1449
- data: DatabricksPath,
1450
- options: Optional[CastOptions] = None,
1451
- ) -> pa.Table:
1452
- return cast_arrow_tabular(
1453
- data.read_arrow_table(),
1454
- options
1455
- )
1639
+ if databricks is not None:
1640
+ @register_converter(DatabricksPath, pa.Table)
1641
+ def databricks_path_to_arrow_table(
1642
+ data: DatabricksPath,
1643
+ options: Optional[CastOptions] = None,
1644
+ ) -> pa.Table:
1645
+ return cast_arrow_tabular(
1646
+ data.read_arrow_table(),
1647
+ options
1648
+ )
1456
1649
 
1457
1650
 
1458
- @polars_converter(DatabricksPath, PolarsDataFrame)
1459
- def databricks_path_to_polars(
1460
- data: DatabricksPath,
1461
- options: Optional[CastOptions] = None,
1462
- ) -> PolarsDataFrame:
1463
- return cast_polars_dataframe(
1464
- data.read_polars(),
1465
- options
1466
- )
1651
+ @register_converter(DatabricksPath, ds.Dataset)
1652
+ def databricks_path_to_arrow_dataset(
1653
+ data: DatabricksPath,
1654
+ options: Optional[CastOptions] = None,
1655
+ ) -> ds.Dataset:
1656
+ return data.arrow_dataset()
1657
+
1658
+
1659
+ @polars_converter(DatabricksPath, PolarsDataFrame)
1660
+ def databricks_path_to_polars(
1661
+ data: DatabricksPath,
1662
+ options: Optional[CastOptions] = None,
1663
+ ) -> PolarsDataFrame:
1664
+ return cast_polars_dataframe(
1665
+ data.read_polars(),
1666
+ options
1667
+ )
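The converters are now registered only when the databricks SDK is importable, and a Dataset converter joins the existing Table and polars ones. A hedged sketch of what the registrations enable through the cast registry (the convert call signature is an assumption inferred from the registry import at the top of the file):

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Assumed dispatch: convert(value, target_type) looks up the converter
    # registered for (DatabricksPath, target_type) above.
    table = convert(path, pa.Table)      # reads the path into an Arrow table
    dataset = convert(path, ds.Dataset)  # wraps the path as an Arrow dataset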