ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
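
The bulk of this release is the rewritten workspaces path layer: databricks_path.py is removed, and path.py, io.py, filesytem.py, and path_kind.py are added. Below is a minimal sketch of how the new DatabricksPath API appears to be used, based on the code in the diff that follows; the import path is inferred from the file layout, the catalog/schema/volume and file names are placeholders, and the calls assume a reachable Databricks workspace with valid credentials.

    from yggdrasil.databricks.workspaces.path import DatabricksPath

    # Parse a string into a typed path; the first component selects the kind
    # ("dbfs", "workspace", or "volumes").
    root = DatabricksPath.parse("/Volumes/my_catalog/my_schema/my_volume/data")

    # Paths compose with "/" like pathlib objects.
    part = root / "part-00000.parquet"

    # Directory creation, listing, and existence checks.
    root.mkdir(parents=True, exist_ok=True)
    for entry in root.ls(recursive=True):
        print(entry.full_path(), entry.is_file())

    # Tabular I/O goes through the Arrow / Polars helpers.
    table = root.read_arrow_table()
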
yggdrasil/databricks/workspaces/path.py (new file)
@@ -0,0 +1,1415 @@
1
+ """Databricks path abstraction spanning DBFS, workspace, and volumes."""
2
+
3
+ # src/yggdrasil/databricks/workspaces/databricks_path.py
4
+ from __future__ import annotations
5
+
6
+ import dataclasses
7
+ import datetime as dt
8
+ import random
9
+ import string
10
+ import time
11
+ from pathlib import PurePosixPath
12
+ from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable
13
+
14
+ import pyarrow as pa
15
+ from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
16
+ from pyarrow.fs import FileInfo, FileType, FileSystem
17
+ import pyarrow.dataset as ds
18
+
19
+ from .io import DatabricksIO
20
+ from .path_kind import DatabricksPathKind
21
+ from ...libs.databrickslib import databricks
22
+ from ...types import cast_arrow_tabular, cast_polars_dataframe
23
+ from ...types.cast.cast_options import CastOptions
24
+ from ...types.cast.polars_cast import polars_converter
25
+ from ...types.cast.polars_pandas_cast import PolarsDataFrame
26
+ from ...types.cast.registry import convert, register_converter
27
+
28
+ if databricks is not None:
29
+ from databricks.sdk.service.catalog import VolumeType
30
+ from databricks.sdk.service.workspace import ObjectType
31
+ from databricks.sdk.errors.platform import (
32
+ NotFound,
33
+ ResourceDoesNotExist,
34
+ BadRequest,
35
+ PermissionDenied,
36
+ AlreadyExists,
37
+ ResourceAlreadyExists,
38
+ )
39
+
40
+ NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
41
+ ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest
42
+
43
+ if TYPE_CHECKING:
44
+ from .workspace import Workspace
45
+
46
+
47
+ __all__ = [
48
+ "DatabricksPathKind",
49
+ "DatabricksPath",
50
+ ]
51
+
52
+
53
+ def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
54
+ """Normalize path parts by splitting on '/' and removing empties.
55
+
56
+ Args:
57
+ parts: String or list of path parts.
58
+
59
+ Returns:
60
+ A flattened list of path components.
61
+ """
62
+ if isinstance(parts, str):
63
+ parts = [parts]
64
+
65
+ if any("/" in part for part in parts):
66
+ new_parts: list[str] = []
67
+
68
+ for part in parts:
69
+ new_parts.extend(_ for _ in part.split("/") if _)
70
+
71
+ parts = new_parts
72
+
73
+ return parts
74
+
75
+
76
+ def _rand_str(n: int) -> str:
77
+ """Return a random alphanumeric string of length ``n``.
78
+
79
+ Args:
80
+ n: Length of the random string.
81
+
82
+ Returns:
83
+ Random alphanumeric string.
84
+ """
85
+ alphabet = string.ascii_letters + string.digits
86
+ return "".join(random.choices(alphabet, k=n))
87
+
88
+
89
+ @dataclasses.dataclass
90
+ class DatabricksPath:
91
+ """Path wrapper for Databricks workspace, volumes, and DBFS objects."""
92
+ kind: DatabricksPathKind
93
+ parts: List[str]
94
+
95
+ _workspace: Optional["Workspace"] = None
96
+
97
+ _is_file: Optional[bool] = None
98
+ _is_dir: Optional[bool] = None
99
+ _size: Optional[int] = None
100
+ _mtime: Optional[float] = None
101
+
102
+ def clone_instance(
103
+ self,
104
+ *,
105
+ kind: Optional["DatabricksPathKind"] = None,
106
+ parts: Optional[List[str]] = None,
107
+ workspace: Optional["Workspace"] = dataclasses.MISSING,
108
+ is_file: Optional[bool] = dataclasses.MISSING,
109
+ is_dir: Optional[bool] = dataclasses.MISSING,
110
+ size: Optional[int] = dataclasses.MISSING,
111
+ mtime: Optional[float] = dataclasses.MISSING,
112
+ ) -> "DatabricksPath":
113
+ """
114
+ Return a copy of this DatabricksPath, optionally overriding fields.
115
+ Uses dataclasses.replace semantics but lets you intentionally override
116
+ cached metadata (or keep it as-is by default).
117
+ """
118
+ return dataclasses.replace(
119
+ self,
120
+ kind=self.kind if kind is None else kind,
121
+ parts=list(self.parts) if parts is None else list(parts),
122
+ _workspace=self._workspace if workspace is dataclasses.MISSING else workspace,
123
+ _is_file=self._is_file if is_file is dataclasses.MISSING else is_file,
124
+ _is_dir=self._is_dir if is_dir is dataclasses.MISSING else is_dir,
125
+ _size=self._size if size is dataclasses.MISSING else size,
126
+ _mtime=self._mtime if mtime is dataclasses.MISSING else mtime,
127
+ )
128
+
129
+ @classmethod
130
+ def parse(
131
+ cls,
132
+ obj: Union["DatabricksPath", str, List[str]],
133
+ workspace: Optional["Workspace"] = None,
134
+ ) -> "DatabricksPath":
135
+ """Parse input into a DatabricksPath instance.
136
+
137
+ Args:
138
+ obj: Input path, DatabricksPath, or path parts list.
139
+ workspace: Optional Workspace to bind to the path.
140
+
141
+ Returns:
142
+ A DatabricksPath instance.
143
+ """
144
+ if not obj:
145
+ return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
146
+
147
+ if not isinstance(obj, (str, list)):
148
+ if isinstance(obj, DatabricksPath):
149
+ if workspace is not None and obj._workspace is None:
150
+ obj._workspace = workspace
151
+ return obj
152
+
153
+ from .io import DatabricksIO
154
+
155
+ if isinstance(obj, DatabricksIO):
156
+ return obj.path
157
+
158
+ if not isinstance(obj, Iterable):
159
+ obj = str(obj)
160
+
161
+ obj = _flatten_parts(obj)
162
+
163
+ if obj and not obj[0]:
164
+ obj = obj[1:]
165
+
166
+ if not obj:
167
+ return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
168
+
169
+ head, *tail = obj
170
+ head = head.casefold()
171
+
172
+ if head == "dbfs":
173
+ kind = DatabricksPathKind.DBFS
174
+ elif head == "workspace":
175
+ kind = DatabricksPathKind.WORKSPACE
176
+ elif head == "volumes":
177
+ kind = DatabricksPathKind.VOLUME
178
+ else:
179
+ raise ValueError(f"Invalid DatabricksPath head {head!r} from {obj!r}, must be in ['dbfs', 'workspace', 'volumes']")
180
+
181
+ return DatabricksPath(kind=kind, parts=tail, _workspace=workspace)
182
+
183
+ def __hash__(self):
184
+ return hash(self.full_path())
185
+
186
+ def __eq__(self, other):
187
+ if not isinstance(other, DatabricksPath):
188
+ if isinstance(other, str):
189
+ return str(self) == other
190
+ return False
191
+ return self.kind == other.kind and self.parts == other.parts
192
+
193
+ def __truediv__(self, other):
194
+ if not other:
195
+ return self
196
+
197
+ other_parts = _flatten_parts(other)
198
+
199
+ return DatabricksPath(
200
+ kind=self.kind,
201
+ parts=self.parts + other_parts,
202
+ _workspace=self._workspace,
203
+ )
204
+
205
+ def __enter__(self):
206
+ return self.connect(clone=False)
207
+
208
+ def __exit__(self, exc_type, exc_val, exc_tb):
209
+ if self._workspace is not None:
210
+ self._workspace.__exit__(exc_type, exc_val, exc_tb)
211
+
212
+ def __str__(self):
213
+ return self.full_path()
214
+
215
+ def __repr__(self):
216
+ return self.url()
217
+
218
+ def __fspath__(self):
219
+ return self.full_path()
220
+
221
+ def url(self):
222
+ return "dbfs://%s" % self.full_path()
223
+
224
+ def full_path(self) -> str:
225
+ """Return the fully qualified path for this namespace.
226
+
227
+ Returns:
228
+ The fully qualified path string.
229
+ """
230
+ if self.kind == DatabricksPathKind.DBFS:
231
+ return self.dbfs_full_path()
232
+ elif self.kind == DatabricksPathKind.WORKSPACE:
233
+ return self.workspace_full_path()
234
+ elif self.kind == DatabricksPathKind.VOLUME:
235
+ return self.files_full_path()
236
+ else:
237
+ raise ValueError(f"Unknown DatabricksPath kind: {self.kind!r}")
238
+
239
+ def filesystem(self, workspace: Optional["Workspace"] = None):
240
+ """Return a PyArrow filesystem adapter for this workspace.
241
+
242
+ Args:
243
+ workspace: Optional workspace override.
244
+
245
+ Returns:
246
+ A PyArrow FileSystem instance.
247
+ """
248
+ return self.workspace.filesytem(workspace=workspace)
249
+
250
+ @property
251
+ def parent(self):
252
+ """Return the parent path.
253
+
254
+ Returns:
255
+ A DatabricksPath representing the parent.
256
+ """
257
+ if not self.parts:
258
+ return self
259
+
260
+ if self._is_file is not None or self._is_dir is not None:
261
+ _is_file, _is_dir = False, True
262
+ else:
263
+ _is_file, _is_dir = None, None
264
+
265
+ return DatabricksPath(
266
+ kind=self.kind,
267
+ parts=self.parts[:-1],
268
+ _workspace=self._workspace,
269
+ _is_file=_is_file,
270
+ _is_dir=_is_dir,
271
+ )
272
+
273
+ @property
274
+ def workspace(self):
275
+ """Return the associated Workspace instance.
276
+
277
+ Returns:
278
+ The Workspace associated with this path.
279
+ """
280
+ if self._workspace is None:
281
+ from .workspace import Workspace
282
+
283
+ return Workspace()
284
+ return self._workspace
285
+
286
+ @workspace.setter
287
+ def workspace(self, value):
288
+ self._workspace = value
289
+
290
+ @property
291
+ def name(self) -> str:
292
+ """Return the final path component.
293
+
294
+ Returns:
295
+ The final path name component.
296
+ """
297
+ if not self.parts:
298
+ return ""
299
+
300
+ if len(self.parts) == 1:
301
+ return self.parts[-1]
302
+
303
+ return self.parts[-1] if self.parts[-1] else self.parts[-2]
304
+
305
+ @property
306
+ def extension(self) -> str:
307
+ """Return the file extension for the path, if any.
308
+
309
+ Returns:
310
+ The file extension without leading dot.
311
+ """
312
+ name = self.name
313
+ if "." in name:
314
+ return name.split(".")[-1]
315
+ return ""
316
+
317
+ @property
318
+ def file_format(self) -> FileFormat:
319
+ """Infer the file format from the file extension.
320
+
321
+ Returns:
322
+ A PyArrow FileFormat instance.
323
+ """
324
+ ext = self.extension
325
+
326
+ if ext == "parquet":
327
+ return ParquetFileFormat()
328
+ elif ext == "csv":
329
+ return CsvFileFormat()
330
+ elif ext == "json":
331
+ return JsonFileFormat()
332
+ else:
333
+ raise ValueError(
334
+ "Cannot get file format from extension %s" % ext
335
+ )
336
+
337
+ @property
338
+ def content_length(self):
339
+ """Return the size of the path in bytes if known.
340
+
341
+ Returns:
342
+ The size in bytes.
343
+ """
344
+ if self._size is None:
345
+ self.refresh_status()
346
+ return self._size
347
+
348
+ @content_length.setter
349
+ def content_length(self, value: int):
350
+ self._size = value
351
+
352
+ @property
353
+ def mtime(self) -> Optional[float]:
354
+ """Return the last-modified time for the path.
355
+
356
+ Returns:
357
+ Last-modified timestamp in seconds.
358
+ """
359
+ if self._mtime is None:
360
+ self.refresh_status()
361
+ return self._mtime
362
+
363
+ @mtime.setter
364
+ def mtime(self, value: float):
365
+ if not isinstance(value, float):
366
+ if isinstance(value, dt.datetime):
367
+ value = value.timestamp()
368
+ elif isinstance(value, str):
369
+ value = dt.datetime.fromisoformat(value).timestamp()
370
+ else:
371
+ value = float(value)
372
+ self._mtime = value
373
+
374
+ @property
375
+ def file_type(self):
376
+ if self.is_file():
377
+ return FileType.File
378
+ elif self.is_dir():
379
+ return FileType.Directory
380
+ else:
381
+ return FileType.NotFound
382
+
383
+ @property
384
+ def file_info(self):
385
+ return FileInfo(
386
+ path=self.full_path(),
387
+ type=self.file_type,
388
+ mtime=self.mtime,
389
+ size=self.content_length,
390
+ )
391
+
392
+ def is_file(self):
393
+ """Return True when the path is a file.
394
+
395
+ Returns:
396
+ True if the path is a file.
397
+ """
398
+ if self._is_file is None:
399
+ self.refresh_status()
400
+ return self._is_file
401
+
402
+ def is_dir(self):
403
+ """Return True when the path is a directory.
404
+
405
+ Returns:
406
+ True if the path is a directory.
407
+ """
408
+ if self._is_dir is None:
409
+ self.refresh_status()
410
+ return self._is_dir
411
+
412
+ def is_dir_sink(self):
413
+ """Return True if the path represents a directory sink.
414
+
415
+ Returns:
416
+ True if the path represents a directory sink.
417
+ """
418
+ return self.is_dir() or (self.parts and self.parts[-1] == "")
419
+
420
+ @property
421
+ def connected(self) -> bool:
422
+ return self._workspace is not None and self._workspace.connected
423
+
424
+ def connect(self, clone: bool = False) -> "DatabricksPath":
425
+ """Connect the path to its workspace, optionally returning a clone.
426
+
427
+ Args:
428
+ clone: Whether to return a cloned instance.
429
+
430
+ Returns:
431
+ The connected DatabricksPath.
432
+ """
433
+ workspace = self.workspace.connect(clone=clone)
434
+
435
+ if clone:
436
+ return self.clone_instance(
437
+ workspace=workspace
438
+ )
439
+
440
+ self._workspace = workspace
441
+
442
+ return self
443
+
444
+ def close(self):
445
+ pass
446
+
447
+ def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
448
+ """Return (catalog, schema, volume, rel_path) for volume paths.
449
+
450
+ Returns:
451
+ Tuple of (catalog, schema, volume, rel_path).
452
+ """
453
+ if self.kind != DatabricksPathKind.VOLUME:
454
+ return None, None, None, None
455
+
456
+ catalog = self.parts[0] if len(self.parts) > 0 and self.parts[0] else None
457
+ schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
458
+ volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None
459
+
460
+ # NOTE: rel is used as a true/false “has relative path” indicator in this file.
461
+ # The runtime value is a list[str] (not PurePosixPath). Keeping it that way to avoid behavior changes.
462
+ return catalog, schema, volume, self.parts[3:] # type: ignore[return-value]
463
+
464
+ def refresh_status(self) -> "DatabricksPath":
465
+ """Refresh cached metadata for the path.
466
+
467
+ Returns:
468
+ The DatabricksPath instance.
469
+ """
470
+ if self.kind == DatabricksPathKind.VOLUME:
471
+ self._refresh_volume_status()
472
+ elif self.kind == DatabricksPathKind.WORKSPACE:
473
+ self._refresh_workspace_status()
474
+ elif self.kind == DatabricksPathKind.DBFS:
475
+ self._refresh_dbfs_status()
476
+ return self
477
+
478
+ def _refresh_volume_status(self):
479
+ full_path = self.files_full_path()
480
+ sdk = self.workspace.sdk()
481
+
482
+ try:
483
+ info = sdk.files.get_metadata(full_path)
484
+
485
+ mtime = (
486
+ dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
487
+ if info.last_modified
488
+ else None
489
+ )
490
+
491
+ return self.reset_metadata(is_file=True, is_dir=False, size=info.content_length, mtime=mtime)
492
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
493
+ pass
494
+
495
+ try:
496
+ info = sdk.files.get_directory_metadata(full_path)
497
+ mtime = (
498
+ dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
499
+ if info.last_modified
500
+ else None
501
+ )
502
+
503
+ return self.reset_metadata(is_file=False, is_dir=True, size=info, mtime=mtime)
504
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
505
+ pass
506
+
507
+ return self
508
+
509
+ def _refresh_workspace_status(self):
510
+ sdk = self.workspace.sdk()
511
+
512
+ try:
513
+ info = sdk.workspace.get_status(self.workspace_full_path())
514
+ is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
515
+ is_file = not is_dir
516
+ size = info.size
517
+ mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
518
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
519
+ found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
520
+ size = 0
521
+ mtime = found.mtime if found is not None else None
522
+
523
+ if found is None:
524
+ is_file, is_dir = None, None
525
+ else:
526
+ is_file, is_dir = False, True
527
+
528
+ return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
529
+
530
+ def _refresh_dbfs_status(self):
531
+ sdk = self.workspace.sdk()
532
+
533
+ try:
534
+ info = sdk.dbfs.get_status(self.dbfs_full_path())
535
+ is_file, is_dir = not info.is_dir, info.is_dir
536
+ size = info.file_size
537
+ mtime = info.modification_time / 1000.0 if info.modification_time else None
538
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
539
+ found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
540
+ size = 0
541
+ mtime = found.mtime if found is not None else None
542
+
543
+ if found is None:
544
+ is_file, is_dir = None, None
545
+ else:
546
+ is_file, is_dir = False, True
547
+
548
+ return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
549
+
550
+ def reset_metadata(
551
+ self,
552
+ is_file: Optional[bool] = None,
553
+ is_dir: Optional[bool] = None,
554
+ size: Optional[int] = None,
555
+ mtime: Optional[float] = None,
556
+ ):
557
+ """Update cached metadata fields.
558
+
559
+ Args:
560
+ is_file: Optional file flag.
561
+ is_dir: Optional directory flag.
562
+ size: Optional size in bytes.
563
+ mtime: Optional modification time in seconds.
564
+
565
+ Returns:
566
+ The DatabricksPath instance.
567
+ """
568
+ self._is_file = is_file
569
+ self._is_dir = is_dir
570
+ self._size = size
571
+ self._mtime = mtime
572
+
573
+ return self
574
+
575
+ # ---- API path normalization helpers ----
576
+
577
+ def workspace_full_path(self) -> str:
578
+ """Return the full workspace path string.
579
+
580
+ Returns:
581
+ Workspace path string.
582
+ """
583
+ if not self.parts:
584
+ return "/Workspace"
585
+
586
+ parts = self.parts if self.parts[-1] else self.parts[:-1]
587
+
588
+ return "/Workspace/%s" % "/".join(parts)
589
+
590
+ def dbfs_full_path(self) -> str:
591
+ """Return the full DBFS path string.
592
+
593
+ Returns:
594
+ DBFS path string.
595
+ """
596
+ if not self.parts:
597
+ return "/dbfs"
598
+
599
+ parts = self.parts if self.parts[-1] else self.parts[:-1]
600
+
601
+ return "/dbfs/%s" % "/".join(parts)
602
+
603
+ def files_full_path(self) -> str:
604
+ """Return the full files (volume) path string.
605
+
606
+ Returns:
607
+ Volume path string.
608
+ """
609
+ if not self.parts:
610
+ return "/Volumes"
611
+
612
+ parts = self.parts if self.parts[-1] else self.parts[:-1]
613
+
614
+ return "/Volumes/%s" % "/".join(parts)
615
+
616
+ def exists(self, *, follow_symlinks=True) -> bool:
617
+ """Return True if the path exists.
618
+
619
+ Args:
620
+ follow_symlinks: Unused; for compatibility.
621
+
622
+ Returns:
623
+ True if the path exists.
624
+ """
625
+ return bool(self.is_file() or self.is_dir())
626
+
627
+ def mkdir(self, mode=None, parents=True, exist_ok=True):
628
+ """Create a directory for the path.
629
+
630
+ Args:
631
+ mode: Optional mode (unused).
632
+ parents: Whether to create parent directories.
633
+ exist_ok: Whether to ignore existing directories.
634
+
635
+ Returns:
636
+ The DatabricksPath instance.
637
+ """
638
+ try:
639
+ if self.kind == DatabricksPathKind.WORKSPACE:
640
+ self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
641
+ elif self.kind == DatabricksPathKind.VOLUME:
642
+ self.make_volume_dir(parents=parents, exist_ok=exist_ok)
643
+ elif self.kind == DatabricksPathKind.DBFS:
644
+ self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
645
+ except (NotFound, ResourceDoesNotExist):
646
+ if not parents or self.parent == self:
647
+ raise
648
+
649
+ self.parent.mkdir(parents=True, exist_ok=True)
650
+ self.mkdir(parents=False, exist_ok=exist_ok)
651
+ except (AlreadyExists, ResourceAlreadyExists):
652
+ if not exist_ok:
653
+ raise
654
+
655
+ return self
656
+
657
+ def _ensure_volume(self, exist_ok: bool = True, sdk=None):
658
+ catalog_name, schema_name, volume_name, rel = self.volume_parts()
659
+ sdk = self.workspace.sdk() if sdk is None else sdk
660
+
661
+ if catalog_name:
662
+ try:
663
+ sdk.catalogs.create(name=catalog_name)
664
+ except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
665
+ if not exist_ok:
666
+ raise
667
+
668
+ if schema_name:
669
+ try:
670
+ sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
671
+ except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
672
+ if not exist_ok:
673
+ raise
674
+
675
+ if volume_name:
676
+ try:
677
+ sdk.volumes.create(
678
+ catalog_name=catalog_name,
679
+ schema_name=schema_name,
680
+ name=volume_name,
681
+ volume_type=VolumeType.MANAGED,
682
+ )
683
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
684
+ if not exist_ok:
685
+ raise
686
+
687
+ def make_volume_dir(self, parents=True, exist_ok=True):
688
+ path = self.files_full_path()
689
+ sdk = self.workspace.sdk()
690
+
691
+ try:
692
+ sdk.files.create_directory(path)
693
+ except (BadRequest, NotFound, ResourceDoesNotExist) as e:
694
+ if not parents:
695
+ raise
696
+
697
+ message = str(e)
698
+ if "not exist" in message:
699
+ self._ensure_volume(sdk=sdk)
700
+
701
+ sdk.files.create_directory(path)
702
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
703
+ if not exist_ok:
704
+ raise
705
+
706
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
707
+
708
+ def make_workspace_dir(self, parents=True, exist_ok=True):
709
+ path = self.workspace_full_path()
710
+ sdk = self.workspace.sdk()
711
+
712
+ try:
713
+ sdk.workspace.mkdirs(path)
714
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
715
+ if not exist_ok:
716
+ raise
717
+
718
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
719
+
720
+ def make_dbfs_dir(self, parents=True, exist_ok=True):
721
+ path = self.dbfs_full_path()
722
+ sdk = self.workspace.sdk()
723
+
724
+ try:
725
+ sdk.dbfs.mkdirs(path)
726
+ except (AlreadyExists, ResourceAlreadyExists, BadRequest):
727
+ if not exist_ok:
728
+ raise
729
+
730
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
731
+
732
+ def remove(self, recursive: bool = True):
733
+ """Remove the path as a file or directory.
734
+
735
+ Args:
736
+ recursive: Whether to delete directories recursively.
737
+
738
+ Returns:
739
+ The DatabricksPath instance.
740
+ """
741
+ if self.kind == DatabricksPathKind.VOLUME:
742
+ return self._remove_volume_obj(recursive=recursive)
743
+ elif self.kind == DatabricksPathKind.WORKSPACE:
744
+ return self._remove_workspace_obj(recursive=recursive)
745
+ elif self.kind == DatabricksPathKind.DBFS:
746
+ return self._remove_dbfs_obj(recursive=recursive)
747
+
748
+ def _remove_volume_obj(self, recursive: bool = True):
749
+ if self.is_file():
750
+ return self._remove_volume_file()
751
+ return self._remove_volume_dir(recursive=recursive)
752
+
753
+ def _remove_workspace_obj(self, recursive: bool = True):
754
+ if self.is_file():
755
+ return self._remove_workspace_file()
756
+ return self._remove_workspace_dir(recursive=recursive)
757
+
758
+ def _remove_dbfs_obj(self, recursive: bool = True):
759
+ if self.is_file():
760
+ return self._remove_dbfs_file()
761
+ return self._remove_dbfs_dir(recursive=recursive)
762
+
763
+ def rmfile(self):
764
+ """Remove the path as a file.
765
+
766
+ Returns:
767
+ The DatabricksPath instance.
768
+ """
769
+ try:
770
+ if self.kind == DatabricksPathKind.VOLUME:
771
+ return self._remove_volume_file()
772
+ elif self.kind == DatabricksPathKind.WORKSPACE:
773
+ return self._remove_workspace_file()
774
+ elif self.kind == DatabricksPathKind.DBFS:
775
+ return self._remove_dbfs_file()
776
+ finally:
777
+ self.reset_metadata()
778
+ return self
779
+
780
+ def _remove_volume_file(self):
781
+ sdk = self.workspace.sdk()
782
+ try:
783
+ sdk.files.delete(self.files_full_path())
784
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
785
+ pass
786
+ return self
787
+
788
+ def _remove_workspace_file(self):
789
+ sdk = self.workspace.sdk()
790
+ try:
791
+ sdk.workspace.delete(self.workspace_full_path(), recursive=True)
792
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
793
+ pass
794
+ return self
795
+
796
+ def _remove_dbfs_file(self):
797
+ sdk = self.workspace.sdk()
798
+ try:
799
+ sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
800
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
801
+ pass
802
+ return self
803
+
804
+ def rmdir(self, recursive: bool = True):
805
+ """Remove the path as a directory.
806
+
807
+ Args:
808
+ recursive: Whether to delete directories recursively.
809
+
810
+ Returns:
811
+ The DatabricksPath instance.
812
+ """
813
+ if self.kind == DatabricksPathKind.VOLUME:
814
+ return self._remove_volume_dir(recursive=recursive)
815
+ elif self.kind == DatabricksPathKind.WORKSPACE:
816
+ return self._remove_workspace_dir(recursive=recursive)
817
+ elif self.kind == DatabricksPathKind.DBFS:
818
+ return self._remove_dbfs_dir(recursive=recursive)
819
+
820
+ def _remove_workspace_dir(self, recursive: bool = True):
821
+ sdk = self.workspace.sdk()
822
+ try:
823
+ sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
824
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
825
+ pass
826
+ self.reset_metadata()
827
+ return self
828
+
829
+ def _remove_dbfs_dir(self, recursive: bool = True):
830
+ sdk = self.workspace.sdk()
831
+ try:
832
+ sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
833
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
834
+ pass
835
+ self.reset_metadata()
836
+ return self
837
+
838
+ def _remove_volume_dir(self, recursive: bool = True):
839
+ root_path = self.files_full_path()
840
+ catalog_name, schema_name, volume_name, rel = self.volume_parts()
841
+ sdk = self.workspace.sdk()
842
+
843
+ if rel:
844
+ try:
845
+ sdk.files.delete_directory(root_path)
846
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
847
+ message = str(e)
848
+ if recursive and "directory is not empty" in message:
849
+ for child_path in self.ls():
850
+ child_path._remove_volume_obj(recursive=True)
851
+ sdk.files.delete_directory(root_path)
852
+ else:
853
+ pass
854
+ elif volume_name:
855
+ try:
856
+ sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
857
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
858
+ pass
859
+ elif schema_name:
860
+ try:
861
+ sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
862
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
863
+ pass
864
+
865
+ return self.reset_metadata()
866
+
867
+ def ls(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
868
+ """List directory contents for the path.
869
+
870
+ Args:
871
+ recursive: Whether to recurse into subdirectories.
872
+ fetch_size: Optional page size for listings.
873
+ allow_not_found: Whether to suppress missing-path errors.
874
+
875
+ Yields:
876
+ DatabricksPath entries.
877
+ """
878
+ if self.kind == DatabricksPathKind.VOLUME:
879
+ yield from self._ls_volume(recursive=recursive, fetch_size=fetch_size, allow_not_found=allow_not_found)
880
+ elif self.kind == DatabricksPathKind.WORKSPACE:
881
+ yield from self._ls_workspace(recursive=recursive, allow_not_found=allow_not_found)
882
+ elif self.kind == DatabricksPathKind.DBFS:
883
+ yield from self._ls_dbfs(recursive=recursive, allow_not_found=allow_not_found)
884
+
885
+ def _ls_volume(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
886
+ catalog_name, schema_name, volume_name, rel = self.volume_parts()
887
+ sdk = self.workspace.sdk()
888
+
889
+ if rel is None:
890
+ if volume_name is None:
891
+ try:
892
+ for info in sdk.volumes.list(catalog_name=catalog_name, schema_name=schema_name):
893
+ base = DatabricksPath(
894
+ kind=DatabricksPathKind.VOLUME,
895
+ parts=[info.catalog_name, info.schema_name, info.name],
896
+ _workspace=self.workspace,
897
+ _is_file=False,
898
+ _is_dir=True,
899
+ _size=0,
900
+ )
901
+ if recursive:
902
+ yield from base._ls_volume(recursive=recursive)
903
+ else:
904
+ yield base
905
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
906
+ if not allow_not_found:
907
+ raise
908
+ elif schema_name is None:
909
+ try:
910
+ for info in sdk.schemas.list(catalog_name=catalog_name):
911
+ base = DatabricksPath(
912
+ kind=DatabricksPathKind.VOLUME,
913
+ parts=[info.catalog_name, info.name],
914
+ _workspace=self.workspace,
915
+ _is_file=False,
916
+ _is_dir=True,
917
+ _size=0,
918
+ )
919
+ if recursive:
920
+ yield from base._ls_volume(recursive=recursive)
921
+ else:
922
+ yield base
923
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
924
+ if not allow_not_found:
925
+ raise
926
+ else:
927
+ try:
928
+ for info in sdk.catalogs.list():
929
+ base = DatabricksPath(
930
+ kind=DatabricksPathKind.VOLUME,
931
+ parts=[info.name],
932
+ _workspace=self.workspace,
933
+ _is_file=False,
934
+ _is_dir=True,
935
+ _size=0,
936
+ )
937
+ if recursive:
938
+ yield from base._ls_volume(recursive=recursive)
939
+ else:
940
+ yield base
941
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
942
+ if not allow_not_found:
943
+ raise
944
+ else:
945
+ full_path = self.files_full_path()
946
+
947
+ try:
948
+ for info in sdk.files.list_directory_contents(full_path, page_size=fetch_size):
949
+ base = DatabricksPath(
950
+ kind=DatabricksPathKind.VOLUME,
951
+ parts=info.path.split("/")[2:],
952
+ _workspace=self.workspace,
953
+ _is_file=not info.is_directory,
954
+ _is_dir=info.is_directory,
955
+ _size=info.file_size,
956
+ )
957
+
958
+ if recursive and info.is_directory:
959
+ yield from base._ls_volume(recursive=recursive)
960
+ else:
961
+ yield base
962
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
963
+ if not allow_not_found:
964
+ raise
965
+
966
+ def _ls_workspace(self, recursive: bool = True, allow_not_found: bool = True):
967
+ sdk = self.workspace.sdk()
968
+ full_path = self.workspace_full_path()
969
+
970
+ try:
971
+ for info in sdk.workspace.list(full_path, recursive=recursive):
972
+ is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
973
+ yield DatabricksPath(
974
+ kind=DatabricksPathKind.WORKSPACE,
975
+ parts=info.path.split("/")[2:],
976
+ _workspace=self.workspace,
977
+ _is_file=not is_dir,
978
+ _is_dir=is_dir,
979
+ _size=info.size,
980
+ )
981
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
982
+ if not allow_not_found:
983
+ raise
984
+
985
+ def _ls_dbfs(self, recursive: bool = True, allow_not_found: bool = True):
986
+ sdk = self.workspace.sdk()
987
+ full_path = self.dbfs_full_path()
988
+
989
+ try:
990
+ for info in sdk.dbfs.list(full_path, recursive=recursive):
991
+ yield DatabricksPath(
992
+ kind=DatabricksPathKind.DBFS,
993
+ parts=info.path.split("/")[2:],
994
+ _workspace=self.workspace,
995
+ _is_file=not info.is_dir,
996
+ _is_dir=info.is_dir,
997
+ _size=info.file_size,
998
+ )
999
+ except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
1000
+ if not allow_not_found:
1001
+ raise
1002
+
1003
+ def open(
1004
+ self,
1005
+ mode="rb",
1006
+ encoding=None,
1007
+ clone: bool = False,
1008
+ ) -> DatabricksIO:
1009
+ """Open the path as a DatabricksIO instance.
1010
+
1011
+ Args:
1012
+ mode: File mode string.
1013
+ encoding: Optional text encoding.
1014
+ clone: Whether to return a cloned path instance.
1015
+
1016
+ Returns:
1017
+ A DatabricksIO instance.
1018
+ """
1019
+ path = self.connect(clone=clone)
1020
+
1021
+ return (
1022
+ DatabricksIO
1023
+ .create_instance(path=path, mode=mode, encoding=encoding)
1024
+ .connect(clone=False)
1025
+ )
1026
+
1027
+ def copy_to(
1028
+ self,
1029
+ dest: Union["DatabricksIO", "DatabricksPath", str],
1030
+ allow_not_found: bool = True,
1031
+ ) -> None:
1032
+ """Copy this path to another path or IO destination.
1033
+
1034
+ Args:
1035
+ dest: Destination IO, DatabricksPath, or path string.
1036
+ allow_not_found: Whether to suppress missing-path errors.
1037
+
1038
+ Returns:
1039
+ None.
1040
+ """
1041
+ if self.is_file() and dest.is_file():
1042
+ with self.open(mode="rb") as src:
1043
+ src.copy_to(dest=dest)
1044
+
1045
+ elif self.is_dir():
1046
+ dest_base = self.parse(obj=dest, workspace=self.workspace if dest._workspace is None else dest._workspace)
1047
+ dest_base.mkdir(parents=True, exist_ok=True)
1048
+
1049
+ skip_base_parts = len(self.parts)
1050
+
1051
+ for src_child in self.ls(recursive=True, allow_not_found=True):
1052
+ src_child: DatabricksPath = src_child
1053
+ dest_child_parts = dest_base.parts + src_child.parts[skip_base_parts:]
1054
+
1055
+ src_child.copy_to(
1056
+ dest=dest.clone_instance(parts=dest_child_parts),
1057
+ allow_not_found=allow_not_found
1058
+ )
1059
+
1060
+ elif not allow_not_found:
1061
+ return None
1062
+
1063
+ else:
1064
+ raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")
1065
+
1066
+ # -------------------------
1067
+ # Data ops (Arrow / Pandas / Polars)
1068
+ # -------------------------
1069
+ def arrow_dataset(
1070
+ self,
1071
+ workspace: Optional["Workspace"] = None,
1072
+ filesystem: Optional[FileSystem] = None,
1073
+ **kwargs
1074
+ ):
1075
+ """Return a PyArrow dataset referencing this path.
1076
+
1077
+ Args:
1078
+ workspace: Optional workspace override.
1079
+ filesystem: Optional filesystem override.
1080
+ **kwargs: Dataset options.
1081
+
1082
+ Returns:
1083
+ A PyArrow Dataset instance.
1084
+ """
1085
+ filesystem = self.filesystem(workspace=workspace) if filesystem is None else filesystem
1086
+
1087
+ return ds.dataset(
1088
+ source=self.full_path(),
1089
+ filesystem=filesystem,
1090
+ **kwargs
1091
+ )
1092
+
1093
+ def read_arrow_table(
1094
+ self,
1095
+ batch_size: Optional[int] = None,
1096
+ concat: bool = True,
1097
+ **kwargs
1098
+ ) -> pa.Table:
1099
+ """Read the path into an Arrow table.
1100
+
1101
+ Args:
1102
+ batch_size: Optional batch size for reads.
1103
+ concat: Whether to concatenate tables for directories.
1104
+ **kwargs: Format-specific options.
1105
+
1106
+ Returns:
1107
+ An Arrow Table (or list of tables if concat=False).
1108
+ """
1109
+ if self.is_file():
1110
+ with self.open("rb") as f:
1111
+ return f.read_arrow_table(batch_size=batch_size, **kwargs)
1112
+
1113
+ if self.is_dir():
1114
+ tables: list[pa.Table] = []
1115
+ for child in self.ls(recursive=True):
1116
+ if child.is_file():
1117
+ with child.open("rb") as f:
1118
+ tables.append(f.read_arrow_table(batch_size=batch_size, **kwargs))
1119
+
1120
+ if not tables:
1121
+ return pa.Table.from_batches([], schema=pa.schema([]))
1122
+
1123
+ if not concat:
1124
+ # type: ignore[return-value]
1125
+ return tables # caller asked for raw list
1126
+
1127
+ try:
1128
+ return pa.concat_tables(tables)
1129
+ except Exception:
1130
+ # Fallback: concat via polars (diagonal relaxed) then back to Arrow
1131
+ from polars import CompatLevel
1132
+
1133
+ return self.read_polars(
1134
+ batch_size=batch_size,
1135
+ how="diagonal_relaxed",
1136
+ rechunk=True,
1137
+ concat=True,
1138
+ **kwargs,
1139
+ ).to_arrow(compat_level=CompatLevel.newest())
1140
+
1141
+ raise FileNotFoundError(f"Path does not exist: {self}")
1142
+
1143
+ def write_arrow(
1144
+ self,
1145
+ table: Union[pa.Table, pa.RecordBatch],
1146
+ batch_size: Optional[int] = None,
1147
+ **kwargs
1148
+ ):
1149
+ """Write Arrow data to the path.
1150
+
1151
+ Args:
1152
+ table: Arrow table or record batch to write.
1153
+ batch_size: Optional batch size for writes.
1154
+ **kwargs: Format-specific options.
1155
+
1156
+ Returns:
1157
+ The DatabricksPath instance.
1158
+ """
1159
+ if not isinstance(table, pa.Table):
1160
+ table = convert(table, pa.Table)
1161
+
1162
+ return self.write_arrow_table(
1163
+ table=table,
1164
+ batch_size=batch_size,
1165
+ **kwargs
1166
+ )
1167
+
1168
+ def write_arrow_table(
1169
+ self,
1170
+ table: pa.Table,
1171
+ file_format: Optional[FileFormat] = None,
1172
+ batch_size: Optional[int] = None,
1173
+ **kwargs
1174
+ ):
1175
+ """Write an Arrow table to the path, sharding if needed.
1176
+
1177
+ Args:
1178
+ table: Arrow table to write.
1179
+ file_format: Optional file format override.
1180
+ batch_size: Optional batch size for writes.
1181
+ **kwargs: Format-specific options.
1182
+
1183
+ Returns:
1184
+ The DatabricksPath instance.
1185
+ """
1186
+ with self.connect(clone=False) as connected:
1187
+ if connected.is_dir_sink():
1188
+ seed = int(time.time() * 1000)
1189
+
1190
+ for i, batch in enumerate(table.to_batches(max_chunksize=batch_size)):
1191
+ part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
1192
+
1193
+ with part_path.open(mode="wb") as f:
1194
+ f.write_arrow_batch(batch, file_format=file_format)
1195
+
1196
+ return connected
1197
+
1198
+ connected.open(mode="wb", clone=False).write_arrow_table(
1199
+ table,
1200
+ file_format=file_format,
1201
+ batch_size=batch_size,
1202
+ **kwargs
1203
+ )
1204
+
1205
+ return self
1206
+
1207
+ def read_pandas(
1208
+ self,
1209
+ batch_size: Optional[int] = None,
1210
+ concat: bool = True,
1211
+ **kwargs
1212
+ ):
1213
+ """Read the path into a pandas DataFrame.
1214
+
1215
+ Args:
1216
+ batch_size: Optional batch size for reads.
1217
+ concat: Whether to concatenate results for directories.
1218
+ **kwargs: Format-specific options.
1219
+
1220
+ Returns:
1221
+ A pandas DataFrame or list of DataFrames if concat=False.
1222
+ """
1223
+ if concat:
1224
+ return self.read_arrow_table(batch_size=batch_size, concat=True, **kwargs).to_pandas()
1225
+
1226
+ tables = self.read_arrow_table(batch_size=batch_size, concat=False, **kwargs)
1227
+ return [t.to_pandas() for t in tables] # type: ignore[arg-type]
1228
+
1229
+ def write_pandas(
1230
+ self,
1231
+ df,
1232
+ batch_size: Optional[int] = None,
1233
+ **kwargs
1234
+ ):
1235
+ """Write a pandas DataFrame to the path.
1236
+
1237
+ Args:
1238
+ df: pandas DataFrame to write.
1239
+ batch_size: Optional batch size for writes.
1240
+ **kwargs: Format-specific options.
1241
+
1242
+ Returns:
1243
+ The DatabricksPath instance.
1244
+ """
1245
+ return self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
1246
+
1247
+ def read_polars(
1248
+ self,
1249
+ batch_size: Optional[int] = None,
1250
+ how: str = "diagonal_relaxed",
1251
+ rechunk: bool = False,
1252
+ concat: bool = True,
1253
+ **kwargs
1254
+ ):
1255
+ """Read the path into a polars DataFrame.
1256
+
1257
+ Args:
1258
+ batch_size: Optional batch size for reads.
1259
+ how: Polars concat strategy.
1260
+ rechunk: Whether to rechunk after concat.
1261
+ concat: Whether to concatenate results for directories.
1262
+ **kwargs: Format-specific options.
1263
+
1264
+ Returns:
1265
+ A polars DataFrame or list of DataFrames if concat=False.
1266
+ """
1267
+ import polars as pl
1268
+
1269
+ if self.is_file():
1270
+ with self.open("rb") as f:
1271
+ return f.read_polars(batch_size=batch_size, **kwargs)
1272
+
1273
+ if self.is_dir():
1274
+ dfs = []
1275
+ for child in self.ls(recursive=True):
1276
+ if child.is_file():
1277
+ with child.open("rb") as f:
1278
+ dfs.append(f.read_polars(batch_size=batch_size, **kwargs))
1279
+
1280
+ if not dfs:
1281
+ return pl.DataFrame()
1282
+
1283
+ if concat:
1284
+ return pl.concat(dfs, how=how, rechunk=rechunk)
1285
+ return dfs # type: ignore[return-value]
1286
+
1287
+ raise FileNotFoundError(f"Path does not exist: {self}")
1288
+
1289
+ def write_polars(
1290
+ self,
1291
+ df,
1292
+ batch_size: Optional[int] = None,
1293
+ **kwargs
1294
+ ):
1295
+ """
1296
+ Write Polars to a DatabricksPath.
1297
+
1298
+ Behavior:
1299
+ - If path is a directory (or ends with a trailing "/"): shard to parquet parts.
1300
+ `batch_size` = rows per part (default 1_000_000).
1301
+ - If path is a file: write using DatabricksIO.write_polars which is extension-driven
1302
+ (parquet/csv/ipc/json/ndjson etc.).
1303
+
1304
+ Args:
1305
+ df: polars DataFrame or LazyFrame to write.
1306
+ batch_size: Optional rows per part for directory sinks.
1307
+ **kwargs: Format-specific options.
1308
+
1309
+ Returns:
1310
+ The DatabricksPath instance.
1311
+
1312
+ Notes:
1313
+ - If `df` is a LazyFrame, we collect it first (optionally streaming).
1314
+ """
1315
+ import polars as pl
1316
+
1317
+ if isinstance(df, pl.LazyFrame):
1318
+ df = df.collect()
1319
+
1320
+ if not isinstance(df, pl.DataFrame):
1321
+ raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")
1322
+
1323
+ with self.connect() as connected:
1324
+ if connected.is_dir_sink():
1325
+ seed = int(time.time() * 1000)
1326
+ rows_per_part = batch_size or 1_000_000
1327
+
1328
+ # Always parquet for directory sinks (lake layout standard)
1329
+ for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
1330
+ part_path = connected / f"part-{i:05d}-{seed}-{_rand_str(4)}.parquet"
1331
+
1332
+ part_path.write_polars(chunk, **kwargs)
1333
+
1334
+ return connected
1335
+
1336
+ # Single file write: format/extension is handled in DatabricksIO.write_polars
1337
+ connected.write_polars(df, **kwargs)
1338
+
1339
+ return connected
1340
+
1341
+ def sql(
1342
+ self,
1343
+ query: str,
1344
+ engine: str = "auto"
1345
+ ):
1346
+ """Run a local SQL query against data at this path.
1347
+
1348
+ Args:
1349
+ query: SQL query string referencing the path.
1350
+ engine: Query engine ("duckdb", "polars", or "auto").
1351
+
1352
+ Returns:
1353
+ An Arrow Table with the query results.
1354
+ """
1355
+ if engine == "auto":
1356
+ try:
1357
+ import duckdb
1358
+ engine = "duckdb"
1359
+ except ImportError:
1360
+ engine = "polars"
1361
+
1362
+ from_table = "dbfs.`%s`" % self.full_path()
1363
+
1364
+ if from_table not in query:
1365
+ raise ValueError(
1366
+ "SQL query must contain %s to execute query:\n%s" % (
1367
+ from_table,
1368
+ query
1369
+ )
1370
+ )
1371
+
1372
+ if engine == "duckdb":
1373
+ import duckdb
1374
+
1375
+ __arrow_table__ = self.read_arrow_table()
1376
+
1377
+ return (
1378
+ duckdb.connect()
1379
+ .execute(query=query.replace(from_table, "__arrow_table__"))
1380
+ .fetch_arrow_table()
1381
+ )
1382
+ elif engine == "polars":
1383
+ from polars import CompatLevel
1384
+
1385
+ return (
1386
+ self.read_polars()
1387
+ .sql(query=query.replace(from_table, "self"))
1388
+ .to_arrow(compat_level=CompatLevel.newest())
1389
+ )
1390
+ else:
1391
+ raise ValueError(
1392
+ "Invalid engine %s, must be in duckdb, polars" % engine
1393
+ )
1394
+
1395
+
1396
+ @register_converter(DatabricksPath, pa.Table)
1397
+ def databricks_path_to_arrow_table(
1398
+ data: DatabricksPath,
1399
+ options: Optional[CastOptions] = None,
1400
+ ) -> pa.Table:
1401
+ return cast_arrow_tabular(
1402
+ data.read_arrow_table(),
1403
+ options
1404
+ )
1405
+
1406
+
1407
+ @polars_converter(DatabricksPath, PolarsDataFrame)
1408
+ def databricks_path_to_polars(
1409
+ data: DatabricksPath,
1410
+ options: Optional[CastOptions] = None,
1411
+ ) -> PolarsDataFrame:
1412
+ return cast_polars_dataframe(
1413
+ data.read_polars(),
1414
+ options
1415
+ )
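
The two converters registered above hook DatabricksPath into the cast registry (yggdrasil/types/cast/registry.py, also expanded in this release), so a path can be passed to convert() wherever an Arrow table or Polars DataFrame is expected. A rough usage sketch under the same assumptions as before (hypothetical volume path, reachable workspace, duckdb optional):

    import pyarrow as pa

    from yggdrasil.databricks.workspaces.path import DatabricksPath
    from yggdrasil.types.cast.registry import convert

    path = DatabricksPath.parse("/Volumes/my_catalog/my_schema/my_volume/events.parquet")

    # Registered converter: read the file and cast it to a pyarrow.Table.
    table = convert(path, pa.Table)

    # Local SQL over the same data; the query must reference the path as dbfs.`<full path>`.
    result = path.sql(
        "SELECT count(*) FROM dbfs.`/Volumes/my_catalog/my_schema/my_volume/events.parquet`",
        engine="auto",  # duckdb when installed, otherwise polars
    )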