ygg 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/METADATA +1 -1
- ygg-0.1.33.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +244 -3
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +24 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +29 -4
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +147 -0
- yggdrasil/databricks/sql/types.py +33 -1
- yggdrasil/databricks/workspaces/__init__.py +2 -1
- yggdrasil/databricks/workspaces/filesytem.py +183 -0
- yggdrasil/databricks/workspaces/io.py +387 -9
- yggdrasil/databricks/workspaces/path.py +297 -2
- yggdrasil/databricks/workspaces/path_kind.py +3 -0
- yggdrasil/databricks/workspaces/workspace.py +202 -5
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +123 -1
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.31.dist-info/RECORD +0 -59
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/WHEEL +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/path.py

```diff
@@ -1,3 +1,5 @@
+"""Databricks path abstraction spanning DBFS, workspace, and volumes."""
+
 # src/yggdrasil/databricks/workspaces/databricks_path.py
 from __future__ import annotations
 
@@ -49,6 +51,14 @@ __all__ = [
 
 
 def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
+    """Normalize path parts by splitting on '/' and removing empties.
+
+    Args:
+        parts: String or list of path parts.
+
+    Returns:
+        A flattened list of path components.
+    """
    if isinstance(parts, str):
        parts = [parts]
 
@@ -64,12 +74,21 @@ def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
 
 
 def _rand_str(n: int) -> str:
+    """Return a random alphanumeric string of length ``n``.
+
+    Args:
+        n: Length of the random string.
+
+    Returns:
+        Random alphanumeric string.
+    """
     alphabet = string.ascii_letters + string.digits
     return "".join(random.choices(alphabet, k=n))
 
 
 @dataclasses.dataclass
 class DatabricksPath:
+    """Path wrapper for Databricks workspace, volumes, and DBFS objects."""
     kind: DatabricksPathKind
     parts: List[str]
 
@@ -113,6 +132,15 @@ class DatabricksPath:
         obj: Union["DatabricksPath", str, List[str]],
         workspace: Optional["Workspace"] = None,
     ) -> "DatabricksPath":
+        """Parse input into a DatabricksPath instance.
+
+        Args:
+            obj: Input path, DatabricksPath, or path parts list.
+            workspace: Optional Workspace to bind to the path.
+
+        Returns:
+            A DatabricksPath instance.
+        """
         if not obj:
             return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)
 
```
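The `parse` classmethod documented above is the main entry point for building paths. A minimal usage sketch, assuming the import path implied by the file list above; the Unity Catalog layout (`main/default/landing`) is illustrative:

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

# Parse a volume-style path string; binding to a Workspace is optional
# here and only needed once a remote operation is attempted.
p = DatabricksPath.parse("/Volumes/main/default/landing/raw/data.parquet")

print(p.kind)   # expected: DatabricksPathKind.VOLUME
print(p.parts)  # flattened components, per _flatten_parts above
```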
```diff
@@ -194,6 +222,11 @@
         return "dbfs://%s" % self.full_path()
 
     def full_path(self) -> str:
+        """Return the fully qualified path for this namespace.
+
+        Returns:
+            The fully qualified path string.
+        """
         if self.kind == DatabricksPathKind.DBFS:
             return self.dbfs_full_path()
         elif self.kind == DatabricksPathKind.WORKSPACE:
@@ -204,10 +237,23 @@
         raise ValueError(f"Unknown DatabricksPath kind: {self.kind!r}")
 
     def filesystem(self, workspace: Optional["Workspace"] = None):
+        """Return a PyArrow filesystem adapter for this workspace.
+
+        Args:
+            workspace: Optional workspace override.
+
+        Returns:
+            A PyArrow FileSystem instance.
+        """
         return self.workspace.filesytem(workspace=workspace)
 
     @property
     def parent(self):
+        """Return the parent path.
+
+        Returns:
+            A DatabricksPath representing the parent.
+        """
         if not self.parts:
             return self
 
```
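`full_path()` dispatches on the path kind, and each kind renders under its own root prefix (see the `*_full_path` helpers further down in this diff). A sketch with illustrative paths:

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

for raw in (
    "/Workspace/Users/someone@example.com/notebook",
    "/dbfs/tmp/data.csv",
    "/Volumes/main/default/vol/file.parquet",
):
    p = DatabricksPath.parse(raw)
    print(p.kind.value, "->", p.full_path())
```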
```diff
@@ -226,6 +272,11 @@
 
     @property
     def workspace(self):
+        """Return the associated Workspace instance.
+
+        Returns:
+            The Workspace associated with this path.
+        """
         if self._workspace is None:
             from .workspace import Workspace
 
@@ -238,6 +289,11 @@
 
     @property
     def name(self) -> str:
+        """Return the final path component.
+
+        Returns:
+            The final path name component.
+        """
         if not self.parts:
             return ""
 
@@ -248,6 +304,11 @@
 
     @property
     def extension(self) -> str:
+        """Return the file extension for the path, if any.
+
+        Returns:
+            The file extension without leading dot.
+        """
         name = self.name
         if "." in name:
             return name.split(".")[-1]
@@ -255,6 +316,11 @@
 
     @property
     def file_format(self) -> FileFormat:
+        """Infer the file format from the file extension.
+
+        Returns:
+            A PyArrow FileFormat instance.
+        """
         ext = self.extension
 
         if ext == "parquet":
@@ -270,6 +336,11 @@
 
     @property
     def content_length(self):
+        """Return the size of the path in bytes if known.
+
+        Returns:
+            The size in bytes.
+        """
         if self._size is None:
             self.refresh_status()
         return self._size
@@ -280,6 +351,11 @@
 
     @property
     def mtime(self) -> Optional[float]:
+        """Return the last-modified time for the path.
+
+        Returns:
+            Last-modified timestamp in seconds.
+        """
         if self._mtime is None:
             self.refresh_status()
         return self._mtime
```
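`name` and `extension` are computed locally from the parts, while `content_length` and `mtime` lazily call `refresh_status()` on first access. A sketch of the local pieces (import path and file layout as assumed above):

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

p = DatabricksPath.parse("/Volumes/main/default/vol/events.parquet")

print(p.name)       # "events.parquet" - the final component
print(p.extension)  # "parquet" - the text after the last "."

# content_length and mtime populate their caches via refresh_status(),
# so reading them requires a reachable workspace.
```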
```diff
@@ -314,16 +390,31 @@
         )
 
     def is_file(self):
+        """Return True when the path is a file.
+
+        Returns:
+            True if the path is a file.
+        """
         if self._is_file is None:
             self.refresh_status()
         return self._is_file
 
     def is_dir(self):
+        """Return True when the path is a directory.
+
+        Returns:
+            True if the path is a directory.
+        """
         if self._is_dir is None:
             self.refresh_status()
         return self._is_dir
 
     def is_dir_sink(self):
+        """Return True if the path represents a directory sink.
+
+        Returns:
+            True if the path represents a directory sink.
+        """
         return self.is_dir() or (self.parts and self.parts[-1] == "")
 
     @property
@@ -331,6 +422,14 @@
         return self._workspace is not None and self._workspace.connected
 
     def connect(self, clone: bool = False) -> "DatabricksPath":
+        """Connect the path to its workspace, optionally returning a clone.
+
+        Args:
+            clone: Whether to return a cloned instance.
+
+        Returns:
+            The connected DatabricksPath.
+        """
         workspace = self.workspace.connect(clone=clone)
 
         if clone:
@@ -346,6 +445,11 @@
         pass
 
     def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
+        """Return (catalog, schema, volume, rel_path) for volume paths.
+
+        Returns:
+            Tuple of (catalog, schema, volume, rel_path).
+        """
         if self.kind != DatabricksPathKind.VOLUME:
             return None, None, None, None
 
@@ -358,6 +462,11 @@
         return catalog, schema, volume, self.parts[3:]  # type: ignore[return-value]
 
     def refresh_status(self) -> "DatabricksPath":
+        """Refresh cached metadata for the path.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         if self.kind == DatabricksPathKind.VOLUME:
             self._refresh_volume_status()
         elif self.kind == DatabricksPathKind.WORKSPACE:
```
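`volume_parts()` splits a volume path into its Unity Catalog coordinates. A sketch with an illustrative layout; note the relative tail comes back as `self.parts[3:]` despite the `PurePosixPath` annotation (the `type: ignore` above flags that mismatch):

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

p = DatabricksPath.parse("/Volumes/main/default/landing/year=2024/part-0.parquet")

catalog, schema, volume, rel = p.volume_parts()
print(catalog, schema, volume)  # "main" "default" "landing"
print(rel)                      # remaining components below the volume root

# Non-volume paths return (None, None, None, None), as documented above.
```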
```diff
@@ -445,6 +554,17 @@
         size: Optional[int] = None,
         mtime: Optional[float] = None,
     ):
+        """Update cached metadata fields.
+
+        Args:
+            is_file: Optional file flag.
+            is_dir: Optional directory flag.
+            size: Optional size in bytes.
+            mtime: Optional modification time in seconds.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         self._is_file = is_file
         self._is_dir = is_dir
         self._size = size
@@ -455,6 +575,11 @@
     # ---- API path normalization helpers ----
 
     def workspace_full_path(self) -> str:
+        """Return the full workspace path string.
+
+        Returns:
+            Workspace path string.
+        """
         if not self.parts:
             return "/Workspace"
 
@@ -463,6 +588,11 @@
         return "/Workspace/%s" % "/".join(parts)
 
     def dbfs_full_path(self) -> str:
+        """Return the full DBFS path string.
+
+        Returns:
+            DBFS path string.
+        """
         if not self.parts:
             return "/dbfs"
 
@@ -471,6 +601,11 @@
         return "/dbfs/%s" % "/".join(parts)
 
     def files_full_path(self) -> str:
+        """Return the full files (volume) path string.
+
+        Returns:
+            Volume path string.
+        """
         if not self.parts:
             return "/Volumes"
 
@@ -479,9 +614,27 @@
         return "/Volumes/%s" % "/".join(parts)
 
     def exists(self, *, follow_symlinks=True) -> bool:
+        """Return True if the path exists.
+
+        Args:
+            follow_symlinks: Unused; for compatibility.
+
+        Returns:
+            True if the path exists.
+        """
         return bool(self.is_file() or self.is_dir())
 
     def mkdir(self, mode=None, parents=True, exist_ok=True):
+        """Create a directory for the path.
+
+        Args:
+            mode: Optional mode (unused).
+            parents: Whether to create parent directories.
+            exist_ok: Whether to ignore existing directories.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         try:
             if self.kind == DatabricksPathKind.WORKSPACE:
                 self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
@@ -577,6 +730,14 @@
         return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())
 
     def remove(self, recursive: bool = True):
+        """Remove the path as a file or directory.
+
+        Args:
+            recursive: Whether to delete directories recursively.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         if self.kind == DatabricksPathKind.VOLUME:
             return self._remove_volume_obj(recursive=recursive)
         elif self.kind == DatabricksPathKind.WORKSPACE:
@@ -600,6 +761,11 @@
             return self._remove_dbfs_dir(recursive=recursive)
 
     def rmfile(self):
+        """Remove the path as a file.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         try:
             if self.kind == DatabricksPathKind.VOLUME:
                 return self._remove_volume_file()
@@ -636,6 +802,14 @@
         return self
 
     def rmdir(self, recursive: bool = True):
+        """Remove the path as a directory.
+
+        Args:
+            recursive: Whether to delete directories recursively.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         if self.kind == DatabricksPathKind.VOLUME:
             return self._remove_volume_dir(recursive=recursive)
         elif self.kind == DatabricksPathKind.WORKSPACE:
```
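The mutation helpers above (`mkdir`, `remove`, `rmfile`, `rmdir`) call the Databricks APIs for whichever namespace the path belongs to. A hedged sketch, assuming workspace credentials are already configured (for example via the standard Databricks SDK environment variables):

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

p = DatabricksPath.parse("/Volumes/main/default/vol/tmp/job-output")

p.mkdir(parents=True, exist_ok=True)   # create the directory tree
print(p.exists())                      # True once created
p.remove(recursive=True)               # kind-aware delete, recursive for dirs
```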
```diff
@@ -691,6 +865,16 @@
         return self.reset_metadata()
 
     def ls(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
+        """List directory contents for the path.
+
+        Args:
+            recursive: Whether to recurse into subdirectories.
+            fetch_size: Optional page size for listings.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Yields:
+            DatabricksPath entries.
+        """
         if self.kind == DatabricksPathKind.VOLUME:
             yield from self._ls_volume(recursive=recursive, fetch_size=fetch_size, allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.WORKSPACE:
```
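`ls()` is a generator (note the `yield from` above), so large trees can be walked lazily. A sketch:

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

root = DatabricksPath.parse("/Volumes/main/default/vol/landing")

# allow_not_found=True (the default) suppresses missing-path errors.
for entry in root.ls(recursive=True):
    print(entry.full_path(), entry.is_file())
```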
```diff
@@ -822,6 +1006,16 @@
         encoding=None,
         clone: bool = False,
     ) -> DatabricksIO:
+        """Open the path as a DatabricksIO instance.
+
+        Args:
+            mode: File mode string.
+            encoding: Optional text encoding.
+            clone: Whether to return a cloned path instance.
+
+        Returns:
+            A DatabricksIO instance.
+        """
         path = self.connect(clone=clone)
 
         return (
@@ -835,6 +1029,15 @@
         dest: Union["DatabricksIO", "DatabricksPath", str],
         allow_not_found: bool = True,
     ) -> None:
+        """Copy this path to another path or IO destination.
+
+        Args:
+            dest: Destination IO, DatabricksPath, or path string.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            None.
+        """
         if self.is_file() and dest.is_file():
             with self.open(mode="rb") as src:
                 src.copy_to(dest=dest)
```
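`open()` hands back a `DatabricksIO` handle and `copy_to()` streams between paths. A sketch, assuming the handle exposes the usual file-like `read` interface:

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

src = DatabricksPath.parse("/Volumes/main/default/vol/in/report.csv")
dst = DatabricksPath.parse("/Volumes/main/default/vol/out/report.csv")

with src.open(mode="rb") as f:
    header = f.read(64)  # assumed file-like read on DatabricksIO

src.copy_to(dst)  # streams file contents to the destination path
```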
```diff
@@ -869,6 +1072,16 @@
         filesystem: Optional[FileSystem] = None,
         **kwargs
     ):
+        """Return a PyArrow dataset referencing this path.
+
+        Args:
+            workspace: Optional workspace override.
+            filesystem: Optional filesystem override.
+            **kwargs: Dataset options.
+
+        Returns:
+            A PyArrow Dataset instance.
+        """
         filesystem = self.filesystem(workspace=workspace) if filesystem is None else filesystem
 
         return ds.dataset(
@@ -883,6 +1096,16 @@
         concat: bool = True,
         **kwargs
     ) -> pa.Table:
+        """Read the path into an Arrow table.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            concat: Whether to concatenate tables for directories.
+            **kwargs: Format-specific options.
+
+        Returns:
+            An Arrow Table (or list of tables if concat=False).
+        """
         if self.is_file():
             with self.open("rb") as f:
                 return f.read_arrow_table(batch_size=batch_size, **kwargs)
```
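`read_arrow_table()` reads eagerly, while `dataset()` wraps the path in a PyArrow dataset over the workspace filesystem adapter, which enables filter pushdown. A sketch (the column name is illustrative):

```python
import pyarrow.dataset as ds

from yggdrasil.databricks.workspaces.path import DatabricksPath

p = DatabricksPath.parse("/Volumes/main/default/vol/events/")

# Eager: per-file tables are concatenated when concat=True.
table = p.read_arrow_table(concat=True)

# Lazy: push a filter down through the PyArrow dataset API.
subset = p.dataset().to_table(filter=ds.field("year") == 2024)
```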
```diff
@@ -923,6 +1146,16 @@
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write Arrow data to the path.
+
+        Args:
+            table: Arrow table or record batch to write.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         if not isinstance(table, pa.Table):
             table = convert(table, pa.Table)
 
@@ -935,9 +1168,21 @@
     def write_arrow_table(
         self,
         table: pa.Table,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write an Arrow table to the path, sharding if needed.
+
+        Args:
+            table: Arrow table to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         with self.connect(clone=False) as connected:
             if connected.is_dir_sink():
                 seed = int(time.time() * 1000)
@@ -946,12 +1191,13 @@
                     part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
 
                     with part_path.open(mode="wb") as f:
-                        f.write_arrow_batch(batch)
+                        f.write_arrow_batch(batch, file_format=file_format)
 
                 return connected
 
             connected.open(mode="wb", clone=False).write_arrow_table(
                 table,
+                file_format=file_format,
                 batch_size=batch_size,
                 **kwargs
             )
```
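The substantive change in this hunk is the new `file_format` parameter, threaded through both the directory-sink branch (per-part `write_arrow_batch` calls) and the single-file branch. A sketch of both write shapes; treating a trailing slash as a directory sink follows from `is_dir_sink()` above and is an assumption about how `parse` keeps the empty last part:

```python
import pyarrow as pa

from yggdrasil.databricks.workspaces.path import DatabricksPath

table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Directory sink: the table is sharded into parts named
# "<seed>-<index>-<rand>.parquet", each written with file_format.
DatabricksPath.parse("/Volumes/main/default/vol/out/").write_arrow_table(table)

# Concrete file path: a single object is written instead;
# file_format=None keeps the extension-driven default.
DatabricksPath.parse("/Volumes/main/default/vol/out/all.parquet").write_arrow_table(table)
```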
```diff
@@ -960,10 +1206,20 @@
 
     def read_pandas(
         self,
-        batch_size: int =
+        batch_size: Optional[int] = None,
         concat: bool = True,
         **kwargs
     ):
+        """Read the path into a pandas DataFrame.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            concat: Whether to concatenate results for directories.
+            **kwargs: Format-specific options.
+
+        Returns:
+            A pandas DataFrame or list of DataFrames if concat=False.
+        """
         if concat:
             return self.read_arrow_table(batch_size=batch_size, concat=True, **kwargs).to_pandas()
 
@@ -976,6 +1232,16 @@
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a pandas DataFrame to the path.
+
+        Args:
+            df: pandas DataFrame to write.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            The DatabricksPath instance.
+        """
         return self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
 
     def read_polars(
@@ -986,6 +1252,18 @@
         concat: bool = True,
         **kwargs
     ):
+        """Read the path into a polars DataFrame.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            how: Polars concat strategy.
+            rechunk: Whether to rechunk after concat.
+            concat: Whether to concatenate results for directories.
+            **kwargs: Format-specific options.
+
+        Returns:
+            A polars DataFrame or list of DataFrames if concat=False.
+        """
         import polars as pl
 
         if self.is_file():
@@ -1023,6 +1301,14 @@
         - If path is a file: write using DatabricksIO.write_polars which is extension-driven
           (parquet/csv/ipc/json/ndjson etc.).
 
+        Args:
+            df: polars DataFrame or LazyFrame to write.
+            batch_size: Optional rows per part for directory sinks.
+            **kwargs: Format-specific options.
+
+        Returns:
+            The DatabricksPath instance.
+
         Notes:
             - If `df` is a LazyFrame, we collect it first (optionally streaming).
         """
@@ -1057,6 +1343,15 @@
         query: str,
         engine: str = "auto"
     ):
+        """Run a local SQL query against data at this path.
+
+        Args:
+            query: SQL query string referencing the path.
+            engine: Query engine ("duckdb", "polars", or "auto").
+
+        Returns:
+            An Arrow Table with the query results.
+        """
         if engine == "auto":
             try:
                 import duckdb
```
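The pandas and polars readers above layer on `read_arrow_table`. A sketch; note the pandas writer's method name is not visible in its hunk, so `write_pandas` is assumed here from the docstring:

```python
from yggdrasil.databricks.workspaces.path import DatabricksPath

p = DatabricksPath.parse("/Volumes/main/default/vol/events/")

# Directory reads concatenate per-file frames by default; pass
# concat=False to receive one frame per file instead.
pdf = p.read_pandas()
pldf = p.read_polars(how="vertical", rechunk=True)

out = DatabricksPath.parse("/Volumes/main/default/vol/out/")
out.write_pandas(pdf)  # method name assumed; the hunk shows only its body
```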
yggdrasil/databricks/workspaces/path_kind.py

```diff
@@ -1,3 +1,5 @@
+"""Enumerations for Databricks path namespaces."""
+
 from enum import Enum
 
 
@@ -5,6 +7,7 @@ __all__ = ["DatabricksPathKind"]
 
 
 class DatabricksPathKind(str, Enum):
+    """Supported Databricks path kinds for workspace, volumes, and DBFS."""
     WORKSPACE = "workspace"
     VOLUME = "volume"
     DBFS = "dbfs"
```
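Because `DatabricksPathKind` subclasses `str`, members compare equal to their raw values, which keeps serialized configs and REST payloads simple:

```python
from yggdrasil.databricks.workspaces.path_kind import DatabricksPathKind

kind = DatabricksPathKind("volume")
assert kind is DatabricksPathKind.VOLUME
assert kind == "volume"  # str-enum members equal their values
print([k.value for k in DatabricksPathKind])  # ["workspace", "volume", "dbfs"]
```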