ygg 0.1.51__py3-none-any.whl → 0.1.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/METADATA +1 -1
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/RECORD +18 -16
- yggdrasil/databricks/sql/engine.py +288 -84
- yggdrasil/databricks/sql/exceptions.py +3 -1
- yggdrasil/databricks/workspaces/io.py +78 -69
- yggdrasil/databricks/workspaces/path.py +367 -166
- yggdrasil/databricks/workspaces/path_kind.py +3 -3
- yggdrasil/databricks/workspaces/volumes_path.py +85 -0
- yggdrasil/libs/databrickslib.py +5 -0
- yggdrasil/pyutils/callable_serde.py +10 -10
- yggdrasil/pyutils/retry.py +2 -2
- yggdrasil/types/cast/registry.py +0 -14
- yggdrasil/types/file_format.py +10 -0
- yggdrasil/version.py +1 -1
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/WHEEL +0 -0
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/top_level.txt +0 -0
@@ -9,15 +9,17 @@ import random
 import string
 import time
 from pathlib import PurePosixPath
-from typing import Optional, Tuple, Union, TYPE_CHECKING, List
+from typing import Optional, Tuple, Union, TYPE_CHECKING, List

 import pyarrow as pa
 import pyarrow.dataset as ds
+from pyarrow import ArrowInvalid
 from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
 from pyarrow.fs import FileInfo, FileType, FileSystem

 from .io import DatabricksIO
 from .path_kind import DatabricksPathKind
+from .volumes_path import get_volume_status, get_volume_metadata
 from ...libs.databrickslib import databricks
 from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
@@ -25,9 +27,10 @@ from ...types.cast.arrow_cast import cast_arrow_tabular
 from ...types.cast.cast_options import CastOptions
 from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
 from ...types.cast.registry import convert, register_converter
+from ...types.file_format import ExcelFileFormat

 if databricks is not None:
-    from databricks.sdk.service.catalog import VolumeType
+    from databricks.sdk.service.catalog import VolumeType, PathOperation, VolumeInfo
     from databricks.sdk.service.workspace import ObjectType
     from databricks.sdk.errors.platform import (
         NotFound,
@@ -51,7 +54,9 @@ __all__ = [
 ]


-def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
+def _flatten_parts(
+    parts: Union["DatabricksPath", List[str], str],
+) -> List[str]:
     """Normalize path parts by splitting on '/' and removing empties.

     Args:
@@ -60,8 +65,13 @@ def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
     Returns:
         A flattened list of path components.
     """
-    if isinstance(parts,
-        parts
+    if not isinstance(parts, list):
+        if isinstance(parts, DatabricksPath):
+            return parts.parts
+        elif isinstance(parts, (set, tuple)):
+            parts = list(parts)
+        else:
+            parts = [str(parts).replace("\\", "/")]

     if any("/" in part for part in parts):
         new_parts: list[str] = []
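The new `_flatten_parts` accepts a `DatabricksPath`, a list/tuple/set of parts, or a plain string, normalizes backslashes, then splits on `/`. A minimal standalone sketch of that normalization (the helper name and the final empty-component filtering are illustrative, not the package's exact code):

    from typing import Iterable, List, Union

    def flatten_parts_sketch(parts: Union[str, Iterable[str]]) -> List[str]:
        # Coerce non-list inputs to a list of strings, normalizing Windows
        # separators, mirroring the added branches above.
        if not isinstance(parts, list):
            if isinstance(parts, (set, tuple)):
                parts = list(parts)
            else:
                parts = [str(parts).replace("\\", "/")]
        # Split every element on "/" and drop empty components, per the docstring.
        out: List[str] = []
        for part in parts:
            out.extend(p for p in part.split("/") if p)
        return out

    assert flatten_parts_sketch("Volumes\\main\\raw\\file.csv") == ["Volumes", "main", "raw", "file.csv"]
    assert flatten_parts_sketch(("Volumes", "main/raw")) == ["Volumes", "main", "raw"]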
@@ -92,13 +102,16 @@ class DatabricksPath:
     """Path wrapper for Databricks workspace, volumes, and DBFS objects."""
     kind: DatabricksPathKind
     parts: List[str]
+    temporary: bool = False
+
+    _is_file: Optional[bool] = dataclasses.field(repr=False, hash=False, default=None)
+    _is_dir: Optional[bool] = dataclasses.field(repr=False, hash=False, default=None)
+    _size: Optional[int] = dataclasses.field(repr=False, hash=False, default=None)
+    _mtime: Optional[float] = dataclasses.field(repr=False, hash=False, default=None)

-    _workspace: Optional["Workspace"] = None
+    _workspace: Optional["Workspace"] = dataclasses.field(repr=False, hash=False, default=None)

-
-    _is_dir: Optional[bool] = None
-    _size: Optional[int] = None
-    _mtime: Optional[float] = None
+    _volume_info: Optional["VolumeInfo"] = dataclasses.field(repr=False, hash=False, default=None)

     def clone_instance(
         self,
@@ -110,6 +123,7 @@ class DatabricksPath:
         is_dir: Optional[bool] = dataclasses.MISSING,
         size: Optional[int] = dataclasses.MISSING,
         mtime: Optional[float] = dataclasses.MISSING,
+        volume_info: Optional["VolumeInfo"] = dataclasses.MISSING,
     ) -> "DatabricksPath":
         """
         Return a copy of this DatabricksPath, optionally overriding fields.
@@ -125,6 +139,21 @@ class DatabricksPath:
             _is_dir=self._is_dir if is_dir is dataclasses.MISSING else is_dir,
             _size=self._size if size is dataclasses.MISSING else size,
             _mtime=self._mtime if mtime is dataclasses.MISSING else mtime,
+            _volume_info=self._volume_info if volume_info is dataclasses.MISSING else volume_info,
+        )
+
+    @classmethod
+    def empty_instance(cls, workspace: Optional["Workspace"] = None):
+        return DatabricksPath(
+            kind=DatabricksPathKind.DBFS,
+            parts=[],
+            temporary=False,
+            _workspace=workspace,
+            _is_file=False,
+            _is_dir=False,
+            _size=0,
+            _mtime=0.0,
+            _volume_info=None,
         )

     @classmethod
@@ -132,18 +161,20 @@ class DatabricksPath:
         cls,
         obj: Union["DatabricksPath", str, List[str]],
         workspace: Optional["Workspace"] = None,
+        temporary: bool = False
     ) -> "DatabricksPath":
         """Parse input into a DatabricksPath instance.

         Args:
             obj: Input path, DatabricksPath, or path parts list.
             workspace: Optional Workspace to bind to the path.
+            temporary: Temporary location

         Returns:
             A DatabricksPath instance.
         """
         if not obj:
-            return
+            return cls.empty_instance(workspace=workspace)

         if not isinstance(obj, (str, list)):
             if isinstance(obj, DatabricksPath):
@@ -156,30 +187,35 @@ class DatabricksPath:
             if isinstance(obj, DatabricksIO):
                 return obj.path

-
+            else:
                 obj = str(obj)

+
         obj = _flatten_parts(obj)

         if obj and not obj[0]:
             obj = obj[1:]

         if not obj:
-            return
+            return cls.empty_instance(workspace=workspace)

         head, *tail = obj
-        head = head.casefold()

         if head == "dbfs":
             kind = DatabricksPathKind.DBFS
-        elif head
+        elif head in {"Workspace", "workspace"}:
             kind = DatabricksPathKind.WORKSPACE
-        elif head
+        elif head in {"Volumes", "volumes"}:
             kind = DatabricksPathKind.VOLUME
         else:
             raise ValueError(f"Invalid DatabricksPath head {head!r} from {obj!r}, must be in ['dbfs', 'workspace', 'volumes']")

-        return DatabricksPath(
+        return DatabricksPath(
+            kind=kind,
+            parts=tail,
+            temporary=temporary,
+            _workspace=workspace,
+        )

     def __hash__(self):
         return hash(self.full_path())
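Parsing no longer calls `head.casefold()`; the first component is matched against explicit sets, so only `dbfs`, `Workspace`/`workspace`, and `Volumes`/`volumes` roots are accepted. A runnable sketch of that dispatch with a stand-in enum (the names below are illustrative, not the package's API):

    import enum
    from typing import List, Tuple

    class PathKindSketch(enum.Enum):
        DBFS = "dbfs"
        WORKSPACE = "workspace"
        VOLUME = "volume"

    def split_head_sketch(parts: List[str]) -> Tuple[PathKindSketch, List[str]]:
        # Mirrors the added elif branches above.
        head, *tail = parts
        if head == "dbfs":
            return PathKindSketch.DBFS, tail
        if head in {"Workspace", "workspace"}:
            return PathKindSketch.WORKSPACE, tail
        if head in {"Volumes", "volumes"}:
            return PathKindSketch.VOLUME, tail
        raise ValueError(f"Invalid head {head!r}, must be in ['dbfs', 'workspace', 'volumes']")

    assert split_head_sketch(["Volumes", "main", "raw"]) == (PathKindSketch.VOLUME, ["main", "raw"])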
@@ -259,16 +295,19 @@ class DatabricksPath:
         return self

         if self._is_file is not None or self._is_dir is not None:
-            _is_file, _is_dir = False, True
+            _is_file, _is_dir, _size = False, True, 0
         else:
-            _is_file, _is_dir = None, None
+            _is_file, _is_dir, _size = None, None, None

         return DatabricksPath(
             kind=self.kind,
             parts=self.parts[:-1],
+            temporary=False,
             _workspace=self._workspace,
             _is_file=_is_file,
             _is_dir=_is_dir,
+            _size=_size,
+            _volume_info=self._volume_info
         )

     @property
@@ -281,7 +320,7 @@ class DatabricksPath:
         if self._workspace is None:
             from .workspace import Workspace

-
+            self._workspace = Workspace()
         return self._workspace

     @workspace.setter
@@ -330,13 +369,15 @@ class DatabricksPath:
             return CsvFileFormat()
         elif ext == "json":
             return JsonFileFormat()
+        elif ext in {"xlsx", "xlsm", "xls"}:
+            return ExcelFileFormat()
         else:
             raise ValueError(
                 "Cannot get file format from extension %s" % ext
             )

     @property
-    def content_length(self):
+    def content_length(self) -> int:
         """Return the size of the path in bytes if known.

         Returns:
@@ -344,10 +385,10 @@ class DatabricksPath:
         """
         if self._size is None:
             self.refresh_status()
-        return self._size
+        return self._size or 0

     @content_length.setter
-    def content_length(self, value: int):
+    def content_length(self, value: Optional[int]):
         self._size = value

     @property
@@ -390,6 +431,10 @@ class DatabricksPath:
             size=self.content_length,
         )

+    @property
+    def is_local(self):
+        return False
+
     def is_file(self):
         """Return True when the path is a file.

@@ -416,7 +461,16 @@ class DatabricksPath:
         Returns:
             True if the path represents a directory sink.
         """
-
+        if self.is_dir():
+            return True
+
+        if self.is_file():
+            return False
+
+        if self.parts and self.parts[-1] == "":
+            return True
+
+        return not "." in self.name

     @property
     def connected(self) -> bool:
@@ -443,7 +497,33 @@ class DatabricksPath:
         return self

     def close(self):
-
+        if self.temporary:
+            self.remove(recursive=True)
+
+    def storage_location(self) -> str:
+        info = self.volume_info()
+
+        if info is None:
+            raise NotFound(
+                "Volume %s not found" % repr(self)
+            )
+
+        _, _, _, parts = self.volume_parts()
+
+        base = info.storage_location.rstrip("/")  # avoid trailing slash
+        return f"{base}/{'/'.join(parts)}" if parts else base
+
+
+    def volume_info(self) -> Optional["VolumeInfo"]:
+        if self._volume_info is None and self.kind == DatabricksPathKind.VOLUME:
+            catalog, schema, volume, _ = self.volume_parts()
+
+            if catalog and schema and volume:
+                self._volume_info = get_volume_metadata(
+                    sdk=self.workspace.sdk(),
+                    full_name="%s.%s.%s" % (catalog, schema, volume)
+                )
+        return self._volume_info

     def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
         """Return (catalog, schema, volume, rel_path) for volume paths.
@@ -458,8 +538,6 @@ class DatabricksPath:
         schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
         volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None

-        # NOTE: rel is used as a true/false “has relative path” indicator in this file.
-        # The runtime value is a list[str] (not PurePosixPath). Keeping it that way to avoid behavior changes.
         return catalog, schema, volume, self.parts[3:]  # type: ignore[return-value]

     def refresh_status(self) -> "DatabricksPath":
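`storage_location()` resolves the volume's cloud storage root from `volume_info()` and appends the relative path components. The joining rule from the added lines, as a standalone sketch (the function name and the example URI are hypothetical):

    from typing import Sequence

    def storage_location_sketch(volume_storage_location: str, rel_parts: Sequence[str]) -> str:
        # Strip a trailing slash from the volume root, then append relative
        # parts only when there are any.
        base = volume_storage_location.rstrip("/")
        return f"{base}/{'/'.join(rel_parts)}" if rel_parts else base

    assert storage_location_sketch("s3://bucket/volumes/vol/", ["raw", "a.csv"]) == "s3://bucket/volumes/vol/raw/a.csv"
    assert storage_location_sketch("s3://bucket/volumes/vol", []) == "s3://bucket/volumes/vol"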
@@ -480,34 +558,20 @@ class DatabricksPath:
         full_path = self.files_full_path()
         sdk = self.workspace.sdk()

-
-
-
-
-
-
-                else None
-            )
-
-            return self.reset_metadata(is_file=True, is_dir=False, size=info.content_length, mtime=mtime)
-        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-            pass
-
-        try:
-            info = sdk.files.get_directory_metadata(full_path)
-
-            if info is None:
-                mtime = dt.datetime.now(tz=dt.timezone.utc)
-            else:
-                mtime = (
-                    dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
-                    if info.last_modified
-                    else None
-                )
+        is_file, is_dir, size, mtime = get_volume_status(
+            sdk=sdk,
+            full_path=full_path,
+            check_file_first="." in self.name,
+            raise_error=False
+        )

-
-
-
+        self.reset_metadata(
+            is_file=is_file,
+            is_dir=is_dir,
+            size=size,
+            mtime=mtime,
+            volume_info=self._volume_info
+        )

         return self

@@ -520,15 +584,18 @@ class DatabricksPath:
             is_file = not is_dir
             size = info.size
             mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
+
+            return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
-            size = 0
-            mtime = found.mtime if found is not None else None
+            pass

-
-
-
-
+        found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
+        size = None
+
+        if found is None:
+            is_file, is_dir, mtime = None, None, None
+        else:
+            is_file, is_dir, mtime = False, True, found.mtime

         return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)

@@ -540,17 +607,23 @@ class DatabricksPath:
             is_file, is_dir = not info.is_dir, info.is_dir
             size = info.file_size
             mtime = info.modification_time / 1000.0 if info.modification_time else None
+
+            return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
-            size = 0
-            mtime = found.mtime if found is not None else None
+            pass

-
-
-
-        is_file, is_dir = False, True
+        found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
+        size = None
+        mtime = found.mtime if found is not None else None

-
+        if found is None:
+            is_file, is_dir = None, None
+        else:
+            is_file, is_dir = False, True
+
+        return self.reset_metadata(
+            is_file=is_file, is_dir=is_dir, size=size, mtime=mtime
+        )

     def reset_metadata(
         self,
@@ -558,6 +631,7 @@ class DatabricksPath:
         is_dir: Optional[bool] = None,
         size: Optional[int] = None,
         mtime: Optional[float] = None,
+        volume_info: Optional["VolumeInfo"] = None
     ):
         """Update cached metadata fields.

@@ -566,6 +640,7 @@ class DatabricksPath:
             is_dir: Optional directory flag.
             size: Optional size in bytes.
             mtime: Optional modification time in seconds.
+            volume_info: volume metadata

         Returns:
             The DatabricksPath instance.
@@ -574,10 +649,13 @@ class DatabricksPath:
         self._is_dir = is_dir
         self._size = size
         self._mtime = mtime
+        self._volume_info = volume_info

         return self

     # ---- API path normalization helpers ----
+    def full_parts(self):
+        return self.parts if self.parts[-1] else self.parts[:-1]

     def workspace_full_path(self) -> str:
         """Return the full workspace path string.
@@ -585,12 +663,7 @@ class DatabricksPath:
         Returns:
             Workspace path string.
         """
-
-            return "/Workspace"
-
-        parts = self.parts if self.parts[-1] else self.parts[:-1]
-
-        return "/Workspace/%s" % "/".join(parts)
+        return "/Workspace/%s" % "/".join(self.full_parts())

     def dbfs_full_path(self) -> str:
         """Return the full DBFS path string.
@@ -598,12 +671,7 @@ class DatabricksPath:
         Returns:
             DBFS path string.
         """
-
-            return "/dbfs"
-
-        parts = self.parts if self.parts[-1] else self.parts[:-1]
-
-        return "/dbfs/%s" % "/".join(parts)
+        return "/dbfs/%s" % "/".join(self.full_parts())

     def files_full_path(self) -> str:
         """Return the full files (volume) path string.
@@ -611,12 +679,7 @@ class DatabricksPath:
         Returns:
             Volume path string.
         """
-
-            return "/Volumes"
-
-        parts = self.parts if self.parts[-1] else self.parts[:-1]
-
-        return "/Volumes/%s" % "/".join(parts)
+        return "/Volumes/%s" % "/".join(self.full_parts())

     def exists(self, *, follow_symlinks=True) -> bool:
         """Return True if the path exists.
@@ -627,7 +690,13 @@ class DatabricksPath:
         Returns:
             True if the path exists.
         """
-
+        if self.is_file():
+            return True
+
+        elif self.is_dir():
+            return True
+
+        return False

     def mkdir(self, mode=None, parents=True, exist_ok=True):
         """Create a directory for the path.
@@ -652,33 +721,46 @@ class DatabricksPath:
     def _ensure_volume(self, exist_ok: bool = True, sdk=None):
         catalog_name, schema_name, volume_name, rel = self.volume_parts()
         sdk = self.workspace.sdk() if sdk is None else sdk
+        default_tags = self.workspace.default_tags()

         if catalog_name:
             try:
-                sdk.catalogs.create(
+                sdk.catalogs.create(
+                    name=catalog_name,
+                    properties=default_tags,
+                    comment="Catalog auto generated by yggdrasil"
+                )
             except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
                 if not exist_ok:
                     raise

         if schema_name:
             try:
-                sdk.schemas.create(
+                sdk.schemas.create(
+                    catalog_name=catalog_name,
+                    name=schema_name,
+                    properties=default_tags,
+                    comment="Schema auto generated by yggdrasil"
+                )
             except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
                 if not exist_ok:
                     raise

         if volume_name:
             try:
-                sdk.volumes.create(
+                self._volume_info = sdk.volumes.create(
                     catalog_name=catalog_name,
                     schema_name=schema_name,
                     name=volume_name,
                     volume_type=VolumeType.MANAGED,
+                    comment="Volume auto generated by yggdrasil"
                 )
             except (AlreadyExists, ResourceAlreadyExists, BadRequest):
                 if not exist_ok:
                     raise

+        return self._volume_info
+
     def make_volume_dir(self, parents=True, exist_ok=True):
         path = self.files_full_path()
         sdk = self.workspace.sdk()
@@ -724,7 +806,10 @@ class DatabricksPath:

         return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())

-    def remove(
+    def remove(
+        self,
+        recursive: bool = True
+    ):
         """Remove the path as a file or directory.

         Args:
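`_ensure_volume` now applies the workspace's default tags and a comment to auto-created objects, creating catalog, then schema, then volume, and treating "already exists" as success when `exist_ok=True`. A hedged sketch of the same ordering against the Databricks SDK (assumes a configured `WorkspaceClient`; the error classes are the ones this module already imports):

    from databricks.sdk import WorkspaceClient
    from databricks.sdk.service.catalog import VolumeType
    from databricks.sdk.errors.platform import AlreadyExists, ResourceAlreadyExists

    def ensure_volume_sketch(sdk: WorkspaceClient, catalog: str, schema: str, volume: str):
        # Create each level in order and swallow "already exists" failures,
        # mirroring the exist_ok=True behaviour of the patched method.
        try:
            sdk.catalogs.create(name=catalog, comment="Catalog auto generated by yggdrasil")
        except (AlreadyExists, ResourceAlreadyExists):
            pass
        try:
            sdk.schemas.create(catalog_name=catalog, name=schema, comment="Schema auto generated by yggdrasil")
        except (AlreadyExists, ResourceAlreadyExists):
            pass
        try:
            return sdk.volumes.create(
                catalog_name=catalog,
                schema_name=schema,
                name=volume,
                volume_type=VolumeType.MANAGED,
                comment="Volume auto generated by yggdrasil",
            )
        except (AlreadyExists, ResourceAlreadyExists):
            return sdk.volumes.read(f"{catalog}.{schema}.{volume}")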
@@ -755,122 +840,182 @@ class DatabricksPath:
             return self._remove_dbfs_file()
         return self._remove_dbfs_dir(recursive=recursive)

-    def rmfile(self):
+    def rmfile(self, allow_not_found: bool = True):
         """Remove the path as a file.

         Returns:
             The DatabricksPath instance.
         """
         if self.kind == DatabricksPathKind.VOLUME:
-
+            self._remove_volume_file(allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.WORKSPACE:
-
+            self._remove_workspace_file(allow_not_found=allow_not_found)
         elif self.kind == DatabricksPathKind.DBFS:
-
+            self._remove_dbfs_file(allow_not_found=allow_not_found)

         return self

-    def _remove_volume_file(self):
+    def _remove_volume_file(self, allow_not_found: bool = True):
         sdk = self.workspace.sdk()
         try:
             sdk.files.delete(self.files_full_path())
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+            if not allow_not_found:
+                raise
         finally:
             self.reset_metadata()

         return self

-    def _remove_workspace_file(self):
+    def _remove_workspace_file(self, allow_not_found: bool = True):
         sdk = self.workspace.sdk()
         try:
             sdk.workspace.delete(self.workspace_full_path(), recursive=True)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+            if not allow_not_found:
+                raise
         finally:
             self.reset_metadata()

         return self

-    def _remove_dbfs_file(self):
+    def _remove_dbfs_file(self, allow_not_found: bool = True):
         sdk = self.workspace.sdk()
         try:
             sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+            if not allow_not_found:
+                raise
         finally:
             self.reset_metadata()

         return self

-    def rmdir(
+    def rmdir(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True,
+        with_root: bool = True
+    ):
         """Remove the path as a directory.

         Args:
             recursive: Whether to delete directories recursively.
+            allow_not_found: Allow not found location
+            with_root: Delete also dir object

         Returns:
             The DatabricksPath instance.
         """
         if self.kind == DatabricksPathKind.VOLUME:
-            return self._remove_volume_dir(
+            return self._remove_volume_dir(
+                recursive=recursive,
+                allow_not_found=allow_not_found,
+                with_root=with_root
+            )
         elif self.kind == DatabricksPathKind.WORKSPACE:
-            return self._remove_workspace_dir(
+            return self._remove_workspace_dir(
+                recursive=recursive,
+                allow_not_found=allow_not_found,
+                with_root=with_root
+            )
         elif self.kind == DatabricksPathKind.DBFS:
-            return self._remove_dbfs_dir(
+            return self._remove_dbfs_dir(
+                recursive=recursive,
+                allow_not_found=allow_not_found,
+                with_root=with_root
+            )

-    def _remove_workspace_dir(
+    def _remove_workspace_dir(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True,
+        with_root: bool = True
+    ):
         sdk = self.workspace.sdk()
+        full_path =self.workspace_full_path()
+
         try:
-            sdk.workspace.delete(
+            sdk.workspace.delete(full_path, recursive=recursive)
+
+            if not with_root:
+                sdk.workspace.mkdirs(full_path)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+            if not allow_not_found:
+                raise
         finally:
             self.reset_metadata()

         return self

-    def _remove_dbfs_dir(
+    def _remove_dbfs_dir(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True,
+        with_root: bool = True
+    ):
         sdk = self.workspace.sdk()
+        full_path = self.dbfs_full_path()
+
         try:
-            sdk.dbfs.delete(
+            sdk.dbfs.delete(full_path, recursive=recursive)
+
+            if not with_root:
+                sdk.dbfs.mkdirs(full_path)
         except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+            if not allow_not_found:
+                raise
         finally:
             self.reset_metadata()

         return self

-    def _remove_volume_dir(
-
+    def _remove_volume_dir(
+        self,
+        recursive: bool = True,
+        allow_not_found: bool = True,
+        with_root: bool = True
+    ):
+        full_path = self.files_full_path()
         catalog_name, schema_name, volume_name, rel = self.volume_parts()
         sdk = self.workspace.sdk()

         if rel:
             try:
-                sdk.files.delete_directory(
+                sdk.files.delete_directory(full_path)
             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
                 message = str(e)
+
                 if recursive and "directory is not empty" in message:
                     for child_path in self.ls():
                         child_path._remove_volume_obj(recursive=True)
-
-
-
+
+                    if with_root:
+                        sdk.files.delete_directory(full_path)
+
+                elif not allow_not_found:
+                    raise
         elif volume_name:
             try:
                 sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+                if not allow_not_found:
+                    raise
         elif schema_name:
             try:
                 sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
             except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
-
+                if not allow_not_found:
+                    raise

         return self.reset_metadata()

-    def ls(
+    def ls(
+        self,
+        recursive: bool = False,
+        fetch_size: int = None,
+        allow_not_found: bool = True
+    ):
         """List directory contents for the path.

         Args:
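`rmdir()` (and `remove()`) now take `allow_not_found` and `with_root`; with `with_root=False` the contents are deleted but the directory object itself is kept (recreated via `mkdirs` for workspace and DBFS paths). A hedged usage sketch — the import paths follow the file list above, the catalog/schema/volume names are placeholders, and a reachable workspace is assumed:

    from yggdrasil.databricks.workspaces.path import DatabricksPath
    from yggdrasil.databricks.workspaces.path_kind import DatabricksPathKind

    staging = DatabricksPath(
        kind=DatabricksPathKind.VOLUME,
        parts=["main", "default", "scratch", "staging"],
    )

    # Clear the staging directory but keep the directory itself.
    staging.rmdir(recursive=True, allow_not_found=True, with_root=False)

    # Drop it entirely, ignoring the case where it is already gone.
    staging.rmdir(recursive=True, allow_not_found=True, with_root=True)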
@@ -882,11 +1027,21 @@ class DatabricksPath:
             DatabricksPath entries.
         """
         if self.kind == DatabricksPathKind.VOLUME:
-            yield from self._ls_volume(
+            yield from self._ls_volume(
+                recursive=recursive,
+                fetch_size=fetch_size,
+                allow_not_found=allow_not_found
+            )
         elif self.kind == DatabricksPathKind.WORKSPACE:
-            yield from self._ls_workspace(
+            yield from self._ls_workspace(
+                recursive=recursive,
+                allow_not_found=allow_not_found
+            )
         elif self.kind == DatabricksPathKind.DBFS:
-            yield from self._ls_dbfs(
+            yield from self._ls_dbfs(
+                recursive=recursive,
+                allow_not_found=allow_not_found
+            )

     def _ls_volume(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
         catalog_name, schema_name, volume_name, rel = self.volume_parts()
@@ -904,6 +1059,7 @@ class DatabricksPath:
                 _is_dir=True,
                 _size=0,
             )
+
             if recursive:
                 yield from base._ls_volume(recursive=recursive)
             else:
@@ -1076,6 +1232,22 @@ class DatabricksPath:
         with self.open("wb") as f:
             f.write_all_bytes(data=data)

+    def temporary_credentials(
+        self,
+        operation: Optional["PathOperation"] = None
+    ):
+        if self.kind != DatabricksPathKind.VOLUME:
+            raise ValueError(f"Cannot generate temporary credentials for {repr(self)}")
+
+        sdk = self.workspace.sdk()
+        client = sdk.temporary_path_credentials
+        url = self.storage_location()
+
+        return client.generate_temporary_path_credentials(
+            url=url,
+            operation=operation or PathOperation.PATH_READ,
+        )
+
     # -------------------------
     # Data ops (Arrow / Pandas / Polars)
     # -------------------------
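`temporary_credentials()` is restricted to Unity Catalog volume paths: it resolves the path's cloud storage URL via `storage_location()` and asks the SDK's temporary path credentials API for short-lived credentials, defaulting to `PathOperation.PATH_READ`. A hedged usage sketch (placeholder volume names; assumes a reachable workspace):

    from databricks.sdk.service.catalog import PathOperation
    from yggdrasil.databricks.workspaces.path import DatabricksPath
    from yggdrasil.databricks.workspaces.path_kind import DatabricksPathKind

    path = DatabricksPath(
        kind=DatabricksPathKind.VOLUME,
        parts=["main", "default", "raw", "events.parquet"],
    )

    # Default operation is PATH_READ per the method above.
    creds = path.temporary_credentials()

    # Or request it explicitly.
    creds = path.temporary_credentials(operation=PathOperation.PATH_READ)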
@@ -1123,6 +1295,7 @@ class DatabricksPath:
             with self.open("rb") as f:
                 data = f.read_arrow_table(batch_size=batch_size, **kwargs)
                 return data
+
         elif self.is_dir():
             tables: list[pa.Table] = []
             for child in self.ls(recursive=True):
@@ -1139,7 +1312,7 @@ class DatabricksPath:

             try:
                 return pa.concat_tables(tables)
-            except
+            except ArrowInvalid:
                 # Fallback: concat via polars (diagonal relaxed) then back to Arrow
                 from polars import CompatLevel

@@ -1208,12 +1381,14 @@ class DatabricksPath:

                 return connected

-
-
-
-
-
-
+            else:
+                with connected.open(mode="wb", clone=False) as f:
+                    f.write_arrow_table(
+                        table,
+                        file_format=file_format,
+                        batch_size=batch_size,
+                        **kwargs
+                    )

         return self

@@ -1321,9 +1496,10 @@ class DatabricksPath:
         """
         if self.is_file():
             with self.open("rb") as f:
-
+                df = f.read_polars(batch_size=batch_size, **kwargs)
+                return df

-
+        elif self.is_dir():
             dfs = []
             for child in self.ls(recursive=True):
                 if child.is_file():
@@ -1337,11 +1513,13 @@ class DatabricksPath:
                 return polars.concat(dfs, how=how, rechunk=rechunk)
             return dfs  # type: ignore[return-value]

-
+        else:
+            raise FileNotFoundError(f"Path does not exist: {self}")

     def write_polars(
         self,
         df,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
@@ -1356,6 +1534,7 @@ class DatabricksPath:

         Args:
             df: polars DataFrame or LazyFrame to write.
+            file_format: Optional file format override.
             batch_size: Optional rows per part for directory sinks.
             **kwargs: Format-specific options.

@@ -1368,9 +1547,6 @@ class DatabricksPath:
         if isinstance(df, polars.LazyFrame):
             df = df.collect()

-        if not isinstance(df, polars.DataFrame):
-            raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")
-
         with self.connect() as connected:
             if connected.is_dir_sink():
                 seed = int(time.time() * 1000)
@@ -1380,14 +1556,23 @@ class DatabricksPath:
                 for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
                     part_path = connected / f"part-{i:05d}-{seed}-{_rand_str(4)}.parquet"

-                    part_path.
-
-
-
-
-
+                    with part_path.open(mode="wb", clone=False) as f:
+                        f.write_polars(
+                            df,
+                            file_format=file_format,
+                            batch_size=batch_size,
+                            **kwargs
+                        )
+            else:
+                with connected.open(mode="wb", clone=False) as f:
+                    f.write_polars(
+                        df,
+                        file_format=file_format,
+                        batch_size=batch_size,
+                        **kwargs
+                    )

-
+        return self

     def sql(
         self,
@@ -1415,7 +1600,7 @@ class DatabricksPath:
         if from_table not in query:
             raise ValueError(
                 "SQL query must contain %s to execute query:\n%s" % (
-                    from_table,
+                    repr(from_table),
                     query
                 )
             )
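`write_polars` now forwards an optional `file_format` and, when the target is a directory sink (for example a path whose last part is empty), splits the frame into `part-xxxxx-<seed>-<rand>.parquet` files of roughly `batch_size` rows. A hedged usage sketch (placeholder names; assumes a reachable workspace):

    import polars as pl

    from yggdrasil.databricks.workspaces.path import DatabricksPath
    from yggdrasil.databricks.workspaces.path_kind import DatabricksPathKind

    df = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # Trailing empty part marks a directory sink, so the frame is written as part files.
    target = DatabricksPath(
        kind=DatabricksPathKind.VOLUME,
        parts=["main", "default", "raw", "events", ""],
    )
    target.write_polars(df, batch_size=1_000)

    # A path with a file extension is written as a single file instead.
    single = DatabricksPath(
        kind=DatabricksPathKind.VOLUME,
        parts=["main", "default", "raw", "events.parquet"],
    )
    single.write_polars(df)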
@@ -1423,19 +1608,26 @@ class DatabricksPath:
         if engine == "duckdb":
             import duckdb

-
+            __arrow_dataset__ = self.arrow_dataset()

             return (
                 duckdb.connect()
-                .execute(
+                .execute(
+                    query=query.replace(from_table, "__arrow_dataset__")
+                )
                 .fetch_arrow_table()
             )
         elif engine == "polars":
             from polars import CompatLevel

+            table_name = "__dbpath__"
+
             return (
                 self.read_polars()
-                .sql(
+                .sql(
+                    query=query.replace(from_table, table_name),
+                    table_name=table_name
+                )
                 .to_arrow(compat_level=CompatLevel.newest())
             )
         else:
@@ -1444,23 +1636,32 @@ class DatabricksPath:
             )


-
-
-
-
-
-
-
-
-
+if databricks is not None:
+    @register_converter(DatabricksPath, pa.Table)
+    def databricks_path_to_arrow_table(
+        data: DatabricksPath,
+        options: Optional[CastOptions] = None,
+    ) -> pa.Table:
+        return cast_arrow_tabular(
+            data.read_arrow_table(),
+            options
+        )


-    @
-    def
-
-
-    ) ->
-
-
-
-    )
+    @register_converter(DatabricksPath, ds.Dataset)
+    def databricks_path_to_arrow_table(
+        data: DatabricksPath,
+        options: Optional[CastOptions] = None,
+    ) -> ds.Dataset:
+        return data.arrow_dataset()
+
+
+    @polars_converter(DatabricksPath, PolarsDataFrame)
+    def databricks_path_to_polars(
+        data: DatabricksPath,
+        options: Optional[CastOptions] = None,
+    ) -> PolarsDataFrame:
+        return cast_polars_dataframe(
+            data.read_polars(),
+            options
+        )
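For `engine="duckdb"`, the query's table token is replaced with a local variable named `__arrow_dataset__`, which DuckDB resolves through its Python replacement scan; for `engine="polars"` the frame is queried via `DataFrame.sql` under the name `__dbpath__`. A standalone sketch of the DuckDB pattern against an in-memory Arrow table (names illustrative):

    import duckdb
    import pyarrow as pa

    __arrow_dataset__ = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # DuckDB looks up __arrow_dataset__ in the calling scope when parsing the query.
    result = (
        duckdb.connect()
        .execute("SELECT value, count(*) AS n FROM __arrow_dataset__ GROUP BY value")
        .fetch_arrow_table()
    )

    assert result.num_rows == 3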