ygg 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/METADATA +1 -1
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/RECORD +23 -20
- yggdrasil/databricks/compute/cluster.py +41 -21
- yggdrasil/databricks/compute/execution_context.py +9 -10
- yggdrasil/databricks/compute/remote.py +10 -6
- yggdrasil/databricks/jobs/config.py +2 -30
- yggdrasil/databricks/sql/engine.py +4 -2
- yggdrasil/databricks/sql/statement_result.py +18 -3
- yggdrasil/databricks/sql/types.py +16 -0
- yggdrasil/databricks/workspaces/__init__.py +3 -1
- yggdrasil/databricks/workspaces/filesytem.py +161 -0
- yggdrasil/databricks/workspaces/io.py +745 -0
- yggdrasil/databricks/workspaces/path.py +1120 -0
- yggdrasil/databricks/workspaces/path_kind.py +10 -0
- yggdrasil/databricks/workspaces/workspace.py +146 -562
- yggdrasil/pyutils/callable_serde.py +1 -0
- yggdrasil/pyutils/modules.py +1 -1
- yggdrasil/pyutils/python_env.py +81 -264
- yggdrasil/types/cast/arrow_cast.py +9 -0
- yggdrasil/databricks/workspaces/databricks_path.py +0 -875
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/WHEEL +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1120 @@
# src/yggdrasil/databricks/workspaces/databricks_path.py
from __future__ import annotations

import dataclasses
import datetime as dt
import random
import string
import time
from pathlib import PurePosixPath
from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable

import pyarrow as pa
from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
from pyarrow.fs import FileInfo, FileType, FileSystem
import pyarrow.dataset as ds

from .io import DatabricksIO
from .path_kind import DatabricksPathKind
from ...libs.databrickslib import databricks
from ...types import cast_arrow_tabular, cast_polars_dataframe
from ...types.cast.cast_options import CastOptions
from ...types.cast.polars_cast import polars_converter
from ...types.cast.polars_pandas_cast import PolarsDataFrame
from ...types.cast.registry import convert, register_converter

if databricks is not None:
    from databricks.sdk.service.catalog import VolumeType
    from databricks.sdk.service.workspace import ObjectType
    from databricks.sdk.errors.platform import (
        NotFound,
        ResourceDoesNotExist,
        BadRequest,
        PermissionDenied,
        AlreadyExists,
        ResourceAlreadyExists,
    )

    NOT_FOUND_ERRORS = NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied
    ALREADY_EXISTS_ERRORS = AlreadyExists, ResourceAlreadyExists, BadRequest

if TYPE_CHECKING:
    from .workspace import Workspace


__all__ = [
    "DatabricksPathKind",
    "DatabricksPath",
]


def _flatten_parts(parts: Union[list[str], str]) -> list[str]:
    if isinstance(parts, str):
        parts = [parts]

    if any("/" in part for part in parts):
        new_parts: list[str] = []

        for part in parts:
            new_parts.extend(_ for _ in part.split("/") if _)

        parts = new_parts

    return parts


def _rand_str(n: int) -> str:
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choices(alphabet, k=n))


@dataclasses.dataclass
class DatabricksPath:
    kind: DatabricksPathKind
    parts: List[str]

    _workspace: Optional["Workspace"] = None

    _is_file: Optional[bool] = None
    _is_dir: Optional[bool] = None
    _size: Optional[int] = None
    _mtime: Optional[float] = None

    def clone_instance(
        self,
        *,
        kind: Optional["DatabricksPathKind"] = None,
        parts: Optional[List[str]] = None,
        workspace: Optional["Workspace"] = dataclasses.MISSING,
        is_file: Optional[bool] = dataclasses.MISSING,
        is_dir: Optional[bool] = dataclasses.MISSING,
        size: Optional[int] = dataclasses.MISSING,
        mtime: Optional[float] = dataclasses.MISSING,
    ) -> "DatabricksPath":
        """
        Return a copy of this DatabricksPath, optionally overriding fields.
        Uses dataclasses.replace semantics but lets you intentionally override
        cached metadata (or keep it as-is by default).
        """
        return dataclasses.replace(
            self,
            kind=self.kind if kind is None else kind,
            parts=list(self.parts) if parts is None else list(parts),
            _workspace=self._workspace if workspace is dataclasses.MISSING else workspace,
            _is_file=self._is_file if is_file is dataclasses.MISSING else is_file,
            _is_dir=self._is_dir if is_dir is dataclasses.MISSING else is_dir,
            _size=self._size if size is dataclasses.MISSING else size,
            _mtime=self._mtime if mtime is dataclasses.MISSING else mtime,
        )

    @classmethod
    def parse(
        cls,
        obj: Union["DatabricksPath", str, List[str]],
        workspace: Optional["Workspace"] = None,
    ) -> "DatabricksPath":
        if not obj:
            return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)

        if not isinstance(obj, (str, list)):
            if isinstance(obj, DatabricksPath):
                if workspace is not None and obj._workspace is None:
                    obj._workspace = workspace
                return obj

            from .io import DatabricksIO

            if isinstance(obj, DatabricksIO):
                return obj.path

            if not isinstance(obj, Iterable):
                obj = str(obj)

        obj = _flatten_parts(obj)

        if obj and not obj[0]:
            obj = obj[1:]

        if not obj:
            return DatabricksPath(kind=DatabricksPathKind.DBFS, parts=[], _workspace=workspace)

        head, *tail = obj
        head = head.casefold()

        if head == "dbfs":
            kind = DatabricksPathKind.DBFS
        elif head == "workspace":
            kind = DatabricksPathKind.WORKSPACE
        elif head == "volumes":
            kind = DatabricksPathKind.VOLUME
        else:
            raise ValueError(f"Invalid DatabricksPath head {head!r} from {obj!r}, must be in ['dbfs', 'workspace', 'volumes']")

        return DatabricksPath(kind=kind, parts=tail, _workspace=workspace)

    def __hash__(self):
        return hash(self.full_path())

    def __eq__(self, other):
        if not isinstance(other, DatabricksPath):
            if isinstance(other, str):
                return str(self) == other
            return False
        return self.kind == other.kind and self.parts == other.parts

    def __truediv__(self, other):
        if not other:
            return self

        other_parts = _flatten_parts(other)

        return DatabricksPath(
            kind=self.kind,
            parts=self.parts + other_parts,
            _workspace=self._workspace,
        )

    def __enter__(self):
        return self.connect(clone=False)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._workspace is not None:
            self._workspace.__exit__(exc_type, exc_val, exc_tb)

    def __str__(self):
        return self.full_path()

    def __repr__(self):
        return self.url()

    def __fspath__(self):
        return self.full_path()

    def url(self):
        return "dbfs://%s" % self.full_path()

    def full_path(self) -> str:
        if self.kind == DatabricksPathKind.DBFS:
            return self.dbfs_full_path()
        elif self.kind == DatabricksPathKind.WORKSPACE:
            return self.workspace_full_path()
        elif self.kind == DatabricksPathKind.VOLUME:
            return self.files_full_path()
        else:
            raise ValueError(f"Unknown DatabricksPath kind: {self.kind!r}")

    def filesystem(self, workspace: Optional["Workspace"] = None):
        return self.workspace.filesytem(workspace=workspace)

    @property
    def parent(self):
        if not self.parts:
            return self

        if self._is_file is not None or self._is_dir is not None:
            _is_file, _is_dir = False, True
        else:
            _is_file, _is_dir = None, None

        return DatabricksPath(
            kind=self.kind,
            parts=self.parts[:-1],
            _workspace=self._workspace,
            _is_file=_is_file,
            _is_dir=_is_dir,
        )

    @property
    def workspace(self):
        if self._workspace is None:
            from .workspace import Workspace

            return Workspace()
        return self._workspace

    @workspace.setter
    def workspace(self, value):
        self._workspace = value

    @property
    def name(self) -> str:
        if not self.parts:
            return ""

        if len(self.parts) == 1:
            return self.parts[-1]

        return self.parts[-1] if self.parts[-1] else self.parts[-2]

    @property
    def extension(self) -> str:
        name = self.name
        if "." in name:
            return name.split(".")[-1]
        return ""

    @property
    def file_format(self) -> FileFormat:
        ext = self.extension

        if ext == "parquet":
            return ParquetFileFormat()
        elif ext == "csv":
            return CsvFileFormat()
        elif ext == "json":
            return JsonFileFormat()
        else:
            raise ValueError(
                "Cannot get file format from extension %s" % ext
            )

    @property
    def content_length(self):
        if self._size is None:
            self.refresh_status()
        return self._size

    @content_length.setter
    def content_length(self, value: int):
        self._size = value

    @property
    def mtime(self) -> Optional[float]:
        if self._mtime is None:
            self.refresh_status()
        return self._mtime

    @mtime.setter
    def mtime(self, value: float):
        if not isinstance(value, float):
            if isinstance(value, dt.datetime):
                value = value.timestamp()
            elif isinstance(value, str):
                value = dt.datetime.fromisoformat(value).timestamp()
            else:
                value = float(value)
        self._mtime = value

    @property
    def file_type(self):
        if self.is_file():
            return FileType.File
        elif self.is_dir():
            return FileType.Directory
        else:
            return FileType.NotFound

    @property
    def file_info(self):
        return FileInfo(
            path=self.full_path(),
            type=self.file_type,
            mtime=self.mtime,
            size=self.content_length,
        )

    def is_file(self):
        if self._is_file is None:
            self.refresh_status()
        return self._is_file

    def is_dir(self):
        if self._is_dir is None:
            self.refresh_status()
        return self._is_dir

    def is_dir_sink(self):
        return self.is_dir() or (self.parts and self.parts[-1] == "")

    @property
    def connected(self) -> bool:
        return self._workspace is not None and self._workspace.connected

    def connect(self, clone: bool = False) -> "DatabricksPath":
        workspace = self.workspace.connect(clone=clone)

        if clone:
            return self.clone_instance(
                workspace=workspace
            )

        self._workspace = workspace

        return self

    def close(self):
        pass

    def volume_parts(self) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[PurePosixPath]]:
        if self.kind != DatabricksPathKind.VOLUME:
            return None, None, None, None

        catalog = self.parts[0] if len(self.parts) > 0 and self.parts[0] else None
        schema = self.parts[1] if len(self.parts) > 1 and self.parts[1] else None
        volume = self.parts[2] if len(self.parts) > 2 and self.parts[2] else None

        # NOTE: rel is used as a true/false “has relative path” indicator in this file.
        # The runtime value is a list[str] (not PurePosixPath). Keeping it that way to avoid behavior changes.
        return catalog, schema, volume, self.parts[3:]  # type: ignore[return-value]

    def refresh_status(self) -> "DatabricksPath":
        if self.kind == DatabricksPathKind.VOLUME:
            self._refresh_volume_status()
        elif self.kind == DatabricksPathKind.WORKSPACE:
            self._refresh_workspace_status()
        elif self.kind == DatabricksPathKind.DBFS:
            self._refresh_dbfs_status()
        return self

    def _refresh_volume_status(self):
        full_path = self.files_full_path()
        sdk = self.workspace.sdk()

        try:
            info = sdk.files.get_metadata(full_path)

            mtime = (
                dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
                if info.last_modified
                else None
            )

            return self.reset_metadata(is_file=True, is_dir=False, size=info.content_length, mtime=mtime)
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass

        try:
            info = sdk.files.get_directory_metadata(full_path)
            mtime = (
                dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
                if info.last_modified
                else None
            )

            return self.reset_metadata(is_file=False, is_dir=True, size=info, mtime=mtime)
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass

        return self

    def _refresh_workspace_status(self):
        sdk = self.workspace.sdk()

        try:
            info = sdk.workspace.get_status(self.workspace_full_path())
            is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
            is_file = not is_dir
            size = info.size
            mtime = float(info.modified_at) / 1000.0 if info.modified_at is not None else None
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
            size = 0
            mtime = found.mtime if found is not None else None

            if found is None:
                is_file, is_dir = None, None
            else:
                is_file, is_dir = False, True

        return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)

    def _refresh_dbfs_status(self):
        sdk = self.workspace.sdk()

        try:
            info = sdk.dbfs.get_status(self.dbfs_full_path())
            is_file, is_dir = not info.is_dir, info.is_dir
            size = info.file_size
            mtime = info.modification_time / 1000.0 if info.modification_time else None
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            found = next(self.ls(fetch_size=1, recursive=False, allow_not_found=True), None)
            size = 0
            mtime = found.mtime if found is not None else None

            if found is None:
                is_file, is_dir = None, None
            else:
                is_file, is_dir = False, True

        return self.reset_metadata(is_file=is_file, is_dir=is_dir, size=size, mtime=mtime)

    def reset_metadata(
        self,
        is_file: Optional[bool] = None,
        is_dir: Optional[bool] = None,
        size: Optional[int] = None,
        mtime: Optional[float] = None,
    ):
        self._is_file = is_file
        self._is_dir = is_dir
        self._size = size
        self._mtime = mtime

        return self

    # ---- API path normalization helpers ----

    def workspace_full_path(self) -> str:
        if not self.parts:
            return "/Workspace"

        parts = self.parts if self.parts[-1] else self.parts[:-1]

        return "/Workspace/%s" % "/".join(parts)

    def dbfs_full_path(self) -> str:
        if not self.parts:
            return "/dbfs"

        parts = self.parts if self.parts[-1] else self.parts[:-1]

        return "/dbfs/%s" % "/".join(parts)

    def files_full_path(self) -> str:
        if not self.parts:
            return "/Volumes"

        parts = self.parts if self.parts[-1] else self.parts[:-1]

        return "/Volumes/%s" % "/".join(parts)

    def exists(self, *, follow_symlinks=True) -> bool:
        return bool(self.is_file() or self.is_dir())

    def mkdir(self, mode=None, parents=True, exist_ok=True):
        try:
            if self.kind == DatabricksPathKind.WORKSPACE:
                self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
            elif self.kind == DatabricksPathKind.VOLUME:
                self.make_volume_dir(parents=parents, exist_ok=exist_ok)
            elif self.kind == DatabricksPathKind.DBFS:
                self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
        except (NotFound, ResourceDoesNotExist):
            if not parents or self.parent == self:
                raise

            self.parent.mkdir(parents=True, exist_ok=True)
            self.mkdir(parents=False, exist_ok=exist_ok)
        except (AlreadyExists, ResourceAlreadyExists):
            if not exist_ok:
                raise

        return self

    def _ensure_volume(self, exist_ok: bool = True, sdk=None):
        catalog_name, schema_name, volume_name, rel = self.volume_parts()
        sdk = self.workspace.sdk() if sdk is None else sdk

        if catalog_name:
            try:
                sdk.catalogs.create(name=catalog_name)
            except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
                if not exist_ok:
                    raise

        if schema_name:
            try:
                sdk.schemas.create(catalog_name=catalog_name, name=schema_name)
            except (AlreadyExists, ResourceAlreadyExists, PermissionDenied, BadRequest):
                if not exist_ok:
                    raise

        if volume_name:
            try:
                sdk.volumes.create(
                    catalog_name=catalog_name,
                    schema_name=schema_name,
                    name=volume_name,
                    volume_type=VolumeType.MANAGED,
                )
            except (AlreadyExists, ResourceAlreadyExists, BadRequest):
                if not exist_ok:
                    raise

    def make_volume_dir(self, parents=True, exist_ok=True):
        path = self.files_full_path()
        sdk = self.workspace.sdk()

        try:
            sdk.files.create_directory(path)
        except (BadRequest, NotFound, ResourceDoesNotExist) as e:
            if not parents:
                raise

            message = str(e)
            if "not exist" in message:
                self._ensure_volume(sdk=sdk)

            sdk.files.create_directory(path)
        except (AlreadyExists, ResourceAlreadyExists, BadRequest):
            if not exist_ok:
                raise

        return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())

    def make_workspace_dir(self, parents=True, exist_ok=True):
        path = self.workspace_full_path()
        sdk = self.workspace.sdk()

        try:
            sdk.workspace.mkdirs(path)
        except (AlreadyExists, ResourceAlreadyExists, BadRequest):
            if not exist_ok:
                raise

        return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())

    def make_dbfs_dir(self, parents=True, exist_ok=True):
        path = self.dbfs_full_path()
        sdk = self.workspace.sdk()

        try:
            sdk.dbfs.mkdirs(path)
        except (AlreadyExists, ResourceAlreadyExists, BadRequest):
            if not exist_ok:
                raise

        return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=time.time())

    def remove(self, recursive: bool = True):
        if self.kind == DatabricksPathKind.VOLUME:
            return self._remove_volume_obj(recursive=recursive)
        elif self.kind == DatabricksPathKind.WORKSPACE:
            return self._remove_workspace_obj(recursive=recursive)
        elif self.kind == DatabricksPathKind.DBFS:
            return self._remove_dbfs_obj(recursive=recursive)

    def _remove_volume_obj(self, recursive: bool = True):
        if self.is_file():
            return self._remove_volume_file()
        return self._remove_volume_dir(recursive=recursive)

    def _remove_workspace_obj(self, recursive: bool = True):
        if self.is_file():
            return self._remove_workspace_file()
        return self._remove_workspace_dir(recursive=recursive)

    def _remove_dbfs_obj(self, recursive: bool = True):
        if self.is_file():
            return self._remove_dbfs_file()
        return self._remove_dbfs_dir(recursive=recursive)

    def rmfile(self):
        try:
            if self.kind == DatabricksPathKind.VOLUME:
                return self._remove_volume_file()
            elif self.kind == DatabricksPathKind.WORKSPACE:
                return self._remove_workspace_file()
            elif self.kind == DatabricksPathKind.DBFS:
                return self._remove_dbfs_file()
        finally:
            self.reset_metadata()
        return self

    def _remove_volume_file(self):
        sdk = self.workspace.sdk()
        try:
            sdk.files.delete(self.files_full_path())
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass
        return self

    def _remove_workspace_file(self):
        sdk = self.workspace.sdk()
        try:
            sdk.workspace.delete(self.workspace_full_path(), recursive=True)
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass
        return self

    def _remove_dbfs_file(self):
        sdk = self.workspace.sdk()
        try:
            sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass
        return self

    def rmdir(self, recursive: bool = True):
        if self.kind == DatabricksPathKind.VOLUME:
            return self._remove_volume_dir(recursive=recursive)
        elif self.kind == DatabricksPathKind.WORKSPACE:
            return self._remove_workspace_dir(recursive=recursive)
        elif self.kind == DatabricksPathKind.DBFS:
            return self._remove_dbfs_dir(recursive=recursive)

    def _remove_workspace_dir(self, recursive: bool = True):
        sdk = self.workspace.sdk()
        try:
            sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass
        self.reset_metadata()
        return self

    def _remove_dbfs_dir(self, recursive: bool = True):
        sdk = self.workspace.sdk()
        try:
            sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            pass
        self.reset_metadata()
        return self

    def _remove_volume_dir(self, recursive: bool = True):
        root_path = self.files_full_path()
        catalog_name, schema_name, volume_name, rel = self.volume_parts()
        sdk = self.workspace.sdk()

        if rel:
            try:
                sdk.files.delete_directory(root_path)
            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied) as e:
                message = str(e)
                if recursive and "directory is not empty" in message:
                    for child_path in self.ls():
                        child_path._remove_volume_obj(recursive=True)
                    sdk.files.delete_directory(root_path)
                else:
                    pass
        elif volume_name:
            try:
                sdk.volumes.delete(f"{catalog_name}.{schema_name}.{volume_name}")
            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
                pass
        elif schema_name:
            try:
                sdk.schemas.delete(f"{catalog_name}.{schema_name}", force=True)
            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
                pass

        return self.reset_metadata()

    def ls(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
        if self.kind == DatabricksPathKind.VOLUME:
            yield from self._ls_volume(recursive=recursive, fetch_size=fetch_size, allow_not_found=allow_not_found)
        elif self.kind == DatabricksPathKind.WORKSPACE:
            yield from self._ls_workspace(recursive=recursive, allow_not_found=allow_not_found)
        elif self.kind == DatabricksPathKind.DBFS:
            yield from self._ls_dbfs(recursive=recursive, allow_not_found=allow_not_found)

    def _ls_volume(self, recursive: bool = False, fetch_size: int = None, allow_not_found: bool = True):
        catalog_name, schema_name, volume_name, rel = self.volume_parts()
        sdk = self.workspace.sdk()

        if rel is None:
            if volume_name is None:
                try:
                    for info in sdk.volumes.list(catalog_name=catalog_name, schema_name=schema_name):
                        base = DatabricksPath(
                            kind=DatabricksPathKind.VOLUME,
                            parts=[info.catalog_name, info.schema_name, info.name],
                            _workspace=self.workspace,
                            _is_file=False,
                            _is_dir=True,
                            _size=0,
                        )
                        if recursive:
                            yield from base._ls_volume(recursive=recursive)
                        else:
                            yield base
                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
                    if not allow_not_found:
                        raise
            elif schema_name is None:
                try:
                    for info in sdk.schemas.list(catalog_name=catalog_name):
                        base = DatabricksPath(
                            kind=DatabricksPathKind.VOLUME,
                            parts=[info.catalog_name, info.name],
                            _workspace=self.workspace,
                            _is_file=False,
                            _is_dir=True,
                            _size=0,
                        )
                        if recursive:
                            yield from base._ls_volume(recursive=recursive)
                        else:
                            yield base
                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
                    if not allow_not_found:
                        raise
            else:
                try:
                    for info in sdk.catalogs.list():
                        base = DatabricksPath(
                            kind=DatabricksPathKind.VOLUME,
                            parts=[info.name],
                            _workspace=self.workspace,
                            _is_file=False,
                            _is_dir=True,
                            _size=0,
                        )
                        if recursive:
                            yield from base._ls_volume(recursive=recursive)
                        else:
                            yield base
                except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
                    if not allow_not_found:
                        raise
        else:
            full_path = self.files_full_path()

            try:
                for info in sdk.files.list_directory_contents(full_path, page_size=fetch_size):
                    base = DatabricksPath(
                        kind=DatabricksPathKind.VOLUME,
                        parts=info.path.split("/")[2:],
                        _workspace=self.workspace,
                        _is_file=not info.is_directory,
                        _is_dir=info.is_directory,
                        _size=info.file_size,
                    )

                    if recursive and info.is_directory:
                        yield from base._ls_volume(recursive=recursive)
                    else:
                        yield base
            except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
                if not allow_not_found:
                    raise

    def _ls_workspace(self, recursive: bool = True, allow_not_found: bool = True):
        sdk = self.workspace.sdk()
        full_path = self.workspace_full_path()

        try:
            for info in sdk.workspace.list(full_path, recursive=recursive):
                is_dir = info.object_type in (ObjectType.DIRECTORY, ObjectType.REPO)
                yield DatabricksPath(
                    kind=DatabricksPathKind.WORKSPACE,
                    parts=info.path.split("/")[2:],
                    _workspace=self.workspace,
                    _is_file=not is_dir,
                    _is_dir=is_dir,
                    _size=info.size,
                )
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            if not allow_not_found:
                raise

    def _ls_dbfs(self, recursive: bool = True, allow_not_found: bool = True):
        sdk = self.workspace.sdk()
        full_path = self.dbfs_full_path()

        try:
            for info in sdk.dbfs.list(full_path, recursive=recursive):
                yield DatabricksPath(
                    kind=DatabricksPathKind.DBFS,
                    parts=info.path.split("/")[2:],
                    _workspace=self.workspace,
                    _is_file=not info.is_dir,
                    _is_dir=info.is_dir,
                    _size=info.file_size,
                )
        except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
            if not allow_not_found:
                raise

    def open(
        self,
        mode="rb",
        encoding=None,
        clone: bool = False,
    ) -> DatabricksIO:
        path = self.connect(clone=clone)

        return (
            DatabricksIO
            .create_instance(path=path, mode=mode, encoding=encoding)
            .connect(clone=False)
        )

    def copy_to(
        self,
        dest: Union["DatabricksIO", "DatabricksPath", str],
        allow_not_found: bool = True,
    ) -> None:
        if self.is_file() and dest.is_file():
            with self.open(mode="rb") as src:
                src.copy_to(dest=dest)

        elif self.is_dir():
            dest_base = self.parse(obj=dest, workspace=self.workspace if dest._workspace is None else dest._workspace)
            dest_base.mkdir(parents=True, exist_ok=True)

            skip_base_parts = len(self.parts)

            for src_child in self.ls(recursive=True, allow_not_found=True):
                src_child: DatabricksPath = src_child
                dest_child_parts = dest_base.parts + src_child.parts[skip_base_parts:]

                src_child.copy_to(
                    dest=dest.clone_instance(parts=dest_child_parts),
                    allow_not_found=allow_not_found
                )

        elif not allow_not_found:
            return None

        else:
            raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")

    # -------------------------
    # Data ops (Arrow / Pandas / Polars)
    # -------------------------
    def arrow_dataset(
        self,
        workspace: Optional["Workspace"] = None,
        filesystem: Optional[FileSystem] = None,
        **kwargs
    ):
        filesystem = self.filesystem(workspace=workspace) if filesystem is None else filesystem

        return ds.dataset(
            source=self.full_path(),
            filesystem=filesystem,
            **kwargs
        )

    def read_arrow_table(
        self,
        batch_size: Optional[int] = None,
        concat: bool = True,
        **kwargs
    ) -> pa.Table:
        if self.is_file():
            with self.open("rb") as f:
                return f.read_arrow_table(batch_size=batch_size, **kwargs)

        if self.is_dir():
            tables: list[pa.Table] = []
            for child in self.ls(recursive=True):
                if child.is_file():
                    with child.open("rb") as f:
                        tables.append(f.read_arrow_table(batch_size=batch_size, **kwargs))

            if not tables:
                return pa.Table.from_batches([], schema=pa.schema([]))

            if not concat:
                # type: ignore[return-value]
                return tables  # caller asked for raw list

            try:
                return pa.concat_tables(tables)
            except Exception:
                # Fallback: concat via polars (diagonal relaxed) then back to Arrow
                from polars import CompatLevel

                return self.read_polars(
                    batch_size=batch_size,
                    how="diagonal_relaxed",
                    rechunk=True,
                    concat=True,
                    **kwargs,
                ).to_arrow(compat_level=CompatLevel.newest())

        raise FileNotFoundError(f"Path does not exist: {self}")

    def write_arrow(
        self,
        table: Union[pa.Table, pa.RecordBatch],
        batch_size: Optional[int] = None,
        **kwargs
    ):
        if not isinstance(table, pa.Table):
            table = convert(table, pa.Table)

        return self.write_arrow_table(
            table=table,
            batch_size=batch_size,
            **kwargs
        )

    def write_arrow_table(
        self,
        table: pa.Table,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        with self.connect(clone=False) as connected:
            if connected.is_dir_sink():
                seed = int(time.time() * 1000)

                for i, batch in enumerate(table.to_batches(max_chunksize=batch_size)):
                    part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"

                    with part_path.open(mode="wb") as f:
                        f.write_arrow_batch(batch)

                return connected

            connected.open(mode="wb", clone=False).write_arrow_table(
                table,
                batch_size=batch_size,
                **kwargs
            )

        return self

    def read_pandas(
        self,
        batch_size: int = 0,
        concat: bool = True,
        **kwargs
    ):
        if concat:
            return self.read_arrow_table(batch_size=batch_size, concat=True, **kwargs).to_pandas()

        tables = self.read_arrow_table(batch_size=batch_size, concat=False, **kwargs)
        return [t.to_pandas() for t in tables]  # type: ignore[arg-type]

    def write_pandas(
        self,
        df,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        return self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)

    def read_polars(
        self,
        batch_size: Optional[int] = None,
        how: str = "diagonal_relaxed",
        rechunk: bool = False,
        concat: bool = True,
        **kwargs
    ):
        import polars as pl

        if self.is_file():
            with self.open("rb") as f:
                return f.read_polars(batch_size=batch_size, **kwargs)

        if self.is_dir():
            dfs = []
            for child in self.ls(recursive=True):
                if child.is_file():
                    with child.open("rb") as f:
                        dfs.append(f.read_polars(batch_size=batch_size, **kwargs))

            if not dfs:
                return pl.DataFrame()

            if concat:
                return pl.concat(dfs, how=how, rechunk=rechunk)
            return dfs  # type: ignore[return-value]

        raise FileNotFoundError(f"Path does not exist: {self}")

    def write_polars(
        self,
        df,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """
        Write Polars to a DatabricksPath.

        Behavior:
        - If path is a directory (or ends with a trailing "/"): shard to parquet parts.
          `batch_size` = rows per part (default 1_000_000).
        - If path is a file: write using DatabricksIO.write_polars which is extension-driven
          (parquet/csv/ipc/json/ndjson etc.).

        Notes:
        - If `df` is a LazyFrame, we collect it first (optionally streaming).
        """
        import polars as pl

        if isinstance(df, pl.LazyFrame):
            df = df.collect()

        if not isinstance(df, pl.DataFrame):
            raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")

        with self.connect() as connected:
            if connected.is_dir_sink():
                seed = int(time.time() * 1000)
                rows_per_part = batch_size or 1_000_000

                # Always parquet for directory sinks (lake layout standard)
                for i, chunk in enumerate(df.iter_slices(n_rows=rows_per_part)):
                    part_path = connected / f"part-{i:05d}-{seed}-{_rand_str(4)}.parquet"

                    part_path.write_polars(chunk, **kwargs)

                return connected

            # Single file write: format/extension is handled in DatabricksIO.write_polars
            connected.write_polars(df, **kwargs)

        return connected

    def sql(
        self,
        query: str,
        engine: str = "auto"
    ):
        if engine == "auto":
            try:
                import duckdb
                engine = "duckdb"
            except ImportError:
                engine = "polars"

        from_table = "dbfs.`%s`" % self.full_path()

        if from_table not in query:
            raise ValueError(
                "SQL query must contain %s to execute query:\n%s" % (
                    from_table,
                    query
                )
            )

        if engine == "duckdb":
            import duckdb

            __arrow_table__ = self.read_arrow_table()

            return (
                duckdb.connect()
                .execute(query=query.replace(from_table, "__arrow_table__"))
                .fetch_arrow_table()
            )
        elif engine == "polars":
            from polars import CompatLevel

            return (
                self.read_polars()
                .sql(query=query.replace(from_table, "self"))
                .to_arrow(compat_level=CompatLevel.newest())
            )
        else:
            raise ValueError(
                "Invalid engine %s, must be in duckdb, polars" % engine
            )

@register_converter(DatabricksPath, pa.Table)
def databricks_path_to_arrow_table(
    data: DatabricksPath,
    options: Optional[CastOptions] = None,
) -> pa.Table:
    return cast_arrow_tabular(
        data.read_arrow_table(),
        options
    )


@polars_converter(DatabricksPath, PolarsDataFrame)
def databricks_path_to_polars(
    data: DatabricksPath,
    options: Optional[CastOptions] = None,
) -> PolarsDataFrame:
    return cast_polars_dataframe(
        data.read_polars(),
        options
    )