ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +31 -34
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +148 -1
- yggdrasil/databricks/sql/types.py +49 -1
- yggdrasil/databricks/workspaces/__init__.py +4 -1
- yggdrasil/databricks/workspaces/filesytem.py +344 -0
- yggdrasil/databricks/workspaces/io.py +1123 -0
- yggdrasil/databricks/workspaces/path.py +1415 -0
- yggdrasil/databricks/workspaces/path_kind.py +13 -0
- yggdrasil/databricks/workspaces/workspace.py +298 -154
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +131 -0
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.30.dist-info/RECORD +0 -56
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""PyArrow filesystem wrappers for Databricks paths."""
|
|
2
|
+
|
|
3
|
+
__all__ = [
|
|
4
|
+
"DatabricksFileSystem",
|
|
5
|
+
"DatabricksFileSystemHandler"
|
|
6
|
+
]
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Union, List, Optional
|
|
9
|
+
|
|
10
|
+
from pyarrow import PythonFile
|
|
11
|
+
from pyarrow.fs import FileSystem, FileInfo, FileSelector, PyFileSystem, FileSystemHandler
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from ..workspaces.workspace import Workspace
|
|
15
|
+
from .path import DatabricksPath
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DatabricksFileSystemHandler(FileSystemHandler):
    """PyArrow FileSystemHandler backed by Databricks paths.

    Bridges PyArrow's filesystem protocol onto ``DatabricksPath`` objects,
    delegating every operation to the bound ``Workspace``.
    """

    def __init__(
        self,
        workspace: "Workspace",
    ):
        """Create a handler bound to a Workspace.

        Args:
            workspace: Workspace instance to use.
        """
        super().__init__()
        self.workspace = workspace

    def __enter__(self):
        """Enter a context manager and connect to the workspace.

        Returns:
            A connected DatabricksFileSystemHandler clone; the original
            handler is left untouched.
        """
        return self.connect(clone=True)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context manager and close the workspace.

        Args:
            exc_type: Exception type, if raised.
            exc_val: Exception value, if raised.
            exc_tb: Exception traceback, if raised.
        """
        # Returns None, so any in-flight exception propagates.
        self.workspace.__exit__(exc_type, exc_val, exc_tb)

    def _parse_path(self, obj: Any) -> "DatabricksPath":
        """Parse a path-like object into a DatabricksPath.

        Args:
            obj: Path-like object to parse.

        Returns:
            A DatabricksPath bound to this handler's workspace.
        """
        # Imported lazily to avoid a circular import with .path.
        from .path import DatabricksPath

        return DatabricksPath.parse(obj, workspace=self.workspace)

    def connect(self, clone: bool = True):
        """Connect the workspace and optionally return a cloned handler.

        Args:
            clone: When True, return a new handler bound to the connected
                workspace; when False, rebind this handler in place.

        Returns:
            A connected handler (a clone when ``clone`` is True, otherwise
            ``self``).
        """
        # BUG FIX: the previous implementation called ``self.connect(clone=clone)``
        # here, recursing into itself unconditionally (RecursionError on first
        # use, including via ``__enter__``). Mirroring ``close()``, which
        # delegates to the workspace, the intent is to connect the underlying
        # workspace. Assumes Workspace.connect(clone=...) mirrors this
        # signature — TODO confirm against workspace.py.
        workspace = self.workspace.connect(clone=clone)

        if clone:
            return DatabricksFileSystemHandler(
                workspace=workspace
            )

        self.workspace = workspace
        return self

    def close(self):
        """Close the underlying workspace client.

        Returns:
            None.
        """
        self.workspace.close()

    def copy_file(self, src, dest, *, chunk_size: int = 4 * 1024 * 1024):
        """Copy a file between Databricks paths.

        Args:
            src: Source path.
            dest: Destination path.
            chunk_size: Chunk size in bytes (default 4 MiB) used to stream
                the copy without loading the whole file in memory.
        """
        src = self._parse_path(src)
        dest = self._parse_path(dest)

        # Stream chunk-by-chunk; ``read`` returning a falsy value signals EOF.
        with src.open("rb") as r, dest.open("wb") as w:
            while True:
                chunk = r.read(chunk_size)
                if not chunk:
                    break
                w.write(chunk)

    def create_dir(self, path, *args, recursive: bool = True, **kwargs):
        """Create a directory at the given path.

        Args:
            path: Directory path to create.
            recursive: Whether to create missing parent directories.

        Returns:
            The created DatabricksPath instance.
        """
        return self._parse_path(path).mkdir(parents=recursive)

    def delete_dir(self, path):
        """Delete a directory recursively.

        Args:
            path: Directory path to delete.
        """
        return self._parse_path(path).rmdir(recursive=True)

    def delete_dir_contents(self, path, *args, accept_root_dir: bool = False, **kwargs):
        """Delete the contents of a directory.

        Args:
            path: Directory path whose contents should be removed.
            accept_root_dir: Whether to allow deleting root contents.
        """
        # NOTE(review): this removes the directory itself (identical to
        # ``delete_dir``), not just its contents, and ``accept_root_dir`` is
        # never checked — PyArrow's contract expects the directory to survive
        # and root deletion to be guarded. TODO confirm intended semantics.
        return self._parse_path(path).rmdir(recursive=True)

    def delete_root_dir_contents(self):
        """Delete the contents of the root directory."""
        return self.delete_dir_contents("/", accept_root_dir=True)

    def delete_file(self, path):
        """Delete a single file.

        Args:
            path: File path to delete.
        """
        return self._parse_path(path).rmfile()

    def equals(self, other: FileSystem):
        """Return True if the filesystem handler matches another.

        Args:
            other: Another FileSystem instance.

        Returns:
            True if equal, otherwise False.
        """
        return self == other

    def from_uri(self, uri):
        """Return a handler for the workspace in the provided URI.

        Args:
            uri: URI or path to parse.

        Returns:
            A DatabricksFileSystemHandler for the URI's workspace.
        """
        uri = self._parse_path(uri)

        return self.__class__(
            workspace=uri.workspace
        )

    def get_file_info(
        self,
        paths_or_selector: Union[FileSelector, str, "DatabricksPath", List[Union[str, "DatabricksPath"]]]
    ) -> Union[FileInfo, List[FileInfo]]:
        """Return FileInfo objects for paths or selectors.

        Args:
            paths_or_selector: Path(s) or a FileSelector.

        Returns:
            A single FileInfo for a single path, or a list of FileInfo
            objects for a selector or a list of paths.
        """
        from .path import DatabricksPath

        # Single path: resolve directly.
        if isinstance(paths_or_selector, (str, DatabricksPath)):
            return self._parse_path(paths_or_selector).file_info

        # Selector: delegate to the listing-based implementation.
        if isinstance(paths_or_selector, FileSelector):
            return self.get_file_info_selector(paths_or_selector)

        # Otherwise assume an iterable of paths and recurse per element.
        return [
            self.get_file_info(obj)
            for obj in paths_or_selector
        ]

    def get_file_info_selector(
        self,
        selector: FileSelector
    ):
        """Return FileInfo entries for a FileSelector.

        Args:
            selector: FileSelector describing the listing.

        Returns:
            A list of FileInfo entries.
        """
        base_dir = self._parse_path(selector.base_dir)

        return [
            p.file_info
            for p in base_dir.ls(
                recursive=selector.recursive,
                allow_not_found=selector.allow_not_found
            )
        ]

    def get_type_name(self):
        """Return the filesystem type name.

        Returns:
            The filesystem type name string.
        """
        return "dbfs"

    def move(self, src, dest):
        """Move a file by copying then deleting.

        Args:
            src: Source path.
            dest: Destination path.
        """
        src = self._parse_path(src)

        # NOTE(review): ``dest`` is passed through unparsed — presumably
        # ``copy_to`` accepts path-like objects; verify against path.py.
        src.copy_to(dest)

        # Copy-then-delete: not atomic; the source is only removed after a
        # successful copy.
        src.remove(recursive=True)

    def normalize_path(self, path):
        """Normalize a path to a full Databricks path string.

        Args:
            path: Path to normalize.

        Returns:
            The normalized full path string.
        """
        return self._parse_path(path).full_path()

    def open(
        self,
        path,
        mode: str = "r+",
        encoding: Optional[str] = None,
    ):
        """Open a file path as a Databricks IO stream.

        Args:
            path: Path to open.
            mode: File mode string.
            encoding: Optional text encoding.

        Returns:
            A DatabricksIO instance.
        """
        return self._parse_path(path).open(mode=mode, encoding=encoding, clone=False)

    def open_append_stream(self, path, compression='detect', buffer_size=None, metadata=None):
        """Open an append stream.

        Args:
            path: Path to open.
            compression: Ignored; accepted for PyArrow API compatibility.
            buffer_size: Ignored; accepted for PyArrow API compatibility.
            metadata: Ignored; accepted for PyArrow API compatibility.

        Returns:
            A DatabricksIO instance opened in binary-append mode.
        """
        return self._parse_path(path).open(mode="ab")

    def open_input_file(self, path, mode: str = "rb", **kwargs):
        """Open an input file as a PyArrow PythonFile.

        Args:
            path: Path to open.
            mode: File mode string.
            **kwargs: Additional options (currently unused).

        Returns:
            A PyArrow PythonFile wrapping a connected DatabricksIO.
        """
        # Connect a clone so the returned file owns its own connection.
        buf = self._parse_path(path).open(mode=mode).connect(clone=True)

        return PythonFile(
            buf,
            mode=mode
        )

    def open_input_stream(self, path, compression='detect', buffer_size=None):
        """Open an input stream for reading bytes.

        Args:
            path: Path to open.
            compression: Ignored; accepted for PyArrow API compatibility.
            buffer_size: Ignored; accepted for PyArrow API compatibility.

        Returns:
            A DatabricksIO instance opened in binary-read mode.
        """
        return self._parse_path(path).open(mode="rb")

    def open_output_stream(self, path, compression='detect', buffer_size=None, metadata=None):
        """Open an output stream for writing bytes.

        Args:
            path: Path to open.
            compression: Ignored; accepted for PyArrow API compatibility.
            buffer_size: Ignored; accepted for PyArrow API compatibility.
            metadata: Ignored; accepted for PyArrow API compatibility.

        Returns:
            A DatabricksIO instance opened in binary-write mode.
        """
        return self._parse_path(path).open(mode="wb")
|
|
334
|
+
|
|
335
|
+
class DatabricksFileSystem(PyFileSystem):
    """PyArrow FileSystem facade over a DatabricksFileSystemHandler."""

    def __init__(self, handler):
        """Wrap the given handler in a PyArrow filesystem.

        Args:
            handler: FileSystemHandler instance delegated to by PyArrow.
        """
        super().__init__(handler)
|