ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +31 -34
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +148 -1
- yggdrasil/databricks/sql/types.py +49 -1
- yggdrasil/databricks/workspaces/__init__.py +4 -1
- yggdrasil/databricks/workspaces/filesytem.py +344 -0
- yggdrasil/databricks/workspaces/io.py +1123 -0
- yggdrasil/databricks/workspaces/path.py +1415 -0
- yggdrasil/databricks/workspaces/path_kind.py +13 -0
- yggdrasil/databricks/workspaces/workspace.py +298 -154
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +131 -0
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.30.dist-info/RECORD +0 -56
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/io.py
@@ -0,0 +1,1123 @@
"""File-like IO abstractions for Databricks paths."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import io
|
|
5
|
+
import time
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
|
|
8
|
+
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import pyarrow.csv as pcsv
|
|
11
|
+
import pyarrow.parquet as pq
|
|
12
|
+
from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat
|
|
13
|
+
|
|
14
|
+
from .path_kind import DatabricksPathKind
|
|
15
|
+
from ...libs.databrickslib import databricks
|
|
16
|
+
from ...types.cast.pandas_cast import PandasDataFrame
|
|
17
|
+
from ...types.cast.polars_pandas_cast import PolarsDataFrame
|
|
18
|
+
from ...types.cast.registry import convert
|
|
19
|
+
|
|
20
|
+
if databricks is not None:
|
|
21
|
+
from databricks.sdk.service.workspace import ImportFormat, ExportFormat
|
|
22
|
+
from databricks.sdk.errors.platform import (
|
|
23
|
+
NotFound,
|
|
24
|
+
ResourceDoesNotExist,
|
|
25
|
+
BadRequest,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from .path import DatabricksPath
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"DatabricksIO"
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DatabricksIO(ABC, IO):
    """File-like interface for Databricks workspace, volume, or DBFS paths."""

    def __init__(
        self,
        path: "DatabricksPath",
        mode: str,
        encoding: Optional[str] = None,
        compression: Optional[str] = "detect",
        position: int = 0,
        buffer: Optional[io.BytesIO] = None,
    ):
        super().__init__()

        self.encoding = encoding
        self.mode = mode
        self.compression = compression

        self.path = path

        self.buffer = buffer
        self.position = position

        self._write_flag = False

    def __enter__(self) -> "DatabricksIO":
        """Enter a context manager and connect the underlying path."""
        return self.connect(clone=False)

    def __exit__(self, exc_type, exc_value, traceback):
        """Exit the context manager and close the buffer."""
        self.close()

    def __del__(self):
        self.close()

    def __next__(self):
        """Iterate over lines in the file."""
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def __iter__(self):
        return self

    def __hash__(self):
        return self.path.__hash__()

    @classmethod
    def create_instance(
        cls,
        path: "DatabricksPath",
        mode: str,
        encoding: Optional[str] = None,
        compression: Optional[str] = "detect",
        position: int = 0,
        buffer: Optional[io.BytesIO] = None,
    ) -> "DatabricksIO":
        """Create the appropriate IO subclass for the given path kind.

        Args:
            path: DatabricksPath to open.
            mode: File mode string.
            encoding: Optional text encoding for text mode.
            compression: Optional compression mode.
            position: Initial file cursor position.
            buffer: Optional pre-seeded buffer.

        Returns:
            A DatabricksIO subclass instance.
        """
        if path.kind == DatabricksPathKind.VOLUME:
            return DatabricksVolumeIO(
                path=path,
                mode=mode,
                encoding=encoding,
                compression=compression,
                position=position,
                buffer=buffer,
            )
        elif path.kind == DatabricksPathKind.DBFS:
            return DatabricksDBFSIO(
                path=path,
                mode=mode,
                encoding=encoding,
                compression=compression,
                position=position,
                buffer=buffer,
            )
        elif path.kind == DatabricksPathKind.WORKSPACE:
            return DatabricksWorkspaceIO(
                path=path,
                mode=mode,
                encoding=encoding,
                compression=compression,
                position=position,
                buffer=buffer,
            )
        else:
            raise ValueError(f"Unsupported DatabricksPath kind: {path.kind}")

    @property
    def workspace(self):
        """Return the associated Workspace instance.

        Returns:
            The Workspace bound to the path.
        """
        return self.path.workspace

    @property
    def name(self):
        """Return the name of the underlying path.

        Returns:
            The path name component.
        """
        return self.path.name

    @property
    def mode(self):
        return self._mode

    @mode.setter
    def mode(self, value: str):
        self._mode = value

        # Basic text/binary behavior:
        # - binary -> encoding None
        # - text -> default utf-8
        if "b" in self._mode:
            self.encoding = None
        else:
            if self.encoding is None:
                self.encoding = "utf-8"

    @property
    def content_length(self) -> int:
        return self.path.content_length

    def size(self):
        """Return the size of the file in bytes.

        Returns:
            The file size in bytes.
        """
        return self.content_length

    @content_length.setter
    def content_length(self, value: int):
        self.path.content_length = value

    @property
    def buffer(self):
        """Return the in-memory buffer, creating it if necessary.

        Returns:
            A BytesIO buffer for the file contents.
        """
        if self._buffer is None:
            self._buffer = io.BytesIO()
            self._buffer.seek(self.position, io.SEEK_SET)
        return self._buffer

    @buffer.setter
    def buffer(self, value: Optional[io.BytesIO]):
        self._buffer = value

    def clear_buffer(self):
        """Clear any cached in-memory buffer.

        Returns:
            None.
        """
        if self._buffer is not None:
            self._buffer.close()
            self._buffer = None

    def clone_instance(self, **kwargs):
        """Clone this IO instance with optional overrides.

        Args:
            **kwargs: Field overrides for the new instance.

        Returns:
            A cloned DatabricksIO instance.
        """
        return self.__class__(
            path=kwargs.get("path", self.path),
            mode=kwargs.get("mode", self.mode),
            encoding=kwargs.get("encoding", self.encoding),
            compression=kwargs.get("compression", self.compression),
            position=kwargs.get("position", self.position),
            buffer=kwargs.get("buffer", self._buffer),
        )

    @property
    def connected(self):
        """Return True if the underlying path is connected.

        Returns:
            True if connected, otherwise False.
        """
        return self.path.connected

    def connect(self, clone: bool = False) -> "DatabricksIO":
        """Connect the underlying path and optionally return a clone.

        Args:
            clone: Whether to return a cloned instance.

        Returns:
            The connected DatabricksIO instance.
        """
        path = self.path.connect(clone=clone)

        if clone:
            return self.clone_instance(path=path)

        self.path = path
        return self

    def close(self):
        """Flush pending writes and close the buffer.

        Returns:
            None.
        """
        self.flush()
        if self._buffer is not None:
            self._buffer.close()

    def fileno(self):
        """Return a pseudo file descriptor based on object hash.

        Returns:
            An integer file descriptor-like value.
        """
        return hash(self)

    def isatty(self):
        return False

    def tell(self):
        """Return the current cursor position.

        Returns:
            The current position in bytes.
        """
        return self.position

    def seekable(self):
        """Return True to indicate seek support.

        Returns:
            True.
        """
        return True

    def seek(self, offset, whence=0, /):
        """Move the cursor to a new position.

        Args:
            offset: Offset in bytes.
            whence: Reference point (start, current, end).

        Returns:
            The new position in bytes.
        """
        if whence == io.SEEK_SET:
            new_position = offset
        elif whence == io.SEEK_CUR:
            new_position = self.position + offset
        elif whence == io.SEEK_END:
            end_position = self.content_length
            new_position = end_position + offset
        else:
            raise ValueError("Invalid value for whence")

        if new_position < 0:
            raise ValueError("New position is before the start of the file")

        if self._buffer is not None:
            self._buffer.seek(new_position, io.SEEK_SET)

        self.position = new_position
        return self.position

    def readable(self):
        """Return True to indicate read support.

        Returns:
            True.
        """
        return True

    def getvalue(self):
        """Return the buffer contents, reading from remote if needed.

        Returns:
            File contents as bytes or str depending on mode.
        """
        if self._buffer is not None:
            return self._buffer.getvalue()
        return self.read_all_bytes()

    def getbuffer(self):
        """Return the underlying BytesIO buffer.

        Returns:
            The BytesIO buffer instance.
        """
        return self.buffer

    @abstractmethod
    def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
        """Read a byte range from the remote path.

        Args:
            start: Starting byte offset.
            length: Number of bytes to read.
            allow_not_found: Whether to suppress missing-path errors.

        Returns:
            The bytes read from the remote path.
        """
        pass

    def read_all_bytes(self, use_cache: bool = True, allow_not_found: bool = False) -> bytes:
        """Read the full contents into memory, optionally caching.

        Args:
            use_cache: Whether to cache contents in memory.
            allow_not_found: Whether to suppress missing-path errors.

        Returns:
            File contents as bytes.
        """
        if use_cache and self._buffer is not None:
            buffer_value = self._buffer.getvalue()

            if len(buffer_value) == self.content_length:
                return buffer_value

            self._buffer.close()
            self._buffer = None

        data = self.read_byte_range(0, self.content_length, allow_not_found=allow_not_found)

        # Keep size accurate even if backend didn't know it
        self.content_length = len(data)

        if use_cache and self._buffer is None:
            self._buffer = io.BytesIO(data)
            self._buffer.seek(self.position, io.SEEK_SET)

        return data

    def read(self, n=-1, use_cache: bool = True):
        """Read up to ``n`` bytes/characters from the file.

        Args:
            n: Number of bytes/characters to read; -1 for all.
            use_cache: Whether to use cached contents.

        Returns:
            The read bytes or string depending on mode.
        """
        if not self.readable():
            raise IOError("File not open for reading")

        current_position = self.position
        all_data = self.read_all_bytes(use_cache=use_cache)

        if n == -1:
            n = self.content_length - current_position

        data = all_data[current_position:current_position + n]
        read_length = len(data)

        self.position += read_length

        if self.encoding:
            return data.decode(self.encoding)
        return data

    def readline(self, limit=-1, use_cache: bool = True):
        """Read a single line from the file.

        Args:
            limit: Max characters/bytes to read; -1 for no limit.
            use_cache: Whether to use cached contents.

        Returns:
            The next line as bytes or string.
        """
        if not self.readable():
            raise IOError("File not open for reading")

        if self.encoding:
            # Text-mode: accumulate characters
            out_chars = []
            read_chars = 0

            while limit == -1 or read_chars < limit:
                ch = self.read(1, use_cache=use_cache)
                if not ch:
                    break
                out_chars.append(ch)
                read_chars += 1
                if ch == "\n":
                    break

            return "".join(out_chars)

        # Binary-mode: accumulate bytes
        line_bytes = bytearray()
        bytes_read = 0

        while limit == -1 or bytes_read < limit:
            b = self.read(1, use_cache=use_cache)
            if not b:
                break
            line_bytes.extend(b)
            bytes_read += 1
            if b == b"\n":
                break

        return bytes(line_bytes)

    def readlines(self, hint=-1, use_cache: bool = True):
        """Read all lines from the file.

        Args:
            hint: Optional byte/char count hint; -1 for no hint.
            use_cache: Whether to use cached contents.

        Returns:
            A list of lines.
        """
        if not self.readable():
            raise IOError("File not open for reading")

        lines = []
        total = 0

        while True:
            line = self.readline(use_cache=use_cache)
            if not line:
                break
            lines.append(line)
            total += len(line)
            if hint != -1 and total >= hint:
                break

        return lines

    def appendable(self):
        """Return True when the file is open in append mode.

        Returns:
            True if in append mode.
        """
        return "a" in self.mode

    def writable(self):
        """Return True to indicate write support.

        Returns:
            True.
        """
        return True

    @abstractmethod
    def write_all_bytes(self, data: bytes):
        """Write raw bytes to the remote path.

        Args:
            data: Bytes to write.

        Returns:
            None.
        """
        pass

    def truncate(self, size=None, /):
        """Resize the file to ``size`` bytes.

        Args:
            size: Target size in bytes (defaults to current position).

        Returns:
            The new size in bytes.
        """
        if size is None:
            size = self.position

        if self._buffer is not None:
            self._buffer.truncate(size)
        else:
            data = b"\x00" * size
            self.write_all_bytes(data=data)

        self.content_length = size
        self._write_flag = True
        return size

    def flush(self):
        """Flush buffered data to the remote path.

        Returns:
            None.
        """
        if self._write_flag and self._buffer is not None:
            self.write_all_bytes(data=self._buffer.getvalue())
            self._write_flag = False

    def write(self, data: AnyStr) -> int:
        """Write data to the buffer and mark for flush.

        Args:
            data: String or bytes to write.

        Returns:
            The number of bytes written.
        """
        if not self.writable():
            raise IOError("File not open for writing")

        if isinstance(data, str):
            data = data.encode(self.encoding or "utf-8")

        written = self.buffer.write(data)

        self.position += written
        self.content_length = self.position
        self._write_flag = True

        return written

    def writelines(self, lines) -> None:
        """Write multiple lines to the buffer.

        Args:
            lines: Iterable of lines to write.

        Returns:
            None.
        """
        for line in lines:
            if isinstance(line, str):
                line = line.encode(self.encoding or "utf-8")
            elif not isinstance(line, (bytes, bytearray)):
                raise TypeError(
                    "a bytes-like or str object is required, not '{}'".format(type(line).__name__)
                )

            data = line + b"\n" if not line.endswith(b"\n") else line
            self.write(data)

    def get_output_stream(self, *args, **kwargs):
        """Return this instance for compatibility with Arrow APIs.

        Returns:
            The current DatabricksIO instance.
        """
        return self

    def copy_to(
        self,
        dest: Union["DatabricksIO", "DatabricksPath", str]
    ) -> None:
        """Copy the file contents to another Databricks IO/path.

        Args:
            dest: Destination IO, DatabricksPath, or path string.

        Returns:
            None.
        """
        if not isinstance(dest, DatabricksIO):
            from .path import DatabricksPath

            dest_path = DatabricksPath.parse(dest, workspace=self.workspace)

            with dest_path.open(mode="wb") as d:
                return self.copy_to(dest=d)

        dest.write_all_bytes(data=self.read_all_bytes(use_cache=False))

    # ---- format helpers ----

    def _reset_for_write(self):
        if self._buffer is not None:
            self._buffer.seek(0, io.SEEK_SET)
            self._buffer.truncate(0)

        self.position = 0
        self.content_length = 0
        self._write_flag = True

    # ---- Data Querying Helpers ----

    def write_table(
        self,
        table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
        file_format: Optional[FileFormat] = None,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Write a table-like object to the path using an inferred format.

        Args:
            table: Table-like object to write.
            file_format: Optional file format override.
            batch_size: Optional batch size for writes.
            **kwargs: Format-specific options.

        Returns:
            The result of the specific write implementation.
        """
        if isinstance(table, pa.Table):
            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
        elif isinstance(table, pa.RecordBatch):
            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
        elif isinstance(table, PolarsDataFrame):
            return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
        elif isinstance(table, PandasDataFrame):
            return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
        else:
            raise ValueError(f"Cannot write {type(table)} to {self.path}")

    # ---- Arrow ----

    def read_arrow_table(
        self,
        file_format: Optional[FileFormat] = None,
        batch_size: Optional[int] = None,
        **kwargs
    ) -> pa.Table:
        """Read the file as an Arrow table.

        Args:
            file_format: Optional file format override.
            batch_size: Optional batch size for reads.
            **kwargs: Format-specific options.

        Returns:
            An Arrow Table with the file contents.
        """
        file_format = self.path.file_format if file_format is None else file_format
        self.seek(0)

        if isinstance(file_format, ParquetFileFormat):
            return pq.read_table(self, **kwargs)

        if isinstance(file_format, CsvFileFormat):
            return pcsv.read_csv(self, parse_options=file_format.parse_options)

        raise ValueError(f"Unsupported file format for Arrow table: {file_format}")

    def write_arrow(
        self,
        table: Union[pa.Table, pa.RecordBatch],
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Write an Arrow table or record batch to the path.

        Args:
            table: Arrow table or batch to write.
            batch_size: Optional batch size for writes.
            **kwargs: Format-specific options.

        Returns:
            None.
        """
        if not isinstance(table, pa.Table):
            table = convert(table, pa.Table)

        return self.write_arrow_table(
            table=table,
            batch_size=batch_size,
            **kwargs
        )

    def write_arrow_table(
        self,
        table: pa.Table,
        file_format: Optional[FileFormat] = None,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Write an Arrow table using the selected file format.

        Args:
            table: Arrow table to write.
            file_format: Optional file format override.
            batch_size: Optional batch size for writes.
            **kwargs: Format-specific options.

        Returns:
            None.
        """
        file_format = self.path.file_format if file_format is None else file_format
        buffer = io.BytesIO()

        if isinstance(file_format, ParquetFileFormat):
            pq.write_table(table, buffer, write_batch_size=batch_size, **kwargs)

        elif isinstance(file_format, CsvFileFormat):
            pcsv.write_csv(table, buffer, **kwargs)

        else:
            raise ValueError(f"Unsupported file format for Arrow table: {file_format}")

        self.write_all_bytes(data=buffer.getvalue())

    def write_arrow_batch(
        self,
        batch: pa.RecordBatch,
        file_format: Optional[FileFormat] = None,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Write a single Arrow record batch.

        Args:
            batch: RecordBatch to write.
            file_format: Optional file format override.
            batch_size: Optional batch size for writes.
            **kwargs: Format-specific options.

        Returns:
            None.
        """
        table = pa.Table.from_batches([batch])
        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)

    def read_arrow_batches(
        self,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Yield Arrow record batches from the file.

        Args:
            batch_size: Optional batch size for reads.
            **kwargs: Format-specific options.

        Returns:
            An iterator over Arrow RecordBatch objects.
        """
        return (
            self
            .read_arrow_table(batch_size=batch_size, **kwargs)
            .to_batches(max_chunksize=batch_size)
        )

    # ---- Pandas ----

    def read_pandas(
        self,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Read the file into a pandas DataFrame.

        Args:
            batch_size: Optional batch size for reads.
            **kwargs: Format-specific options.

        Returns:
            A pandas DataFrame with the file contents.
        """
        return self.read_arrow_table(batch_size=batch_size, **kwargs).to_pandas()

    def write_pandas(
        self,
        df,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Write a pandas DataFrame to the file.

        Args:
            df: pandas DataFrame to write.
            batch_size: Optional batch size for writes.
            **kwargs: Format-specific options.

        Returns:
            None.
        """
        self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)

    # ---- Polars ----

    def read_polars(
        self,
        file_format: Optional[FileFormat] = None,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Read the file into a polars DataFrame.

        Args:
            file_format: Optional file format override.
            batch_size: Optional batch size for reads.
            **kwargs: Format-specific options.

        Returns:
            A polars DataFrame with the file contents.
        """
        import polars as pl

        file_format = self.path.file_format if file_format is None else file_format
        self.seek(0)

        if isinstance(file_format, ParquetFileFormat):
            return pl.read_parquet(self, **kwargs)

        if isinstance(file_format, CsvFileFormat):
            return pl.read_csv(self, **kwargs)

        raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")

    def write_polars(
        self,
        df,
        file_format: Optional[FileFormat] = None,
        batch_size: Optional[int] = None,
        **kwargs
    ):
        """Write a polars DataFrame to the file.

        Args:
            df: polars DataFrame to write.
            file_format: Optional file format override.
            batch_size: Optional batch size for writes.
            **kwargs: Format-specific options.

        Returns:
            None.
        """
        file_format = self.path.file_format if file_format is None else file_format
        buffer = io.BytesIO()

        if isinstance(file_format, ParquetFileFormat):
            df.write_parquet(buffer, **kwargs)

        elif isinstance(file_format, CsvFileFormat):
            df.write_csv(buffer, **kwargs)

        else:
            raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")

        self.write_all_bytes(data=buffer.getvalue())


class DatabricksWorkspaceIO(DatabricksIO):
    """IO adapter for Workspace files."""

    def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
        """Read bytes from a Workspace file.

        Args:
            start: Starting byte offset.
            length: Number of bytes to read.
            allow_not_found: Whether to suppress missing-path errors.

        Returns:
            Bytes read from the file.
        """
        if length == 0:
            return b""

        sdk = self.workspace.sdk()
        client = sdk.workspace
        full_path = self.path.workspace_full_path()

        result = client.download(
            path=full_path,
            format=ExportFormat.AUTO,
        )

        if result is None:
            return b""

        data = result.read()

        end = start + length
        return data[start:end]

    def write_all_bytes(self, data: bytes):
        """Write bytes to a Workspace file.

        Args:
            data: Bytes to write.

        Returns:
            The DatabricksWorkspaceIO instance.
        """
        sdk = self.workspace.sdk()
        workspace_client = sdk.workspace
        full_path = self.path.workspace_full_path()

        try:
            workspace_client.upload(
                full_path,
                data,
                format=ImportFormat.AUTO,
                overwrite=True
            )
        except (NotFound, ResourceDoesNotExist, BadRequest):
            self.path.parent.make_workspace_dir(parents=True)

            workspace_client.upload(
                full_path,
                data,
                format=ImportFormat.AUTO,
                overwrite=True
            )

        self.path.reset_metadata(
            is_file=True,
            is_dir=False,
            size=len(data),
            mtime=time.time()
        )

        return self


class DatabricksVolumeIO(DatabricksIO):
    """IO adapter for Unity Catalog volume files."""

    def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
        """Read bytes from a volume file.

        Args:
            start: Starting byte offset.
            length: Number of bytes to read.
            allow_not_found: Whether to suppress missing-path errors.

        Returns:
            Bytes read from the file.
        """
        if length == 0:
            return b""

        sdk = self.workspace.sdk()
        client = sdk.files
        full_path = self.path.files_full_path()

        resp = client.download(full_path)

        contents = resp.contents
        contents.seek(start, io.SEEK_SET)
        result = contents.read(length)

        return result

    def write_all_bytes(self, data: bytes):
        """Write bytes to a volume file.

        Args:
            data: Bytes to write.

        Returns:
            The DatabricksVolumeIO instance.
        """
        sdk = self.workspace.sdk()
        client = sdk.files
        full_path = self.path.files_full_path()

        try:
            client.upload(
                full_path,
                io.BytesIO(data),
                overwrite=True
            )
        except (NotFound, ResourceDoesNotExist, BadRequest):
            self.path.parent.mkdir(parents=True, exist_ok=True)

            client.upload(
                full_path,
                io.BytesIO(data),
                overwrite=True
            )

        self.path.reset_metadata(
            is_file=True,
            is_dir=False,
            size=len(data),
            mtime=time.time()
        )

        return self


class DatabricksDBFSIO(DatabricksIO):
    """IO adapter for DBFS files."""

    def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
        """Read bytes from a DBFS file.

        Args:
            start: Starting byte offset.
            length: Number of bytes to read.
            allow_not_found: Whether to suppress missing-path errors.

        Returns:
            Bytes read from the file.
        """
        if length == 0:
            return b""

        sdk = self.workspace.sdk()
        client = sdk.dbfs
        full_path = self.path.dbfs_full_path()

        read_bytes = bytearray()
        bytes_to_read = length
        current_position = start

        while bytes_to_read > 0:
            chunk_size = min(bytes_to_read, 2 * 1024 * 1024)

            resp = client.read(
                path=full_path,
                offset=current_position,
                length=chunk_size
            )

            if not resp.data:
                break

            # resp.data is base64; decode and move offsets by *decoded* length
            resp_data_bytes = base64.b64decode(resp.data)

            read_bytes.extend(resp_data_bytes)
            bytes_read = len(resp_data_bytes)  # <-- FIX (was base64 string length)
            current_position += bytes_read
            bytes_to_read -= bytes_read

        return bytes(read_bytes)

    def write_all_bytes(self, data: bytes):
        """Write bytes to a DBFS file.

        Args:
            data: Bytes to write.

        Returns:
            The DatabricksDBFSIO instance.
        """
        sdk = self.workspace.sdk()
        client = sdk.dbfs
        full_path = self.path.dbfs_full_path()

        try:
            with client.open(
                path=full_path,
                read=False,
                write=True,
                overwrite=True
            ) as f:
                f.write(data)
        except (NotFound, ResourceDoesNotExist, BadRequest):
            self.path.parent.mkdir(parents=True, exist_ok=True)

            with client.open(
                path=full_path,
                read=False,
                write=True,
                overwrite=True
            ) as f:
                f.write(data)

        self.path.reset_metadata(
            is_file=True,
            is_dir=False,
            size=len(data),
            mtime=time.time()
        )
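The new path.py, io.py, and filesytem.py modules appear to replace the removed databricks_path.py. A minimal usage sketch of the IO layer follows, built only from names visible in this diff; the pre-configured `workspace` object and the example volume paths are assumptions, not part of the package.

    # Illustrative sketch only, not part of the diffed package.
    import pyarrow as pa

    from yggdrasil.databricks.workspaces.path import DatabricksPath

    # Bind a (hypothetical) volume path to an already-configured workspace.
    path = DatabricksPath.parse(
        "/Volumes/main/default/raw/example.parquet", workspace=workspace
    )

    # Write an Arrow table; write_table dispatches on the table type and the
    # path's file_format (Parquet or CSV in this version).
    with path.open(mode="wb") as f:
        f.write_table(pa.table({"id": [1, 2, 3]}))

    # Read it back and copy it elsewhere; workspace, DBFS, and volume paths all
    # go through the same DatabricksIO subclasses.
    with path.open(mode="rb") as f:
        table = f.read_arrow_table()
        f.copy_to("/Volumes/main/default/raw/example_copy.parquet")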