ygg 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/METADATA +1 -1
- ygg-0.1.33.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +244 -3
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +24 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +29 -4
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +147 -0
- yggdrasil/databricks/sql/types.py +33 -1
- yggdrasil/databricks/workspaces/__init__.py +2 -1
- yggdrasil/databricks/workspaces/filesytem.py +183 -0
- yggdrasil/databricks/workspaces/io.py +387 -9
- yggdrasil/databricks/workspaces/path.py +297 -2
- yggdrasil/databricks/workspaces/path_kind.py +3 -0
- yggdrasil/databricks/workspaces/workspace.py +202 -5
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +123 -1
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.31.dist-info/RECORD +0 -59
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/WHEEL +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/io.py

@@ -1,3 +1,5 @@
+"""File-like IO abstractions for Databricks paths."""
+
 import base64
 import io
 import time
@@ -33,6 +35,7 @@ __all__ = [


 class DatabricksIO(ABC, IO):
+    """File-like interface for Databricks workspace, volume, or DBFS paths."""

     def __init__(
         self,
@@ -57,15 +60,18 @@ class DatabricksIO(ABC, IO):
         self._write_flag = False

     def __enter__(self) -> "DatabricksIO":
+        """Enter a context manager and connect the underlying path."""
         return self.connect(clone=False)

     def __exit__(self, exc_type, exc_value, traceback):
+        """Exit the context manager and close the buffer."""
         self.close()

     def __del__(self):
         self.close()

     def __next__(self):
+        """Iterate over lines in the file."""
         line = self.readline()
         if not line:
             raise StopIteration
@@ -87,6 +93,19 @@ class DatabricksIO(ABC, IO):
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ) -> "DatabricksIO":
+        """Create the appropriate IO subclass for the given path kind.
+
+        Args:
+            path: DatabricksPath to open.
+            mode: File mode string.
+            encoding: Optional text encoding for text mode.
+            compression: Optional compression mode.
+            position: Initial file cursor position.
+            buffer: Optional pre-seeded buffer.
+
+        Returns:
+            A DatabricksIO subclass instance.
+        """
         if path.kind == DatabricksPathKind.VOLUME:
             return DatabricksVolumeIO(
                 path=path,
@@ -119,10 +138,20 @@ class DatabricksIO(ABC, IO):

     @property
     def workspace(self):
+        """Return the associated Workspace instance.
+
+        Returns:
+            The Workspace bound to the path.
+        """
         return self.path.workspace

     @property
     def name(self):
+        """Return the name of the underlying path.
+
+        Returns:
+            The path name component.
+        """
         return self.path.name

     @property
@@ -147,6 +176,11 @@ class DatabricksIO(ABC, IO):
         return self.path.content_length

     def size(self):
+        """Return the size of the file in bytes.
+
+        Returns:
+            The file size in bytes.
+        """
         return self.content_length

     @content_length.setter
@@ -155,6 +189,11 @@ class DatabricksIO(ABC, IO):

     @property
     def buffer(self):
+        """Return the in-memory buffer, creating it if necessary.
+
+        Returns:
+            A BytesIO buffer for the file contents.
+        """
         if self._buffer is None:
             self._buffer = io.BytesIO()
             self._buffer.seek(self.position, io.SEEK_SET)
@@ -165,11 +204,24 @@ class DatabricksIO(ABC, IO):
         self._buffer = value

     def clear_buffer(self):
+        """Clear any cached in-memory buffer.
+
+        Returns:
+            None.
+        """
         if self._buffer is not None:
             self._buffer.close()
             self._buffer = None

     def clone_instance(self, **kwargs):
+        """Clone this IO instance with optional overrides.
+
+        Args:
+            **kwargs: Field overrides for the new instance.
+
+        Returns:
+            A cloned DatabricksIO instance.
+        """
         return self.__class__(
             path=kwargs.get("path", self.path),
             mode=kwargs.get("mode", self.mode),
@@ -181,9 +233,22 @@ class DatabricksIO(ABC, IO):

     @property
     def connected(self):
+        """Return True if the underlying path is connected.
+
+        Returns:
+            True if connected, otherwise False.
+        """
         return self.path.connected

     def connect(self, clone: bool = False) -> "DatabricksIO":
+        """Connect the underlying path and optionally return a clone.
+
+        Args:
+            clone: Whether to return a cloned instance.
+
+        Returns:
+            The connected DatabricksIO instance.
+        """
         path = self.path.connect(clone=clone)

         if clone:
@@ -193,23 +258,52 @@ class DatabricksIO(ABC, IO):
         return self

     def close(self):
+        """Flush pending writes and close the buffer.
+
+        Returns:
+            None.
+        """
         self.flush()
         if self._buffer is not None:
             self._buffer.close()

     def fileno(self):
+        """Return a pseudo file descriptor based on object hash.
+
+        Returns:
+            An integer file descriptor-like value.
+        """
         return hash(self)

     def isatty(self):
         return False

     def tell(self):
+        """Return the current cursor position.
+
+        Returns:
+            The current position in bytes.
+        """
         return self.position

     def seekable(self):
+        """Return True to indicate seek support.
+
+        Returns:
+            True.
+        """
         return True

     def seek(self, offset, whence=0, /):
+        """Move the cursor to a new position.
+
+        Args:
+            offset: Offset in bytes.
+            whence: Reference point (start, current, end).
+
+        Returns:
+            The new position in bytes.
+        """
         if whence == io.SEEK_SET:
             new_position = offset
         elif whence == io.SEEK_CUR:
@@ -230,21 +324,55 @@ class DatabricksIO(ABC, IO):
         return self.position

     def readable(self):
+        """Return True to indicate read support.
+
+        Returns:
+            True.
+        """
         return True

     def getvalue(self):
+        """Return the buffer contents, reading from remote if needed.
+
+        Returns:
+            File contents as bytes or str depending on mode.
+        """
         if self._buffer is not None:
             return self._buffer.getvalue()
         return self.read_all_bytes()

     def getbuffer(self):
+        """Return the underlying BytesIO buffer.
+
+        Returns:
+            The BytesIO buffer instance.
+        """
         return self.buffer

     @abstractmethod
     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read a byte range from the remote path.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            The bytes read from the remote path.
+        """
         pass

     def read_all_bytes(self, use_cache: bool = True, allow_not_found: bool = False) -> bytes:
+        """Read the full contents into memory, optionally caching.
+
+        Args:
+            use_cache: Whether to cache contents in memory.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            File contents as bytes.
+        """
         if use_cache and self._buffer is not None:
             buffer_value = self._buffer.getvalue()

@@ -266,6 +394,15 @@ class DatabricksIO(ABC, IO):
         return data

     def read(self, n=-1, use_cache: bool = True):
+        """Read up to ``n`` bytes/characters from the file.
+
+        Args:
+            n: Number of bytes/characters to read; -1 for all.
+            use_cache: Whether to use cached contents.
+
+        Returns:
+            The read bytes or string depending on mode.
+        """
         if not self.readable():
             raise IOError("File not open for reading")

@@ -285,6 +422,15 @@ class DatabricksIO(ABC, IO):
         return data

     def readline(self, limit=-1, use_cache: bool = True):
+        """Read a single line from the file.
+
+        Args:
+            limit: Max characters/bytes to read; -1 for no limit.
+            use_cache: Whether to use cached contents.
+
+        Returns:
+            The next line as bytes or string.
+        """
         if not self.readable():
             raise IOError("File not open for reading")

@@ -320,6 +466,15 @@ class DatabricksIO(ABC, IO):
         return bytes(line_bytes)

     def readlines(self, hint=-1, use_cache: bool = True):
+        """Read all lines from the file.
+
+        Args:
+            hint: Optional byte/char count hint; -1 for no hint.
+            use_cache: Whether to use cached contents.
+
+        Returns:
+            A list of lines.
+        """
         if not self.readable():
             raise IOError("File not open for reading")

@@ -338,16 +493,42 @@ class DatabricksIO(ABC, IO):
         return lines

     def appendable(self):
+        """Return True when the file is open in append mode.
+
+        Returns:
+            True if in append mode.
+        """
         return "a" in self.mode

     def writable(self):
+        """Return True to indicate write support.
+
+        Returns:
+            True.
+        """
         return True

     @abstractmethod
     def write_all_bytes(self, data: bytes):
+        """Write raw bytes to the remote path.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            None.
+        """
         pass

     def truncate(self, size=None, /):
+        """Resize the file to ``size`` bytes.
+
+        Args:
+            size: Target size in bytes (defaults to current position).
+
+        Returns:
+            The new size in bytes.
+        """
         if size is None:
             size = self.position

@@ -362,11 +543,24 @@ class DatabricksIO(ABC, IO):
         return size

     def flush(self):
+        """Flush buffered data to the remote path.
+
+        Returns:
+            None.
+        """
         if self._write_flag and self._buffer is not None:
             self.write_all_bytes(data=self._buffer.getvalue())
             self._write_flag = False

     def write(self, data: AnyStr) -> int:
+        """Write data to the buffer and mark for flush.
+
+        Args:
+            data: String or bytes to write.
+
+        Returns:
+            The number of bytes written.
+        """
         if not self.writable():
             raise IOError("File not open for writing")

@@ -382,6 +576,14 @@ class DatabricksIO(ABC, IO):
         return written

     def writelines(self, lines) -> None:
+        """Write multiple lines to the buffer.
+
+        Args:
+            lines: Iterable of lines to write.
+
+        Returns:
+            None.
+        """
         for line in lines:
             if isinstance(line, str):
                 line = line.encode(self.encoding or "utf-8")
@@ -394,12 +596,25 @@ class DatabricksIO(ABC, IO):
         self.write(data)

     def get_output_stream(self, *args, **kwargs):
+        """Return this instance for compatibility with Arrow APIs.
+
+        Returns:
+            The current DatabricksIO instance.
+        """
         return self

     def copy_to(
         self,
         dest: Union["DatabricksIO", "DatabricksPath", str]
     ) -> None:
+        """Copy the file contents to another Databricks IO/path.
+
+        Args:
+            dest: Destination IO, DatabricksPath, or path string.
+
+        Returns:
+            None.
+        """
         if not isinstance(dest, DatabricksIO):
             from .path import DatabricksPath

@@ -426,17 +641,29 @@ class DatabricksIO(ABC, IO):
     def write_table(
         self,
         table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a table-like object to the path using an inferred format.
+
+        Args:
+            table: Table-like object to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            The result of the specific write implementation.
+        """
         if isinstance(table, pa.Table):
-            return self.write_arrow_table(table, batch_size=batch_size, **kwargs)
+            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, pa.RecordBatch):
-            return self.write_arrow_batch(table, batch_size=batch_size, **kwargs)
+            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, PolarsDataFrame):
-            return self.write_polars(table, batch_size=batch_size, **kwargs)
+            return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, PandasDataFrame):
-            return self.write_pandas(table, batch_size=batch_size, **kwargs)
+            return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
         else:
             raise ValueError(f"Cannot write {type(table)} to {self.path}")

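The hunk above threads the new file_format keyword through every write_table dispatch branch, so an explicit format override now reaches the concrete writers instead of being dropped. A minimal usage sketch (the `dbx_file` object is a hypothetical stand-in for a connected DatabricksIO; CsvFileFormat is assumed to be the pyarrow.dataset format class this module imports):

    import pyarrow as pa
    from pyarrow.dataset import CsvFileFormat  # assumed import source

    table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # Without the override, the format is inferred from the path suffix
    # (path.file_format); with it, the table is written as CSV regardless.
    dbx_file.write_table(table, file_format=CsvFileFormat())
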
@@ -448,6 +675,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ) -> pa.Table:
+        """Read the file as an Arrow table.
+
+        Args:
+            file_format: Optional file format override.
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            An Arrow Table with the file contents.
+        """
         file_format = self.path.file_format if file_format is None else file_format
         self.seek(0)

@@ -465,6 +702,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write an Arrow table or record batch to the path.
+
+        Args:
+            table: Arrow table or batch to write.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         if not isinstance(table, pa.Table):
             table = convert(table, pa.Table)

@@ -481,11 +728,22 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write an Arrow table using the selected file format.
+
+        Args:
+            table: Arrow table to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         file_format = self.path.file_format if file_format is None else file_format
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            pq.write_table(table, buffer, **kwargs)
+            pq.write_table(table, buffer, write_batch_size=batch_size, **kwargs)

         elif isinstance(file_format, CsvFileFormat):
             pcsv.write_csv(table, buffer, **kwargs)
@@ -498,17 +756,38 @@ class DatabricksIO(ABC, IO):
     def write_arrow_batch(
         self,
         batch: pa.RecordBatch,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a single Arrow record batch.
+
+        Args:
+            batch: RecordBatch to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         table = pa.Table.from_batches([batch])
-        self.write_arrow_table(table, batch_size=batch_size, **kwargs)
+        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)

     def read_arrow_batches(
         self,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Yield Arrow record batches from the file.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            An iterator over Arrow RecordBatch objects.
+        """
         return (
             self
             .read_arrow_table(batch_size=batch_size, **kwargs)
@@ -522,6 +801,15 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Read the file into a pandas DataFrame.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            A pandas DataFrame with the file contents.
+        """
         return self.read_arrow_table(batch_size=batch_size, **kwargs).to_pandas()

     def write_pandas(
@@ -530,6 +818,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a pandas DataFrame to the file.
+
+        Args:
+            df: pandas DataFrame to write.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)

     # ---- Polars ----
@@ -540,6 +838,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Read the file into a polars DataFrame.
+
+        Args:
+            file_format: Optional file format override.
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            A polars DataFrame with the file contents.
+        """
         import polars as pl

         file_format = self.path.file_format if file_format is None else file_format
@@ -560,22 +868,46 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a polars DataFrame to the file.
+
+        Args:
+            df: polars DataFrame to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         file_format = self.path.file_format if file_format is None else FileFormat
-
+        buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.write_parquet(
+            df.write_parquet(buffer, **kwargs)

         elif isinstance(file_format, CsvFileFormat):
-            df.write_csv(
+            df.write_csv(buffer, **kwargs)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")

+        self.write_all_bytes(data=buffer.getvalue())
+

 class DatabricksWorkspaceIO(DatabricksIO):
+    """IO adapter for Workspace files."""

     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read bytes from a Workspace file.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            Bytes read from the file.
+        """
         if length == 0:
             return b""

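The write_polars rewrite replaces the incomplete 0.1.31 calls with a buffer-then-upload pattern: serialize the DataFrame into an in-memory BytesIO, then push the complete payload through write_all_bytes in one call. (The unchanged context line still reads `else FileFormat`, falling back to the class rather than the file_format argument, as published.) The pattern itself, sketched with plain polars and no yggdrasil types:

    import io
    import polars as pl

    df = pl.DataFrame({"id": [1, 2], "value": ["a", "b"]})

    buffer = io.BytesIO()
    df.write_parquet(buffer)       # polars writes to any file-like object
    payload = buffer.getvalue()    # full serialized bytes, uploaded in one shot
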
@@ -597,6 +929,14 @@ class DatabricksWorkspaceIO(DatabricksIO):
         return data[start:end]

     def write_all_bytes(self, data: bytes):
+        """Write bytes to a Workspace file.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            The DatabricksWorkspaceIO instance.
+        """
         sdk = self.workspace.sdk()
         workspace_client = sdk.workspace
         full_path = self.path.workspace_full_path()
@@ -629,8 +969,19 @@


 class DatabricksVolumeIO(DatabricksIO):
+    """IO adapter for Unity Catalog volume files."""

     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read bytes from a volume file.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            Bytes read from the file.
+        """
         if length == 0:
             return b""

@@ -648,6 +999,14 @@ class DatabricksVolumeIO(DatabricksIO):
         return result

     def write_all_bytes(self, data: bytes):
+        """Write bytes to a volume file.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            The DatabricksVolumeIO instance.
+        """
         sdk = self.workspace.sdk()
         client = sdk.files
         full_path = self.path.files_full_path()
@@ -678,8 +1037,19 @@


 class DatabricksDBFSIO(DatabricksIO):
+    """IO adapter for DBFS files."""

     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read bytes from a DBFS file.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            Bytes read from the file.
+        """
         if length == 0:
             return b""

@@ -714,6 +1084,14 @@ class DatabricksDBFSIO(DatabricksIO):
         return bytes(read_bytes)

     def write_all_bytes(self, data: bytes):
+        """Write bytes to a DBFS file.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            The DatabricksDBFSIO instance.
+        """
         sdk = self.workspace.sdk()
         client = sdk.dbfs
         full_path = self.path.dbfs_full_path()