ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/io.py
@@ -0,0 +1,1123 @@
1
+ """File-like IO abstractions for Databricks paths."""
2
+
3
+ import base64
4
+ import io
5
+ import time
6
+ from abc import ABC, abstractmethod
7
+ from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
8
+
9
+ import pyarrow as pa
10
+ import pyarrow.csv as pcsv
11
+ import pyarrow.parquet as pq
12
+ from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat
13
+
14
+ from .path_kind import DatabricksPathKind
15
+ from ...libs.databrickslib import databricks
16
+ from ...types.cast.pandas_cast import PandasDataFrame
17
+ from ...types.cast.polars_pandas_cast import PolarsDataFrame
18
+ from ...types.cast.registry import convert
19
+
20
+ if databricks is not None:
21
+ from databricks.sdk.service.workspace import ImportFormat, ExportFormat
22
+ from databricks.sdk.errors.platform import (
23
+ NotFound,
24
+ ResourceDoesNotExist,
25
+ BadRequest,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from .path import DatabricksPath
30
+
31
+
32
+ __all__ = [
33
+ "DatabricksIO"
34
+ ]
35
+
36
+
37
+ class DatabricksIO(ABC, IO):
38
+ """File-like interface for Databricks workspace, volume, or DBFS paths."""
39
+
40
+ def __init__(
41
+ self,
42
+ path: "DatabricksPath",
43
+ mode: str,
44
+ encoding: Optional[str] = None,
45
+ compression: Optional[str] = "detect",
46
+ position: int = 0,
47
+ buffer: Optional[io.BytesIO] = None,
48
+ ):
49
+ super().__init__()
50
+
51
+ self.encoding = encoding
52
+ self.mode = mode
53
+ self.compression = compression
54
+
55
+ self.path = path
56
+
57
+ self.buffer = buffer
58
+ self.position = position
59
+
60
+ self._write_flag = False
61
+
62
+ def __enter__(self) -> "DatabricksIO":
63
+ """Enter a context manager and connect the underlying path."""
64
+ return self.connect(clone=False)
65
+
66
+ def __exit__(self, exc_type, exc_value, traceback):
67
+ """Exit the context manager and close the buffer."""
68
+ self.close()
69
+
70
+ def __del__(self):
71
+ self.close()
72
+
73
+ def __next__(self):
74
+ """Iterate over lines in the file."""
75
+ line = self.readline()
76
+ if not line:
77
+ raise StopIteration
78
+ return line
79
+
80
+ def __iter__(self):
81
+ return self
82
+
83
+ def __hash__(self):
84
+ return self.path.__hash__()
85
+
86
+ @classmethod
87
+ def create_instance(
88
+ cls,
89
+ path: "DatabricksPath",
90
+ mode: str,
91
+ encoding: Optional[str] = None,
92
+ compression: Optional[str] = "detect",
93
+ position: int = 0,
94
+ buffer: Optional[io.BytesIO] = None,
95
+ ) -> "DatabricksIO":
96
+ """Create the appropriate IO subclass for the given path kind.
97
+
98
+ Args:
99
+ path: DatabricksPath to open.
100
+ mode: File mode string.
101
+ encoding: Optional text encoding for text mode.
102
+ compression: Optional compression mode.
103
+ position: Initial file cursor position.
104
+ buffer: Optional pre-seeded buffer.
105
+
106
+ Returns:
107
+ A DatabricksIO subclass instance.
108
+ """
109
+ if path.kind == DatabricksPathKind.VOLUME:
110
+ return DatabricksVolumeIO(
111
+ path=path,
112
+ mode=mode,
113
+ encoding=encoding,
114
+ compression=compression,
115
+ position=position,
116
+ buffer=buffer,
117
+ )
118
+ elif path.kind == DatabricksPathKind.DBFS:
119
+ return DatabricksDBFSIO(
120
+ path=path,
121
+ mode=mode,
122
+ encoding=encoding,
123
+ compression=compression,
124
+ position=position,
125
+ buffer=buffer,
126
+ )
127
+ elif path.kind == DatabricksPathKind.WORKSPACE:
128
+ return DatabricksWorkspaceIO(
129
+ path=path,
130
+ mode=mode,
131
+ encoding=encoding,
132
+ compression=compression,
133
+ position=position,
134
+ buffer=buffer,
135
+ )
136
+ else:
137
+ raise ValueError(f"Unsupported DatabricksPath kind: {path.kind}")
138
+
139
+ @property
140
+ def workspace(self):
141
+ """Return the associated Workspace instance.
142
+
143
+ Returns:
144
+ The Workspace bound to the path.
145
+ """
146
+ return self.path.workspace
147
+
148
+ @property
149
+ def name(self):
150
+ """Return the name of the underlying path.
151
+
152
+ Returns:
153
+ The path name component.
154
+ """
155
+ return self.path.name
156
+
157
+ @property
158
+ def mode(self):
159
+ return self._mode
160
+
161
+ @mode.setter
162
+ def mode(self, value: str):
163
+ self._mode = value
164
+
165
+ # Basic text/binary behavior:
166
+ # - binary -> encoding None
167
+ # - text -> default utf-8
168
+ if "b" in self._mode:
169
+ self.encoding = None
170
+ else:
171
+ if self.encoding is None:
172
+ self.encoding = "utf-8"
173
+
174
+ @property
175
+ def content_length(self) -> int:
176
+ return self.path.content_length
177
+
178
+ def size(self):
179
+ """Return the size of the file in bytes.
180
+
181
+ Returns:
182
+ The file size in bytes.
183
+ """
184
+ return self.content_length
185
+
186
+ @content_length.setter
187
+ def content_length(self, value: int):
188
+ self.path.content_length = value
189
+
190
+ @property
191
+ def buffer(self):
192
+ """Return the in-memory buffer, creating it if necessary.
193
+
194
+ Returns:
195
+ A BytesIO buffer for the file contents.
196
+ """
197
+ if self._buffer is None:
198
+ self._buffer = io.BytesIO()
199
+ self._buffer.seek(self.position, io.SEEK_SET)
200
+ return self._buffer
201
+
202
+ @buffer.setter
203
+ def buffer(self, value: Optional[io.BytesIO]):
204
+ self._buffer = value
205
+
206
+ def clear_buffer(self):
207
+ """Clear any cached in-memory buffer.
208
+
209
+ Returns:
210
+ None.
211
+ """
212
+ if self._buffer is not None:
213
+ self._buffer.close()
214
+ self._buffer = None
215
+
216
+ def clone_instance(self, **kwargs):
217
+ """Clone this IO instance with optional overrides.
218
+
219
+ Args:
220
+ **kwargs: Field overrides for the new instance.
221
+
222
+ Returns:
223
+ A cloned DatabricksIO instance.
224
+ """
225
+ return self.__class__(
226
+ path=kwargs.get("path", self.path),
227
+ mode=kwargs.get("mode", self.mode),
228
+ encoding=kwargs.get("encoding", self.encoding),
229
+ compression=kwargs.get("compression", self.compression),
230
+ position=kwargs.get("position", self.position),
231
+ buffer=kwargs.get("buffer", self._buffer),
232
+ )
233
+
234
+ @property
235
+ def connected(self):
236
+ """Return True if the underlying path is connected.
237
+
238
+ Returns:
239
+ True if connected, otherwise False.
240
+ """
241
+ return self.path.connected
242
+
243
+ def connect(self, clone: bool = False) -> "DatabricksIO":
244
+ """Connect the underlying path and optionally return a clone.
245
+
246
+ Args:
247
+ clone: Whether to return a cloned instance.
248
+
249
+ Returns:
250
+ The connected DatabricksIO instance.
251
+ """
252
+ path = self.path.connect(clone=clone)
253
+
254
+ if clone:
255
+ return self.clone_instance(path=path)
256
+
257
+ self.path = path
258
+ return self
259
+
260
+ def close(self):
261
+ """Flush pending writes and close the buffer.
262
+
263
+ Returns:
264
+ None.
265
+ """
266
+ self.flush()
267
+ if self._buffer is not None:
268
+ self._buffer.close()
269
+
270
+ def fileno(self):
271
+ """Return a pseudo file descriptor based on object hash.
272
+
273
+ Returns:
274
+ An integer file descriptor-like value.
275
+ """
276
+ return hash(self)
277
+
278
+ def isatty(self):
279
+ return False
280
+
281
+ def tell(self):
282
+ """Return the current cursor position.
283
+
284
+ Returns:
285
+ The current position in bytes.
286
+ """
287
+ return self.position
288
+
289
+ def seekable(self):
290
+ """Return True to indicate seek support.
291
+
292
+ Returns:
293
+ True.
294
+ """
295
+ return True
296
+
297
+ def seek(self, offset, whence=0, /):
298
+ """Move the cursor to a new position.
299
+
300
+ Args:
301
+ offset: Offset in bytes.
302
+ whence: Reference point (start, current, end).
303
+
304
+ Returns:
305
+ The new position in bytes.
306
+ """
307
+ if whence == io.SEEK_SET:
308
+ new_position = offset
309
+ elif whence == io.SEEK_CUR:
310
+ new_position = self.position + offset
311
+ elif whence == io.SEEK_END:
312
+ end_position = self.content_length
313
+ new_position = end_position + offset
314
+ else:
315
+ raise ValueError("Invalid value for whence")
316
+
317
+ if new_position < 0:
318
+ raise ValueError("New position is before the start of the file")
319
+
320
+ if self._buffer is not None:
321
+ self._buffer.seek(new_position, io.SEEK_SET)
322
+
323
+ self.position = new_position
324
+ return self.position
325
+
326
+ def readable(self):
327
+ """Return True to indicate read support.
328
+
329
+ Returns:
330
+ True.
331
+ """
332
+ return True
333
+
334
+ def getvalue(self):
335
+ """Return the buffer contents, reading from remote if needed.
336
+
337
+ Returns:
338
+ File contents as bytes.
339
+ """
340
+ if self._buffer is not None:
341
+ return self._buffer.getvalue()
342
+ return self.read_all_bytes()
343
+
344
+ def getbuffer(self):
345
+ """Return the underlying BytesIO buffer.
346
+
347
+ Returns:
348
+ The BytesIO buffer instance.
349
+ """
350
+ return self.buffer
351
+
352
+ @abstractmethod
353
+ def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
354
+ """Read a byte range from the remote path.
355
+
356
+ Args:
357
+ start: Starting byte offset.
358
+ length: Number of bytes to read.
359
+ allow_not_found: Whether to suppress missing-path errors.
360
+
361
+ Returns:
362
+ The bytes read from the remote path.
363
+ """
364
+ pass
365
+
366
+ def read_all_bytes(self, use_cache: bool = True, allow_not_found: bool = False) -> bytes:
367
+ """Read the full contents into memory, optionally caching.
368
+
369
+ Args:
370
+ use_cache: Whether to cache contents in memory.
371
+ allow_not_found: Whether to suppress missing-path errors.
372
+
373
+ Returns:
374
+ File contents as bytes.
375
+ """
376
+ if use_cache and self._buffer is not None:
377
+ buffer_value = self._buffer.getvalue()
378
+
379
+ if len(buffer_value) == self.content_length:
380
+ return buffer_value
381
+
382
+ self._buffer.close()
383
+ self._buffer = None
384
+
385
+ data = self.read_byte_range(0, self.content_length, allow_not_found=allow_not_found)
386
+
387
+ # Keep size accurate even if backend didn't know it
388
+ self.content_length = len(data)
389
+
390
+ if use_cache and self._buffer is None:
391
+ self._buffer = io.BytesIO(data)
392
+ self._buffer.seek(self.position, io.SEEK_SET)
393
+
394
+ return data
395
+
396
+ def read(self, n=-1, use_cache: bool = True):
397
+ """Read up to ``n`` bytes/characters from the file.
398
+
399
+ Args:
400
+ n: Number of bytes/characters to read; -1 for all.
401
+ use_cache: Whether to use cached contents.
402
+
403
+ Returns:
404
+ The read bytes or string depending on mode.
405
+ """
406
+ if not self.readable():
407
+ raise IOError("File not open for reading")
408
+
409
+ current_position = self.position
410
+ all_data = self.read_all_bytes(use_cache=use_cache)
411
+
412
+ if n == -1:
413
+ n = self.content_length - current_position
414
+
415
+ data = all_data[current_position:current_position + n]
416
+ read_length = len(data)
417
+
418
+ self.position += read_length
419
+
420
+ if self.encoding:
421
+ return data.decode(self.encoding)
422
+ return data
423
+
424
+ def readline(self, limit=-1, use_cache: bool = True):
425
+ """Read a single line from the file.
426
+
427
+ Args:
428
+ limit: Max characters/bytes to read; -1 for no limit.
429
+ use_cache: Whether to use cached contents.
430
+
431
+ Returns:
432
+ The next line as bytes or string.
433
+ """
434
+ if not self.readable():
435
+ raise IOError("File not open for reading")
436
+
437
+ if self.encoding:
438
+ # Text-mode: accumulate characters
439
+ out_chars = []
440
+ read_chars = 0
441
+
442
+ while limit == -1 or read_chars < limit:
443
+ ch = self.read(1, use_cache=use_cache)
444
+ if not ch:
445
+ break
446
+ out_chars.append(ch)
447
+ read_chars += 1
448
+ if ch == "\n":
449
+ break
450
+
451
+ return "".join(out_chars)
452
+
453
+ # Binary-mode: accumulate bytes
454
+ line_bytes = bytearray()
455
+ bytes_read = 0
456
+
457
+ while limit == -1 or bytes_read < limit:
458
+ b = self.read(1, use_cache=use_cache)
459
+ if not b:
460
+ break
461
+ line_bytes.extend(b)
462
+ bytes_read += 1
463
+ if b == b"\n":
464
+ break
465
+
466
+ return bytes(line_bytes)
467
+
468
+ def readlines(self, hint=-1, use_cache: bool = True):
469
+ """Read all lines from the file.
470
+
471
+ Args:
472
+ hint: Optional byte/char count hint; -1 for no hint.
473
+ use_cache: Whether to use cached contents.
474
+
475
+ Returns:
476
+ A list of lines.
477
+ """
478
+ if not self.readable():
479
+ raise IOError("File not open for reading")
480
+
481
+ lines = []
482
+ total = 0
483
+
484
+ while True:
485
+ line = self.readline(use_cache=use_cache)
486
+ if not line:
487
+ break
488
+ lines.append(line)
489
+ total += len(line)
490
+ if hint != -1 and total >= hint:
491
+ break
492
+
493
+ return lines
494
+
495
+ def appendable(self):
496
+ """Return True when the file is open in append mode.
497
+
498
+ Returns:
499
+ True if in append mode.
500
+ """
501
+ return "a" in self.mode
502
+
503
+ def writable(self):
504
+ """Return True to indicate write support.
505
+
506
+ Returns:
507
+ True.
508
+ """
509
+ return True
510
+
511
+ @abstractmethod
512
+ def write_all_bytes(self, data: bytes):
513
+ """Write raw bytes to the remote path.
514
+
515
+ Args:
516
+ data: Bytes to write.
517
+
518
+ Returns:
519
+ None.
520
+ """
521
+ pass
522
+
523
+ def truncate(self, size=None, /):
524
+ """Resize the file to ``size`` bytes.
525
+
526
+ Args:
527
+ size: Target size in bytes (defaults to current position).
528
+
529
+ Returns:
530
+ The new size in bytes.
531
+ """
532
+ if size is None:
533
+ size = self.position
534
+
535
+ if self._buffer is not None:
536
+ self._buffer.truncate(size)
537
+ else:
538
+ # no local buffer: rewrite remote contents, keeping data up to the new size
+ data = self.read_all_bytes(allow_not_found=True)
539
+ self.write_all_bytes(data=data[:size].ljust(size, b"\x00"))
540
+
541
+ self.content_length = size
542
+ self._write_flag = True
543
+ return size
544
+
545
+ def flush(self):
546
+ """Flush buffered data to the remote path.
547
+
548
+ Returns:
549
+ None.
550
+ """
551
+ if self._write_flag and self._buffer is not None:
552
+ self.write_all_bytes(data=self._buffer.getvalue())
553
+ self._write_flag = False
554
+
555
+ def write(self, data: AnyStr) -> int:
556
+ """Write data to the buffer and mark for flush.
557
+
558
+ Args:
559
+ data: String or bytes to write.
560
+
561
+ Returns:
562
+ The number of bytes written.
563
+ """
564
+ if not self.writable():
565
+ raise IOError("File not open for writing")
566
+
567
+ if isinstance(data, str):
568
+ data = data.encode(self.encoding or "utf-8")
569
+
570
+ written = self.buffer.write(data)
571
+
572
+ self.position += written
573
+ self.content_length = self.position
574
+ self._write_flag = True
575
+
576
+ return written
577
+
578
+ def writelines(self, lines) -> None:
579
+ """Write multiple lines to the buffer.
580
+
581
+ Args:
582
+ lines: Iterable of lines to write.
583
+
584
+ Returns:
585
+ None.
586
+ """
587
+ for line in lines:
588
+ if isinstance(line, str):
589
+ line = line.encode(self.encoding or "utf-8")
590
+ elif not isinstance(line, (bytes, bytearray)):
591
+ raise TypeError(
592
+ "a bytes-like or str object is required, not '{}'".format(type(line).__name__)
593
+ )
594
+
595
+ data = line + b"\n" if not line.endswith(b"\n") else line
596
+ self.write(data)
597
+
598
+ def get_output_stream(self, *args, **kwargs):
599
+ """Return this instance for compatibility with Arrow APIs.
600
+
601
+ Returns:
602
+ The current DatabricksIO instance.
603
+ """
604
+ return self
605
+
606
+ def copy_to(
607
+ self,
608
+ dest: Union["DatabricksIO", "DatabricksPath", str]
609
+ ) -> None:
610
+ """Copy the file contents to another Databricks IO/path.
611
+
612
+ Args:
613
+ dest: Destination IO, DatabricksPath, or path string.
614
+
615
+ Returns:
616
+ None.
617
+ """
618
+ if not isinstance(dest, DatabricksIO):
619
+ from .path import DatabricksPath
620
+
621
+ dest_path = DatabricksPath.parse(dest, workspace=self.workspace)
622
+
623
+ with dest_path.open(mode="wb") as d:
624
+ return self.copy_to(dest=d)
625
+
626
+ dest.write_all_bytes(data=self.read_all_bytes(use_cache=False))
627
+
628
+ # ---- format helpers ----
629
+
630
+ def _reset_for_write(self):
631
+ if self._buffer is not None:
632
+ self._buffer.seek(0, io.SEEK_SET)
633
+ self._buffer.truncate(0)
634
+
635
+ self.position = 0
636
+ self.content_length = 0
637
+ self._write_flag = True
638
+
639
+ # ---- Data Querying Helpers ----
640
+
641
+ def write_table(
642
+ self,
643
+ table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
644
+ file_format: Optional[FileFormat] = None,
645
+ batch_size: Optional[int] = None,
646
+ **kwargs
647
+ ):
648
+ """Write a table-like object to the path using an inferred format.
649
+
650
+ Args:
651
+ table: Table-like object to write.
652
+ file_format: Optional file format override.
653
+ batch_size: Optional batch size for writes.
654
+ **kwargs: Format-specific options.
655
+
656
+ Returns:
657
+ The result of the specific write implementation.
658
+ """
659
+ if isinstance(table, pa.Table):
660
+ return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
661
+ elif isinstance(table, pa.RecordBatch):
662
+ return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
663
+ elif isinstance(table, PolarsDataFrame):
664
+ return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
665
+ elif isinstance(table, PandasDataFrame):
666
+ return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
667
+ else:
668
+ raise ValueError(f"Cannot write {type(table)} to {self.path}")
669
+
670
+ # ---- Arrow ----
671
+
672
+ def read_arrow_table(
673
+ self,
674
+ file_format: Optional[FileFormat] = None,
675
+ batch_size: Optional[int] = None,
676
+ **kwargs
677
+ ) -> pa.Table:
678
+ """Read the file as an Arrow table.
679
+
680
+ Args:
681
+ file_format: Optional file format override.
682
+ batch_size: Optional batch size for reads.
683
+ **kwargs: Format-specific options.
684
+
685
+ Returns:
686
+ An Arrow Table with the file contents.
687
+ """
688
+ file_format = self.path.file_format if file_format is None else file_format
689
+ self.seek(0)
690
+
691
+ if isinstance(file_format, ParquetFileFormat):
692
+ return pq.read_table(self, **kwargs)
693
+
694
+ if isinstance(file_format, CsvFileFormat):
695
+ return pcsv.read_csv(self, parse_options=file_format.parse_options)
696
+
697
+ raise ValueError(f"Unsupported file format for Arrow table: {file_format}")
698
+
699
+ def write_arrow(
700
+ self,
701
+ table: Union[pa.Table, pa.RecordBatch],
702
+ batch_size: Optional[int] = None,
703
+ **kwargs
704
+ ):
705
+ """Write an Arrow table or record batch to the path.
706
+
707
+ Args:
708
+ table: Arrow table or batch to write.
709
+ batch_size: Optional batch size for writes.
710
+ **kwargs: Format-specific options.
711
+
712
+ Returns:
713
+ None.
714
+ """
715
+ if not isinstance(table, pa.Table):
716
+ table = convert(table, pa.Table)
717
+
718
+ return self.write_arrow_table(
719
+ table=table,
720
+ batch_size=batch_size,
721
+ **kwargs
722
+ )
723
+
724
+ def write_arrow_table(
725
+ self,
726
+ table: pa.Table,
727
+ file_format: Optional[FileFormat] = None,
728
+ batch_size: Optional[int] = None,
729
+ **kwargs
730
+ ):
731
+ """Write an Arrow table using the selected file format.
732
+
733
+ Args:
734
+ table: Arrow table to write.
735
+ file_format: Optional file format override.
736
+ batch_size: Optional batch size for writes.
737
+ **kwargs: Format-specific options.
738
+
739
+ Returns:
740
+ None.
741
+ """
742
+ file_format = self.path.file_format if file_format is None else file_format
743
+ buffer = io.BytesIO()
744
+
745
+ if isinstance(file_format, ParquetFileFormat):
746
+ pq.write_table(table, buffer, write_batch_size=batch_size, **kwargs)
747
+
748
+ elif isinstance(file_format, CsvFileFormat):
749
+ pcsv.write_csv(table, buffer, **kwargs)
750
+
751
+ else:
752
+ raise ValueError(f"Unsupported file format for Arrow table: {file_format}")
753
+
754
+ self.write_all_bytes(data=buffer.getvalue())
755
+
756
+ def write_arrow_batch(
757
+ self,
758
+ batch: pa.RecordBatch,
759
+ file_format: Optional[FileFormat] = None,
760
+ batch_size: Optional[int] = None,
761
+ **kwargs
762
+ ):
763
+ """Write a single Arrow record batch.
764
+
765
+ Args:
766
+ batch: RecordBatch to write.
767
+ file_format: Optional file format override.
768
+ batch_size: Optional batch size for writes.
769
+ **kwargs: Format-specific options.
770
+
771
+ Returns:
772
+ None.
773
+ """
774
+ table = pa.Table.from_batches([batch])
775
+ self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
776
+
777
+ def read_arrow_batches(
778
+ self,
779
+ batch_size: Optional[int] = None,
780
+ **kwargs
781
+ ):
782
+ """Yield Arrow record batches from the file.
783
+
784
+ Args:
785
+ batch_size: Optional batch size for reads.
786
+ **kwargs: Format-specific options.
787
+
788
+ Returns:
789
+ An iterator over Arrow RecordBatch objects.
790
+ """
791
+ return (
792
+ self
793
+ .read_arrow_table(batch_size=batch_size, **kwargs)
794
+ .to_batches(max_chunksize=batch_size)
795
+ )
796
+
797
+ # ---- Pandas ----
798
+
799
+ def read_pandas(
800
+ self,
801
+ batch_size: Optional[int] = None,
802
+ **kwargs
803
+ ):
804
+ """Read the file into a pandas DataFrame.
805
+
806
+ Args:
807
+ batch_size: Optional batch size for reads.
808
+ **kwargs: Format-specific options.
809
+
810
+ Returns:
811
+ A pandas DataFrame with the file contents.
812
+ """
813
+ return self.read_arrow_table(batch_size=batch_size, **kwargs).to_pandas()
814
+
815
+ def write_pandas(
816
+ self,
817
+ df,
818
+ batch_size: Optional[int] = None,
819
+ **kwargs
820
+ ):
821
+ """Write a pandas DataFrame to the file.
822
+
823
+ Args:
824
+ df: pandas DataFrame to write.
825
+ batch_size: Optional batch size for writes.
826
+ **kwargs: Format-specific options.
827
+
828
+ Returns:
829
+ None.
830
+ """
831
+ self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
832
+
833
+ # ---- Polars ----
834
+
835
+ def read_polars(
836
+ self,
837
+ file_format: Optional[FileFormat] = None,
838
+ batch_size: Optional[int] = None,
839
+ **kwargs
840
+ ):
841
+ """Read the file into a polars DataFrame.
842
+
843
+ Args:
844
+ file_format: Optional file format override.
845
+ batch_size: Optional batch size for reads.
846
+ **kwargs: Format-specific options.
847
+
848
+ Returns:
849
+ A polars DataFrame with the file contents.
850
+ """
851
+ import polars as pl
852
+
853
+ file_format = self.path.file_format if file_format is None else file_format
854
+ self.seek(0)
855
+
856
+ if isinstance(file_format, ParquetFileFormat):
857
+ return pl.read_parquet(self, **kwargs)
858
+
859
+ if isinstance(file_format, CsvFileFormat):
860
+ return pl.read_csv(self, **kwargs)
861
+
862
+ raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
863
+
864
+ def write_polars(
865
+ self,
866
+ df,
867
+ file_format: Optional[FileFormat] = None,
868
+ batch_size: Optional[int] = None,
869
+ **kwargs
870
+ ):
871
+ """Write a polars DataFrame to the file.
872
+
873
+ Args:
874
+ df: polars DataFrame to write.
875
+ file_format: Optional file format override.
876
+ batch_size: Optional batch size for writes.
877
+ **kwargs: Format-specific options.
878
+
879
+ Returns:
880
+ None.
881
+ """
882
+ file_format = self.path.file_format if file_format is None else file_format
883
+ buffer = io.BytesIO()
884
+
885
+ if isinstance(file_format, ParquetFileFormat):
886
+ df.write_parquet(buffer, **kwargs)
887
+
888
+ elif isinstance(file_format, CsvFileFormat):
889
+ df.write_csv(buffer, **kwargs)
890
+
891
+ else:
892
+ raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
893
+
894
+ self.write_all_bytes(data=buffer.getvalue())
895
+
896
+
897
+ class DatabricksWorkspaceIO(DatabricksIO):
898
+ """IO adapter for Workspace files."""
899
+
900
+ def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
901
+ """Read bytes from a Workspace file.
902
+
903
+ Args:
904
+ start: Starting byte offset.
905
+ length: Number of bytes to read.
906
+ allow_not_found: Whether to suppress missing-path errors.
907
+
908
+ Returns:
909
+ Bytes read from the file.
910
+ """
911
+ if length == 0:
912
+ return b""
913
+
914
+ sdk = self.workspace.sdk()
915
+ client = sdk.workspace
916
+ full_path = self.path.workspace_full_path()
917
+
918
+ result = client.download(
919
+ path=full_path,
920
+ format=ExportFormat.AUTO,
921
+ )
922
+
923
+ if result is None:
924
+ return b""
925
+
926
+ data = result.read()
927
+
928
+ end = start + length
929
+ return data[start:end]
930
+
931
+ def write_all_bytes(self, data: bytes):
932
+ """Write bytes to a Workspace file.
933
+
934
+ Args:
935
+ data: Bytes to write.
936
+
937
+ Returns:
938
+ The DatabricksWorkspaceIO instance.
939
+ """
940
+ sdk = self.workspace.sdk()
941
+ workspace_client = sdk.workspace
942
+ full_path = self.path.workspace_full_path()
943
+
944
+ try:
945
+ workspace_client.upload(
946
+ full_path,
947
+ data,
948
+ format=ImportFormat.AUTO,
949
+ overwrite=True
950
+ )
951
+ except (NotFound, ResourceDoesNotExist, BadRequest):
952
+ self.path.parent.make_workspace_dir(parents=True)
953
+
954
+ workspace_client.upload(
955
+ full_path,
956
+ data,
957
+ format=ImportFormat.AUTO,
958
+ overwrite=True
959
+ )
960
+
961
+ self.path.reset_metadata(
962
+ is_file=True,
963
+ is_dir=False,
964
+ size=len(data),
965
+ mtime=time.time()
966
+ )
967
+
968
+ return self
969
+
970
+
971
+ class DatabricksVolumeIO(DatabricksIO):
972
+ """IO adapter for Unity Catalog volume files."""
973
+
974
+ def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
975
+ """Read bytes from a volume file.
976
+
977
+ Args:
978
+ start: Starting byte offset.
979
+ length: Number of bytes to read.
980
+ allow_not_found: Whether to suppress missing-path errors.
981
+
982
+ Returns:
983
+ Bytes read from the file.
984
+ """
985
+ if length == 0:
986
+ return b""
987
+
988
+ sdk = self.workspace.sdk()
989
+ client = sdk.files
990
+ full_path = self.path.files_full_path()
991
+
992
+ resp = client.download(full_path)
993
+ contents = resp.contents
994
+ # seek() returns an int offset, so seek and read must be separate calls
995
+ contents.seek(start, io.SEEK_SET)
996
+ result = contents.read(length)
997
+
998
+ return result
1000
+
1001
+ def write_all_bytes(self, data: bytes):
1002
+ """Write bytes to a volume file.
1003
+
1004
+ Args:
1005
+ data: Bytes to write.
1006
+
1007
+ Returns:
1008
+ The DatabricksVolumeIO instance.
1009
+ """
1010
+ sdk = self.workspace.sdk()
1011
+ client = sdk.files
1012
+ full_path = self.path.files_full_path()
1013
+
1014
+ try:
1015
+ client.upload(
1016
+ full_path,
1017
+ io.BytesIO(data),
1018
+ overwrite=True
1019
+ )
1020
+ except (NotFound, ResourceDoesNotExist, BadRequest):
1021
+ self.path.parent.mkdir(parents=True, exist_ok=True)
1022
+
1023
+ client.upload(
1024
+ full_path,
1025
+ io.BytesIO(data),
1026
+ overwrite=True
1027
+ )
1028
+
1029
+ self.path.reset_metadata(
1030
+ is_file=True,
1031
+ is_dir=False,
1032
+ size=len(data),
1033
+ mtime=time.time()
1034
+ )
1035
+
1036
+ return self
1037
+
1038
+
1039
+ class DatabricksDBFSIO(DatabricksIO):
1040
+ """IO adapter for DBFS files."""
1041
+
1042
+ def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
1043
+ """Read bytes from a DBFS file.
1044
+
1045
+ Args:
1046
+ start: Starting byte offset.
1047
+ length: Number of bytes to read.
1048
+ allow_not_found: Whether to suppress missing-path errors.
1049
+
1050
+ Returns:
1051
+ Bytes read from the file.
1052
+ """
1053
+ if length == 0:
1054
+ return b""
1055
+
1056
+ sdk = self.workspace.sdk()
1057
+ client = sdk.dbfs
1058
+ full_path = self.path.dbfs_full_path()
1059
+
1060
+ read_bytes = bytearray()
1061
+ bytes_to_read = length
1062
+ current_position = start
1063
+
1064
+ while bytes_to_read > 0:
1065
+ chunk_size = min(bytes_to_read, 1024 * 1024)  # DBFS read returns at most 1 MB per call
1066
+
1067
+ resp = client.read(
1068
+ path=full_path,
1069
+ offset=current_position,
1070
+ length=chunk_size
1071
+ )
1072
+
1073
+ if not resp.data:
1074
+ break
1075
+
1076
+ # resp.data is base64; decode and move offsets by *decoded* length
1077
+ resp_data_bytes = base64.b64decode(resp.data)
1078
+
1079
+ read_bytes.extend(resp_data_bytes)
1080
+ bytes_read = len(resp_data_bytes)  # length of the decoded bytes, not the base64 string
1081
+ current_position += bytes_read
1082
+ bytes_to_read -= bytes_read
1083
+
1084
+ return bytes(read_bytes)
1085
+
1086
+ def write_all_bytes(self, data: bytes):
1087
+ """Write bytes to a DBFS file.
1088
+
1089
+ Args:
1090
+ data: Bytes to write.
1091
+
1092
+ Returns:
1093
+ The DatabricksDBFSIO instance.
1094
+ """
1095
+ sdk = self.workspace.sdk()
1096
+ client = sdk.dbfs
1097
+ full_path = self.path.dbfs_full_path()
1098
+
1099
+ try:
1100
+ with client.open(
1101
+ path=full_path,
1102
+ read=False,
1103
+ write=True,
1104
+ overwrite=True
1105
+ ) as f:
1106
+ f.write(data)
1107
+ except (NotFound, ResourceDoesNotExist, BadRequest):
1108
+ self.path.parent.mkdir(parents=True, exist_ok=True)
1109
+
1110
+ with client.open(
1111
+ path=full_path,
1112
+ read=False,
1113
+ write=True,
1114
+ overwrite=True
1115
+ ) as f:
1116
+ f.write(data)
1117
+
1118
+ self.path.reset_metadata(
1119
+ is_file=True,
1120
+ is_dir=False,
1121
+ size=len(data),
1122
+ mtime=time.time()
1123
+ )
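The usage pattern for the new io.py module can be pieced together from the diff itself: DatabricksPath.open() hands back the DatabricksIO subclass matching the path kind (see create_instance), and copy_to() shows the instance being used as a context manager. Below is a minimal sketch, assuming DatabricksPath is importable from yggdrasil.databricks.workspaces.path (the relative import io.py itself uses) and that parse() can resolve a default workspace; the diff only shows parse() being called with an explicit workspace= argument, so that part is an assumption.

import pyarrow as pa

from yggdrasil.databricks.workspaces.path import DatabricksPath

# Parse a Unity Catalog volume path. NOTE: resolving a default workspace here
# is an assumption; the diff only shows parse(..., workspace=...) being used.
path = DatabricksPath.parse("/Volumes/catalog/schema/volume/events.parquet")

# open() dispatches to DatabricksVolumeIO via DatabricksIO.create_instance().
with path.open(mode="wb") as f:
    # The format comes from path.file_format; a .parquet suffix mapping to
    # ParquetFileFormat is assumed (path.py is not shown here).
    f.write_arrow_table(pa.table({"id": [1, 2, 3]}))

with path.open(mode="rb") as f:
    table = f.read_arrow_table()   # pyarrow.Table
    df = f.read_pandas()           # pandas.DataFrame via Arrow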