ygg 0.1.51__py3-none-any.whl → 0.1.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/METADATA +1 -1
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/RECORD +18 -16
- yggdrasil/databricks/sql/engine.py +288 -84
- yggdrasil/databricks/sql/exceptions.py +3 -1
- yggdrasil/databricks/workspaces/io.py +78 -69
- yggdrasil/databricks/workspaces/path.py +367 -166
- yggdrasil/databricks/workspaces/path_kind.py +3 -3
- yggdrasil/databricks/workspaces/volumes_path.py +85 -0
- yggdrasil/libs/databrickslib.py +5 -0
- yggdrasil/pyutils/callable_serde.py +10 -10
- yggdrasil/pyutils/retry.py +2 -2
- yggdrasil/types/cast/registry.py +0 -14
- yggdrasil/types/file_format.py +10 -0
- yggdrasil/version.py +1 -1
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/WHEEL +0 -0
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.51.dist-info → ygg-0.1.52.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/io.py
@@ -9,13 +9,18 @@ from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
 import pyarrow as pa
 import pyarrow.csv as pcsv
 import pyarrow.parquet as pq
-from
+from Lib.threading import Thread
+from pyarrow.dataset import (
+    FileFormat,
+    ParquetFileFormat, CsvFileFormat,
+)

 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...libs.pandaslib import
+from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
 from ...types.cast.registry import convert
+from ...types.file_format import ExcelFileFormat

 if databricks is not None:
     from databricks.sdk.service.workspace import ImportFormat, ExportFormat
@@ -66,7 +71,10 @@ class DatabricksIO(ABC, IO):
         self.close()

     def __del__(self):
-
+        try:
+            Thread(target=self.close).start()
+        except BaseException:
+            pass

     def __next__(self):
         """Iterate over lines in the file."""
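The new finalizer offloads `close()` to a worker thread so garbage collection is not blocked by a slow remote flush, and swallows any error, since finalizers must never raise (for example during interpreter shutdown). A minimal sketch of the same pattern, using the stdlib `threading` module (the class name is illustrative):

import io
from threading import Thread

class LazyClosingIO:
    """Sketch of the __del__ pattern above."""

    def __init__(self):
        self._buffer = io.BytesIO()

    def close(self):
        # Stand-in for flushing to the remote and clearing the buffer.
        self._buffer = None

    def __del__(self):
        try:
            # Run close() off the finalizer's critical path.
            Thread(target=self.close).start()
        except BaseException:
            # Thread creation can fail at interpreter shutdown; never raise here.
            pass

obj = LazyClosingIO()
del obj  # finalizer schedules close() on a worker thread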
@@ -75,8 +83,11 @@ class DatabricksIO(ABC, IO):
             raise StopIteration
         return line

+    def __len__(self):
+        return self.content_length or 0
+
     def __iter__(self):
-        return self
+        return self.read_all_bytes().__iter__()

     def __hash__(self):
         return self.path.__hash__()
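`__len__` now reports the content length, and `__iter__` returns an iterator over the full byte content instead of the object itself (which previously iterated line-by-line through `__next__`). Note the semantic shift: iterating a `bytes` object yields one integer per byte, not lines.

data = b"abc"

# `for x in io_obj` now walks the content byte-by-byte as ints:
assert list(data.__iter__()) == [97, 98, 99]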
@@ -87,7 +98,6 @@ class DatabricksIO(ABC, IO):
         path: "DatabricksPath",
         mode: str,
         encoding: Optional[str] = None,
-        compression: Optional[str] = "detect",
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ) -> "DatabricksIO":
@@ -97,7 +107,6 @@ class DatabricksIO(ABC, IO):
             path: DatabricksPath to open.
             mode: File mode string.
             encoding: Optional text encoding for text mode.
-            compression: Optional compression mode.
             position: Initial file cursor position.
             buffer: Optional pre-seeded buffer.

@@ -170,6 +179,10 @@ class DatabricksIO(ABC, IO):
     def content_length(self) -> int:
         return self.path.content_length

+    @content_length.setter
+    def content_length(self, value: int):
+        self.path.content_length = value
+
     def size(self):
         """Return the size of the file in bytes.

@@ -178,10 +191,6 @@ class DatabricksIO(ABC, IO):
         """
         return self.content_length

-    @content_length.setter
-    def content_length(self, value: int):
-        self.path.content_length = value
-
     @property
     def buffer(self):
         """Return the in-memory buffer, creating it if necessary.
@@ -204,9 +213,7 @@ class DatabricksIO(ABC, IO):
         Returns:
             None.
         """
-
-        self._buffer.close()
-        self._buffer = None
+        self._buffer = None

     def clone_instance(self, **kwargs):
         """Clone this IO instance with optional overrides.
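One plausible reason for dropping the reference instead of calling `close()`: a `BytesIO` refuses to close while a memoryview is exported, whereas simply dereferencing it defers cleanup to the garbage collector. A runnable illustration:

import io

buf = io.BytesIO(b"data")
view = buf.getbuffer()

# buf.close() would raise BufferError while `view` is exported;
# dropping the reference (the new clear_buffer behavior) does not,
# and the memoryview keeps the underlying storage alive:
buf = None
assert bytes(view) == b"data"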
@@ -251,15 +258,23 @@ class DatabricksIO(ABC, IO):
         self.path = path
         return self

-    def close(self):
+    def close(self, flush: bool = True):
         """Flush pending writes and close the buffer.

+        Args:
+            flush: Whether to flush pending data to the remote location before closing.
+
         Returns:
             None.
         """
-
+        if flush:
+            self.flush()
         self.clear_buffer()

+    @property
+    def closed(self):
+        return False
+
     def fileno(self):
         """Return a pseudo file descriptor based on object hash.

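`close()` now flushes by default and can discard buffered data with `flush=False`, while the new `closed` property is hard-wired to `False` so wrappers that check it keep treating the object as usable. A self-contained sketch of the commit-or-discard behavior (the class and attribute names are illustrative stand-ins):

class BufferedSink:
    """Illustrative stand-in for the buffered remote IO."""

    def __init__(self):
        self._pending = bytearray()
        self.committed = b""

    def flush(self):
        self.committed += bytes(self._pending)
        self._pending.clear()

    def close(self, flush: bool = True):
        if flush:
            self.flush()
        self._pending = None  # clear_buffer()

sink = BufferedSink()
sink._pending += b"data"
sink.close()                    # commits b"data"
assert sink.committed == b"data"

discarded = BufferedSink()
discarded._pending += b"data"
discarded.close(flush=False)    # drops the buffer without committing
assert discarded.committed == b""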
@@ -508,14 +523,14 @@ class DatabricksIO(ABC, IO):
         if size is None:
             size = self.position

-        if self._buffer is
-            self.
-
-
-        self.write_all_bytes(data=data)
+        if self._buffer is None:
+            return self.write_all_bytes(data=b"\x00" * size)
+
+        self._buffer.truncate(size)

         self.content_length = size
         self._write_flag = True
+
         return size

     def flush(self):
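When no buffer has been materialized yet, `truncate()` now writes `size` zero bytes straight to the remote file; otherwise it truncates the in-memory buffer in place. A runnable sketch of that branch logic (the helper name is made up):

import io

def truncate_sketch(buffer, size, write_all_bytes):
    # No local buffer: materialize `size` zero bytes at the destination.
    if buffer is None:
        return write_all_bytes(b"\x00" * size)
    # Otherwise shrink the in-memory buffer in place.
    buffer.truncate(size)
    return size

written = []
truncate_sketch(None, 4, written.append)
assert written == [b"\x00\x00\x00\x00"]

buf = io.BytesIO(b"abcdef")
assert truncate_sketch(buf, 3, written.append) == 3
assert buf.getvalue() == b"abc"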
@@ -588,7 +603,13 @@ class DatabricksIO(ABC, IO):
         Returns:
             None.
         """
-
+        data = self.read_all_bytes(use_cache=False)
+
+        if isinstance(dest, DatabricksIO):
+            dest.write_all_bytes(data=data)
+        elif hasattr(dest, "write"):
+            dest.write(data)
+        else:
             from .path import DatabricksPath

             dest_path = DatabricksPath.parse(dest, workspace=self.workspace)
@@ -596,8 +617,6 @@ class DatabricksIO(ABC, IO):
             with dest_path.open(mode="wb") as d:
                 return self.copy_to(dest=d)

-        dest.write_all_bytes(data=self.read_all_bytes(use_cache=False))
-
     # ---- format helpers ----

     def _reset_for_write(self):
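`copy_to` now reads the source once and dispatches on the destination type: another `DatabricksIO` gets `write_all_bytes`, any file-like object gets `write()`, and anything else is parsed as a `DatabricksPath` and opened for writing. A self-contained sketch of the same dispatch order (`FakeRemote` is a stand-in for the remote IO type):

import io

class FakeRemote:
    """Stand-in for a DatabricksIO destination in this sketch."""

    def __init__(self):
        self.data = b""

    def write_all_bytes(self, data):
        self.data = data

def copy_to(data, dest):
    if isinstance(dest, FakeRemote):
        dest.write_all_bytes(data=data)
    elif hasattr(dest, "write"):
        dest.write(data)
    else:
        raise TypeError("would be parsed as a path and opened for writing")

remote = FakeRemote()
copy_to(b"payload", remote)
assert remote.data == b"payload"

local = io.BytesIO()
copy_to(b"payload", local)
assert local.getvalue() == b"payload"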
@@ -616,7 +635,6 @@ class DatabricksIO(ABC, IO):
         table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a table-like object to the path using an inferred format.

@@ -624,19 +642,18 @@ class DatabricksIO(ABC, IO):
             table: Table-like object to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             The result of the specific write implementation.
         """
         if isinstance(table, pa.Table):
-            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, pa.RecordBatch):
-            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, PolarsDataFrame):
-            return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_polars(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, PandasDataFrame):
-            return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_pandas(table, file_format=file_format, batch_size=batch_size)

         return self.write_arrow(
             table=table,
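`write_table` dispatches on the input type before falling back to `write_arrow`; in 0.1.52 the per-format `**kwargs` passthrough is gone, leaving `file_format` and `batch_size` as the only knobs. A small sketch of the isinstance chain (the returned strings echo the method names above):

import pyarrow as pa

def dispatch(table):
    if isinstance(table, pa.Table):
        return "write_arrow_table"
    elif isinstance(table, pa.RecordBatch):
        return "write_arrow_batch"
    return "write_arrow"  # generic fallback for other table-likes

assert dispatch(pa.table({"x": [1, 2]})) == "write_arrow_table"
assert dispatch(pa.record_batch([pa.array([1, 2])], names=["x"])) == "write_arrow_batch"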
@@ -679,7 +696,6 @@ class DatabricksIO(ABC, IO):
         table: Union[pa.Table, pa.RecordBatch],
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table or record batch to the path.

@@ -687,7 +703,6 @@ class DatabricksIO(ABC, IO):
             table: Arrow table or batch to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -699,7 +714,6 @@ class DatabricksIO(ABC, IO):
             table=table,
             file_format=file_format,
             batch_size=batch_size,
-            **kwargs
         )

     def write_arrow_table(
@@ -707,7 +721,6 @@ class DatabricksIO(ABC, IO):
         table: pa.Table,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table using the selected file format.

@@ -715,7 +728,6 @@ class DatabricksIO(ABC, IO):
             table: Arrow table to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -724,13 +736,20 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            pq.write_table(
+            pq.write_table(
+                table, buffer,
+                write_batch_size=batch_size
+            )

         elif isinstance(file_format, CsvFileFormat):
-            pcsv.write_csv(table, buffer, **kwargs)
+            pcsv.write_csv(table, buffer)

         else:
-
+            return self.write_polars(
+                df=polars.from_arrow(table),
+                file_format=file_format,
+                batch_size=batch_size
+            )

         self.write_all_bytes(data=buffer.getvalue())

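For formats other than Parquet and CSV, `write_arrow_table` now converts the Arrow table to polars and re-dispatches, so any format `write_polars` handles (including the new Excel branch further down) becomes reachable from Arrow input. The conversion is the fallback's first step and is zero-copy where possible:

import pyarrow as pa
import polars

table = pa.table({"x": [1, 2, 3]})
df = polars.from_arrow(table)  # what the fallback does before re-dispatching
assert df.shape == (3, 1)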
@@ -739,7 +758,6 @@ class DatabricksIO(ABC, IO):
         batch: pa.RecordBatch,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a single Arrow record batch.

@@ -747,26 +765,23 @@ class DatabricksIO(ABC, IO):
             batch: RecordBatch to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
         """
         table = pa.Table.from_batches([batch])
-        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
+        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size)

     def read_arrow_batches(
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Yield Arrow record batches from the file.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             An iterator over Arrow RecordBatch objects.
@@ -776,7 +791,6 @@ class DatabricksIO(ABC, IO):
             .read_arrow_table(
                 file_format=file_format,
                 batch_size=batch_size,
-                **kwargs
             )
             .to_batches(max_chunksize=batch_size)
         )
@@ -787,36 +801,26 @@ class DatabricksIO(ABC, IO):
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Read the file into a pandas DataFrame.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             A pandas DataFrame with the file contents.
         """
-
-
-
-
-            return pandas.read_parquet(self, **kwargs)
-
-        elif isinstance(file_format, CsvFileFormat):
-            return pandas.read_csv(self, **kwargs)
-
-        else:
-            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+        return self.read_arrow_table(
+            file_format=file_format,
+            batch_size=batch_size
+        ).to_pandas()

     def write_pandas(
         self,
         df: PandasDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a pandas DataFrame to the file.

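`read_pandas` no longer calls the pandas readers directly; it reads an Arrow table and converts it, which removes the pandas-specific format branches entirely. The equivalent round trip with pyarrow, using an in-memory buffer:

import io
import pyarrow as pa
import pyarrow.parquet as pq

buf = io.BytesIO()
pq.write_table(pa.table({"x": [1, 2]}), buf)
buf.seek(0)

# Read through Arrow, then convert -- the same route read_pandas now takes:
df = pq.read_table(buf).to_pandas()
assert list(df["x"]) == [1, 2]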
@@ -824,7 +828,6 @@ class DatabricksIO(ABC, IO):
             df: pandas DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -833,13 +836,17 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.to_parquet(buffer, **kwargs)
+            df.to_parquet(buffer)

         elif isinstance(file_format, CsvFileFormat):
-            df.to_csv(buffer, **kwargs)
+            df.to_csv(buffer)

         else:
-
+            return self.write_polars(
+                df=polars.from_pandas(df),
+                file_format=file_format,
+                batch_size=batch_size,
+            )

         self.write_all_bytes(data=buffer.getvalue())

@@ -849,14 +856,12 @@ class DatabricksIO(ABC, IO):
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Read the file into a polars DataFrame.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             A polars DataFrame with the file contents.
@@ -865,10 +870,13 @@ class DatabricksIO(ABC, IO):
         self.seek(0)

         if isinstance(file_format, ParquetFileFormat):
-            return polars.read_parquet(self, **kwargs)
+            return polars.read_parquet(self)

         elif isinstance(file_format, CsvFileFormat):
-            return polars.read_csv(self, **kwargs)
+            return polars.read_csv(self)
+
+        elif isinstance(file_format, ExcelFileFormat):
+            return polars.read_excel(self)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
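`read_polars` hands the IO object straight to polars, which accepts any seekable file-like source; 0.1.52 adds an Excel branch alongside Parquet and CSV. The Parquet path, demonstrated with an in-memory buffer in place of the remote file:

import io
import polars

df = polars.DataFrame({"x": [1, 2]})
buf = io.BytesIO()
df.write_parquet(buf)
buf.seek(0)

# polars reads directly from the file-like object:
assert polars.read_parquet(buf)["x"].to_list() == [1, 2]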
@@ -878,7 +886,6 @@ class DatabricksIO(ABC, IO):
         df: PolarsDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a polars DataFrame to the file.

@@ -886,7 +893,6 @@ class DatabricksIO(ABC, IO):
             df: polars DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -895,10 +901,13 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.write_parquet(buffer, **kwargs)
+            df.write_parquet(buffer)

         elif isinstance(file_format, CsvFileFormat):
-            df.write_csv(buffer, **kwargs)
+            df.write_csv(buffer)
+
+        elif isinstance(file_format, ExcelFileFormat):
+            df.write_excel(buffer)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
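The matching Excel write branch relies on polars' optional Excel dependencies (xlsxwriter for writing, a read engine such as fastexcel for reading). A round trip through an in-memory buffer, assuming those extras are installed:

import io
import polars

df = polars.DataFrame({"x": [1, 2]})
buf = io.BytesIO()
df.write_excel(buf)   # needs xlsxwriter
buf.seek(0)

assert polars.read_excel(buf).shape == (2, 1)  # needs an Excel read engine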