ygg 0.1.50__py3-none-any.whl → 0.1.52__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
@@ -9,13 +9,18 @@ from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
 import pyarrow as pa
 import pyarrow.csv as pcsv
 import pyarrow.parquet as pq
-from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat
+from Lib.threading import Thread
+from pyarrow.dataset import (
+    FileFormat,
+    ParquetFileFormat, CsvFileFormat,
+)

 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...libs.pandaslib import pandas, PandasDataFrame
+from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
 from ...types.cast.registry import convert
+from ...types.file_format import ExcelFileFormat

 if databricks is not None:
     from databricks.sdk.service.workspace import ImportFormat, ExportFormat
@@ -66,7 +71,10 @@ class DatabricksIO(ABC, IO):
         self.close()

     def __del__(self):
-        self.close()
+        try:
+            Thread(target=self.close).start()
+        except BaseException:
+            pass

     def __next__(self):
         """Iterate over lines in the file."""
@@ -75,8 +83,11 @@ class DatabricksIO(ABC, IO):
             raise StopIteration
         return line

+    def __len__(self):
+        return self.content_length or 0
+
     def __iter__(self):
-        return self
+        return self.read_all_bytes().__iter__()

     def __hash__(self):
         return self.path.__hash__()
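This changes the iteration contract: iterating a `bytes` object yields one integer per byte, not lines, so `for x in f` no longer matches the line-based `__next__` protocol above. A quick illustration with plain `bytes`:

```python
import io

data = b"ab\ncd"
print(list(iter(data)))        # [97, 98, 10, 99, 100] -- ints, one per byte
print(list(io.BytesIO(data)))  # [b'ab\n', b'cd'] -- conventional line iteration
```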
@@ -87,7 +98,6 @@ class DatabricksIO(ABC, IO):
         path: "DatabricksPath",
         mode: str,
         encoding: Optional[str] = None,
-        compression: Optional[str] = "detect",
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ) -> "DatabricksIO":
@@ -97,7 +107,6 @@ class DatabricksIO(ABC, IO):
             path: DatabricksPath to open.
             mode: File mode string.
             encoding: Optional text encoding for text mode.
-            compression: Optional compression mode.
             position: Initial file cursor position.
             buffer: Optional pre-seeded buffer.

@@ -170,6 +179,10 @@ class DatabricksIO(ABC, IO):
     def content_length(self) -> int:
         return self.path.content_length

+    @content_length.setter
+    def content_length(self, value: int):
+        self.path.content_length = value
+
     def size(self):
         """Return the size of the file in bytes.

@@ -178,10 +191,6 @@ class DatabricksIO(ABC, IO):
         """
         return self.content_length

-    @content_length.setter
-    def content_length(self, value: int):
-        self.path.content_length = value
-
     @property
     def buffer(self):
         """Return the in-memory buffer, creating it if necessary.
@@ -204,9 +213,7 @@ class DatabricksIO(ABC, IO):
         Returns:
             None.
         """
-        if self._buffer is not None:
-            self._buffer.close()
-            self._buffer = None
+        self._buffer = None

     def clone_instance(self, **kwargs):
         """Clone this IO instance with optional overrides.
@@ -251,15 +258,23 @@ class DatabricksIO(ABC, IO):
         self.path = path
         return self

-    def close(self):
+    def close(self, flush: bool = True):
         """Flush pending writes and close the buffer.

+        Args:
+            flush: Checks flush data to commit to remote location
+
         Returns:
             None.
         """
-        self.flush()
+        if flush:
+            self.flush()
         self.clear_buffer()

+    @property
+    def closed(self):
+        return False
+
     def fileno(self):
         """Return a pseudo file descriptor based on object hash.

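The `flush` flag lets a caller drop a dirty buffer without committing it to the remote file, while the new `closed` property unconditionally reports the stream as open, even after `close()`. A hedged usage sketch, assuming a `DatabricksPath.open` as used elsewhere in this diff:

```python
# Sketch: discard local edits instead of committing them remotely.
# `path` is assumed to be a DatabricksPath, per this module's API.
f = path.open(mode="wb")
f.write(b"scratch data")
f.close(flush=False)  # clears the buffer, skips the remote commit
```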
@@ -508,14 +523,14 @@ class DatabricksIO(ABC, IO):
         if size is None:
             size = self.position

-        if self._buffer is not None:
-            self._buffer.truncate(size)
-        else:
-            data = b"\x00" * size
-            self.write_all_bytes(data=data)
+        if self._buffer is None:
+            return self.write_all_bytes(data=b"\x00" * size)
+
+        self._buffer.truncate(size)

         self.content_length = size
         self._write_flag = True
+
         return size

     def flush(self):
@@ -588,7 +603,13 @@ class DatabricksIO(ABC, IO):
         Returns:
             None.
         """
-        if not isinstance(dest, DatabricksIO):
+        data = self.read_all_bytes(use_cache=False)
+
+        if isinstance(dest, DatabricksIO):
+            dest.write_all_bytes(data=data)
+        elif hasattr(dest, "write"):
+            dest.write(data)
+        else:
             from .path import DatabricksPath

             dest_path = DatabricksPath.parse(dest, workspace=self.workspace)
@@ -596,8 +617,6 @@ class DatabricksIO(ABC, IO):
             with dest_path.open(mode="wb") as d:
                 return self.copy_to(dest=d)

-        dest.write_all_bytes(data=self.read_all_bytes(use_cache=False))
-
     # ---- format helpers ----

     def _reset_for_write(self):
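Taken together, the two hunks above make `copy_to` duck-type its destination: a `DatabricksIO` receives `write_all_bytes`, any object with a `write` method receives the raw bytes, and anything else is parsed as a path and reopened. A sketch of the new file-object branch (the local filename is illustrative):

```python
# Sketch: copy a remote file into any writable object.
# `remote_io` is assumed to be an open DatabricksIO instance.
with open("local_copy.parquet", "wb") as local:
    remote_io.copy_to(dest=local)  # takes the hasattr(dest, "write") branch
```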
@@ -616,7 +635,6 @@ class DatabricksIO(ABC, IO):
         table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a table-like object to the path using an inferred format.

@@ -624,19 +642,18 @@ class DatabricksIO(ABC, IO):
             table: Table-like object to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             The result of the specific write implementation.
         """
         if isinstance(table, pa.Table):
-            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, pa.RecordBatch):
-            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, PolarsDataFrame):
-            return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_polars(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, PandasDataFrame):
-            return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_pandas(table, file_format=file_format, batch_size=batch_size)

         return self.write_arrow(
             table=table,
@@ -666,10 +683,10 @@ class DatabricksIO(ABC, IO):
         self.seek(0)

         if isinstance(file_format, ParquetFileFormat):
-            pq.read_table(self, **kwargs)
+            return pq.read_table(self, **kwargs)

         elif isinstance(file_format, CsvFileFormat):
-            pcsv.read_csv(self, parse_options=file_format.parse_options)
+            return pcsv.read_csv(self, parse_options=file_format.parse_options)

         else:
             ValueError(f"Unsupported file format for Arrow table: {file_format}")
@@ -679,7 +696,6 @@ class DatabricksIO(ABC, IO):
         table: Union[pa.Table, pa.RecordBatch],
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table or record batch to the path.

@@ -687,7 +703,6 @@ class DatabricksIO(ABC, IO):
             table: Arrow table or batch to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -699,7 +714,6 @@ class DatabricksIO(ABC, IO):
             table=table,
             file_format=file_format,
             batch_size=batch_size,
-            **kwargs
         )

     def write_arrow_table(
@@ -707,7 +721,6 @@ class DatabricksIO(ABC, IO):
         table: pa.Table,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table using the selected file format.

@@ -715,7 +728,6 @@ class DatabricksIO(ABC, IO):
             table: Arrow table to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -724,13 +736,20 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            pq.write_table(table, buffer, write_batch_size=batch_size, **kwargs)
+            pq.write_table(
+                table, buffer,
+                write_batch_size=batch_size
+            )

         elif isinstance(file_format, CsvFileFormat):
-            pcsv.write_csv(table, buffer, **kwargs)
+            pcsv.write_csv(table, buffer)

         else:
-            raise ValueError(f"Unsupported file format for Arrow table: {file_format}")
+            return self.write_polars(
+                df=polars.from_arrow(table),
+                file_format=file_format,
+                batch_size=batch_size
+            )

         self.write_all_bytes(data=buffer.getvalue())
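Rather than raising on unknown formats, `write_arrow_table` now converts the table through polars and retries, which is what routes Arrow data to the new Excel path (note that this release also drops the `**kwargs` passthrough across the read/write helpers). The standalone equivalent of that fallback, with an illustrative output filename:

```python
import polars as pl
import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
df = pl.from_arrow(table)   # the conversion write_arrow_table falls back to
df.write_excel("out.xlsx")  # needs polars' optional xlsxwriter dependency
```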
@@ -739,7 +758,6 @@ class DatabricksIO(ABC, IO):
         batch: pa.RecordBatch,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a single Arrow record batch.

@@ -747,26 +765,23 @@ class DatabricksIO(ABC, IO):
             batch: RecordBatch to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
         """
         table = pa.Table.from_batches([batch])
-        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
+        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size)

     def read_arrow_batches(
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Yield Arrow record batches from the file.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             An iterator over Arrow RecordBatch objects.
@@ -776,7 +791,6 @@ class DatabricksIO(ABC, IO):
             .read_arrow_table(
                 file_format=file_format,
                 batch_size=batch_size,
-                **kwargs
             )
             .to_batches(max_chunksize=batch_size)
         )
@@ -787,36 +801,26 @@ class DatabricksIO(ABC, IO):
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Read the file into a pandas DataFrame.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             A pandas DataFrame with the file contents.
         """
-        file_format = self.path.file_format if file_format is None else file_format
-        self.seek(0)
-
-        if isinstance(file_format, ParquetFileFormat):
-            pandas.read_parquet(self, **kwargs)
-
-        elif isinstance(file_format, CsvFileFormat):
-            pandas.read_csv(self, **kwargs)
-
-        else:
-            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+        return self.read_arrow_table(
+            file_format=file_format,
+            batch_size=batch_size
+        ).to_pandas()

     def write_pandas(
         self,
         df: PandasDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a pandas DataFrame to the file.

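`read_pandas` no longer drives the pandas readers directly; it delegates to `read_arrow_table` and converts, so pandas reads inherit the Arrow paths above, including the `return` fix. The same two-step conversion in isolation (the filename is illustrative):

```python
import pyarrow.parquet as pq

table = pq.read_table("data.parquet")  # any Arrow-producing read
df = table.to_pandas()                 # what read_pandas now returns
```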
@@ -824,7 +828,6 @@ class DatabricksIO(ABC, IO):
             df: pandas DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -833,13 +836,17 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.to_parquet(buffer, **kwargs)
+            df.to_parquet(buffer)

         elif isinstance(file_format, CsvFileFormat):
-            df.to_csv(buffer, **kwargs)
+            df.to_csv(buffer)

         else:
-            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+            return self.write_polars(
+                df=polars.from_pandas(df),
+                file_format=file_format,
+                batch_size=batch_size,
+            )

         self.write_all_bytes(data=buffer.getvalue())
@@ -849,14 +856,12 @@ class DatabricksIO(ABC, IO):
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Read the file into a polars DataFrame.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             A polars DataFrame with the file contents.
@@ -865,10 +870,13 @@ class DatabricksIO(ABC, IO):
         self.seek(0)

         if isinstance(file_format, ParquetFileFormat):
-            polars.read_parquet(self, **kwargs)
+            return polars.read_parquet(self)

         elif isinstance(file_format, CsvFileFormat):
-            polars.read_csv(self, **kwargs)
+            return polars.read_csv(self)
+
+        elif isinstance(file_format, ExcelFileFormat):
+            return polars.read_excel(self)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
@@ -878,7 +886,6 @@ class DatabricksIO(ABC, IO):
         df: PolarsDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a polars DataFrame to the file.

@@ -886,7 +893,6 @@ class DatabricksIO(ABC, IO):
             df: polars DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -895,10 +901,13 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.write_parquet(buffer, **kwargs)
+            df.write_parquet(buffer)

         elif isinstance(file_format, CsvFileFormat):
-            df.write_csv(buffer, **kwargs)
+            df.write_csv(buffer)
+
+        elif isinstance(file_format, ExcelFileFormat):
+            df.write_excel(buffer)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")