ygg 0.1.50__py3-none-any.whl → 0.1.52__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
@@ -9,13 +9,18 @@ from typing import TYPE_CHECKING, Optional, IO, AnyStr, Union
 import pyarrow as pa
 import pyarrow.csv as pcsv
 import pyarrow.parquet as pq
-from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat
+from Lib.threading import Thread
+from pyarrow.dataset import (
+    FileFormat,
+    ParquetFileFormat, CsvFileFormat,
+)

 from .path_kind import DatabricksPathKind
 from ...libs.databrickslib import databricks
-from ...libs.pandaslib import pandas, PandasDataFrame
+from ...libs.pandaslib import PandasDataFrame
 from ...libs.polarslib import polars, PolarsDataFrame
 from ...types.cast.registry import convert
+from ...types.file_format import ExcelFileFormat

 if databricks is not None:
     from databricks.sdk.service.workspace import ImportFormat, ExportFormat
@@ -66,7 +71,10 @@ class DatabricksIO(ABC, IO):
         self.close()

     def __del__(self):
-        self.close()
+        try:
+            Thread(target=self.close).start()
+        except BaseException:
+            pass

     def __next__(self):
         """Iterate over lines in the file."""
@@ -75,8 +83,11 @@ class DatabricksIO(ABC, IO):
             raise StopIteration
         return line

+    def __len__(self):
+        return self.content_length or 0
+
     def __iter__(self):
-        return self
+        return self.read_all_bytes().__iter__()

     def __hash__(self):
         return self.path.__hash__()
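This changes the iteration contract: iterating a `bytes` object yields one integer per byte, not lines, so `for x in f` no longer matches the line-based `__next__` protocol above. A quick illustration with plain `bytes`:

```python
import io

data = b"ab\ncd"
print(list(iter(data)))        # [97, 98, 10, 99, 100] -- ints, one per byte
print(list(io.BytesIO(data)))  # [b'ab\n', b'cd'] -- conventional line iteration
```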
@@ -87,7 +98,6 @@ class DatabricksIO(ABC, IO):
         path: "DatabricksPath",
         mode: str,
         encoding: Optional[str] = None,
-        compression: Optional[str] = "detect",
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ) -> "DatabricksIO":
@@ -97,7 +107,6 @@ class DatabricksIO(ABC, IO):
             path: DatabricksPath to open.
             mode: File mode string.
             encoding: Optional text encoding for text mode.
-            compression: Optional compression mode.
             position: Initial file cursor position.
             buffer: Optional pre-seeded buffer.

@@ -170,6 +179,10 @@ class DatabricksIO(ABC, IO):
     def content_length(self) -> int:
         return self.path.content_length

+    @content_length.setter
+    def content_length(self, value: int):
+        self.path.content_length = value
+
     def size(self):
         """Return the size of the file in bytes.

@@ -178,10 +191,6 @@ class DatabricksIO(ABC, IO):
         """
         return self.content_length

-    @content_length.setter
-    def content_length(self, value: int):
-        self.path.content_length = value
-
     @property
     def buffer(self):
         """Return the in-memory buffer, creating it if necessary.
@@ -204,9 +213,7 @@ class DatabricksIO(ABC, IO):
         Returns:
             None.
         """
-        if self._buffer is not None:
-            self._buffer.close()
-            self._buffer = None
+        self._buffer = None

     def clone_instance(self, **kwargs):
         """Clone this IO instance with optional overrides.
@@ -251,15 +258,23 @@ class DatabricksIO(ABC, IO):
         self.path = path
         return self

-    def close(self):
+    def close(self, flush: bool = True):
         """Flush pending writes and close the buffer.

+        Args:
+            flush: Checks flush data to commit to remote location
+
         Returns:
             None.
         """
-        self.flush()
+        if flush:
+            self.flush()
         self.clear_buffer()

+    @property
+    def closed(self):
+        return False
+
     def fileno(self):
         """Return a pseudo file descriptor based on object hash.

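The `flush` flag lets a caller drop a dirty buffer without committing it to the remote file, while the new `closed` property unconditionally reports the stream as open, even after `close()`. A hedged usage sketch, assuming a `DatabricksPath.open` as used elsewhere in this diff:

```python
# Sketch: discard local edits instead of committing them remotely.
# `path` is assumed to be a DatabricksPath, per this module's API.
f = path.open(mode="wb")
f.write(b"scratch data")
f.close(flush=False)  # clears the buffer, skips the remote commit
```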
@@ -508,14 +523,14 @@ class DatabricksIO(ABC, IO):
         if size is None:
             size = self.position

-        if self._buffer is not None:
-            self._buffer.truncate(size)
-        else:
-            data = b"\x00" * size
-            self.write_all_bytes(data=data)
+        if self._buffer is None:
+            return self.write_all_bytes(data=b"\x00" * size)
+
+        self._buffer.truncate(size)

         self.content_length = size
         self._write_flag = True
+
         return size

     def flush(self):
@@ -588,7 +603,13 @@ class DatabricksIO(ABC, IO):
         Returns:
             None.
         """
-        if not isinstance(dest, DatabricksIO):
+        data = self.read_all_bytes(use_cache=False)
+
+        if isinstance(dest, DatabricksIO):
+            dest.write_all_bytes(data=data)
+        elif hasattr(dest, "write"):
+            dest.write(data)
+        else:
             from .path import DatabricksPath

             dest_path = DatabricksPath.parse(dest, workspace=self.workspace)
@@ -596,8 +617,6 @@ class DatabricksIO(ABC, IO):
             with dest_path.open(mode="wb") as d:
                 return self.copy_to(dest=d)

-        dest.write_all_bytes(data=self.read_all_bytes(use_cache=False))
-
     # ---- format helpers ----

     def _reset_for_write(self):
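Taken together, the two hunks above make `copy_to` duck-type its destination: a `DatabricksIO` receives `write_all_bytes`, any object with a `write` method receives the raw bytes, and anything else is parsed as a path and reopened. A sketch of the new file-object branch (the local filename is illustrative):

```python
# Sketch: copy a remote file into any writable object.
# `remote_io` is assumed to be an open DatabricksIO instance.
with open("local_copy.parquet", "wb") as local:
    remote_io.copy_to(dest=local)  # takes the hasattr(dest, "write") branch
```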
@@ -616,7 +635,6 @@ class DatabricksIO(ABC, IO):
         table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a table-like object to the path using an inferred format.

@@ -624,19 +642,18 @@ class DatabricksIO(ABC, IO):
             table: Table-like object to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             The result of the specific write implementation.
         """
         if isinstance(table, pa.Table):
-            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, pa.RecordBatch):
-            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, PolarsDataFrame):
-            return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_polars(table, file_format=file_format, batch_size=batch_size)
         elif isinstance(table, PandasDataFrame):
-            return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
+            return self.write_pandas(table, file_format=file_format, batch_size=batch_size)

         return self.write_arrow(
             table=table,
@@ -666,10 +683,10 @@ class DatabricksIO(ABC, IO):
         self.seek(0)

         if isinstance(file_format, ParquetFileFormat):
-            pq.read_table(self, **kwargs)
+            return pq.read_table(self, **kwargs)

         elif isinstance(file_format, CsvFileFormat):
-            pcsv.read_csv(self, parse_options=file_format.parse_options)
+            return pcsv.read_csv(self, parse_options=file_format.parse_options)

         else:
             ValueError(f"Unsupported file format for Arrow table: {file_format}")
@@ -679,7 +696,6 @@ class DatabricksIO(ABC, IO):
         table: Union[pa.Table, pa.RecordBatch],
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table or record batch to the path.

@@ -687,7 +703,6 @@ class DatabricksIO(ABC, IO):
             table: Arrow table or batch to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -699,7 +714,6 @@ class DatabricksIO(ABC, IO):
             table=table,
             file_format=file_format,
             batch_size=batch_size,
-            **kwargs
         )

     def write_arrow_table(
@@ -707,7 +721,6 @@ class DatabricksIO(ABC, IO):
         table: pa.Table,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write an Arrow table using the selected file format.

@@ -715,7 +728,6 @@ class DatabricksIO(ABC, IO):
             table: Arrow table to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -724,13 +736,20 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            pq.write_table(table, buffer, write_batch_size=batch_size, **kwargs)
+            pq.write_table(
+                table, buffer,
+                write_batch_size=batch_size
+            )

         elif isinstance(file_format, CsvFileFormat):
-            pcsv.write_csv(table, buffer, **kwargs)
+            pcsv.write_csv(table, buffer)

         else:
-            raise ValueError(f"Unsupported file format for Arrow table: {file_format}")
+            return self.write_polars(
+                df=polars.from_arrow(table),
+                file_format=file_format,
+                batch_size=batch_size
+            )

         self.write_all_bytes(data=buffer.getvalue())
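Rather than raising on unknown formats, `write_arrow_table` now converts the table through polars and retries, which is what routes Arrow data to the new Excel path (note that this release also drops the `**kwargs` passthrough across the read/write helpers). The standalone equivalent of that fallback, with an illustrative output filename:

```python
import polars as pl
import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
df = pl.from_arrow(table)   # the conversion write_arrow_table falls back to
df.write_excel("out.xlsx")  # needs polars' optional xlsxwriter dependency
```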
@@ -739,7 +758,6 @@ class DatabricksIO(ABC, IO):
         batch: pa.RecordBatch,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a single Arrow record batch.

@@ -747,26 +765,23 @@ class DatabricksIO(ABC, IO):
             batch: RecordBatch to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
         """
         table = pa.Table.from_batches([batch])
-        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
+        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size)

     def read_arrow_batches(
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Yield Arrow record batches from the file.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             An iterator over Arrow RecordBatch objects.
@@ -776,7 +791,6 @@ class DatabricksIO(ABC, IO):
             .read_arrow_table(
                 file_format=file_format,
                 batch_size=batch_size,
-                **kwargs
             )
             .to_batches(max_chunksize=batch_size)
         )
@@ -787,36 +801,26 @@ class DatabricksIO(ABC, IO):
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Read the file into a pandas DataFrame.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             A pandas DataFrame with the file contents.
         """
-        file_format = self.path.file_format if file_format is None else file_format
-        self.seek(0)
-
-        if isinstance(file_format, ParquetFileFormat):
-            pandas.read_parquet(self, **kwargs)
-
-        elif isinstance(file_format, CsvFileFormat):
-            pandas.read_csv(self, **kwargs)
-
-        else:
-            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+        return self.read_arrow_table(
+            file_format=file_format,
+            batch_size=batch_size
+        ).to_pandas()

     def write_pandas(
         self,
         df: PandasDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a pandas DataFrame to the file.

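`read_pandas` no longer drives the pandas readers directly; it delegates to `read_arrow_table` and converts, so pandas reads inherit the Arrow paths above, including the `return` fix. The same two-step conversion in isolation (the filename is illustrative):

```python
import pyarrow.parquet as pq

table = pq.read_table("data.parquet")  # any Arrow-producing read
df = table.to_pandas()                 # what read_pandas now returns
```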
@@ -824,7 +828,6 @@ class DatabricksIO(ABC, IO):
             df: pandas DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -833,13 +836,17 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.to_parquet(buffer, **kwargs)
+            df.to_parquet(buffer)

         elif isinstance(file_format, CsvFileFormat):
-            df.to_csv(buffer, **kwargs)
+            df.to_csv(buffer)

         else:
-            raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+            return self.write_polars(
+                df=polars.from_pandas(df),
+                file_format=file_format,
+                batch_size=batch_size,
+            )

         self.write_all_bytes(data=buffer.getvalue())
@@ -849,14 +856,12 @@ class DatabricksIO(ABC, IO):
         self,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Read the file into a polars DataFrame.

         Args:
             file_format: Optional file format override.
             batch_size: Optional batch size for reads.
-            **kwargs: Format-specific options.

         Returns:
             A polars DataFrame with the file contents.
@@ -865,10 +870,13 @@ class DatabricksIO(ABC, IO):
         self.seek(0)

         if isinstance(file_format, ParquetFileFormat):
-            polars.read_parquet(self, **kwargs)
+            return polars.read_parquet(self)

         elif isinstance(file_format, CsvFileFormat):
-            polars.read_csv(self, **kwargs)
+            return polars.read_csv(self)
+
+        elif isinstance(file_format, ExcelFileFormat):
+            return polars.read_excel(self)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
@@ -878,7 +886,6 @@ class DatabricksIO(ABC, IO):
         df: PolarsDataFrame,
         file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
-        **kwargs
     ):
         """Write a polars DataFrame to the file.

@@ -886,7 +893,6 @@ class DatabricksIO(ABC, IO):
             df: polars DataFrame to write.
             file_format: Optional file format override.
             batch_size: Optional batch size for writes.
-            **kwargs: Format-specific options.

         Returns:
             None.
@@ -895,10 +901,13 @@ class DatabricksIO(ABC, IO):
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.write_parquet(buffer, **kwargs)
+            df.write_parquet(buffer)

         elif isinstance(file_format, CsvFileFormat):
-            df.write_csv(buffer, **kwargs)
+            df.write_csv(buffer)
+
+        elif isinstance(file_format, ExcelFileFormat):
+            df.write_excel(buffer)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")