ygg 0.1.31__py3-none-any.whl → 0.1.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/METADATA +1 -1
- ygg-0.1.33.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +244 -3
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +24 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +29 -4
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +147 -0
- yggdrasil/databricks/sql/types.py +33 -1
- yggdrasil/databricks/workspaces/__init__.py +2 -1
- yggdrasil/databricks/workspaces/filesytem.py +183 -0
- yggdrasil/databricks/workspaces/io.py +387 -9
- yggdrasil/databricks/workspaces/path.py +297 -2
- yggdrasil/databricks/workspaces/path_kind.py +3 -0
- yggdrasil/databricks/workspaces/workspace.py +202 -5
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +123 -1
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.31.dist-info/RECORD +0 -59
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/WHEEL +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.31.dist-info → ygg-0.1.33.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/io.py

@@ -1,3 +1,5 @@
+"""File-like IO abstractions for Databricks paths."""
+
 import base64
 import io
 import time
@@ -33,6 +35,7 @@ __all__ = [


 class DatabricksIO(ABC, IO):
+    """File-like interface for Databricks workspace, volume, or DBFS paths."""

     def __init__(
         self,
@@ -57,15 +60,18 @@ class DatabricksIO(ABC, IO):
         self._write_flag = False

     def __enter__(self) -> "DatabricksIO":
+        """Enter a context manager and connect the underlying path."""
         return self.connect(clone=False)

     def __exit__(self, exc_type, exc_value, traceback):
+        """Exit the context manager and close the buffer."""
         self.close()

     def __del__(self):
         self.close()

     def __next__(self):
+        """Iterate over lines in the file."""
         line = self.readline()
         if not line:
             raise StopIteration
@@ -87,6 +93,19 @@ class DatabricksIO(ABC, IO):
         position: int = 0,
         buffer: Optional[io.BytesIO] = None,
     ) -> "DatabricksIO":
+        """Create the appropriate IO subclass for the given path kind.
+
+        Args:
+            path: DatabricksPath to open.
+            mode: File mode string.
+            encoding: Optional text encoding for text mode.
+            compression: Optional compression mode.
+            position: Initial file cursor position.
+            buffer: Optional pre-seeded buffer.
+
+        Returns:
+            A DatabricksIO subclass instance.
+        """
         if path.kind == DatabricksPathKind.VOLUME:
             return DatabricksVolumeIO(
                 path=path,
@@ -119,10 +138,20 @@ class DatabricksIO(ABC, IO):

     @property
     def workspace(self):
+        """Return the associated Workspace instance.
+
+        Returns:
+            The Workspace bound to the path.
+        """
         return self.path.workspace

     @property
     def name(self):
+        """Return the name of the underlying path.
+
+        Returns:
+            The path name component.
+        """
         return self.path.name

     @property
@@ -147,6 +176,11 @@ class DatabricksIO(ABC, IO):
         return self.path.content_length

     def size(self):
+        """Return the size of the file in bytes.
+
+        Returns:
+            The file size in bytes.
+        """
         return self.content_length

     @content_length.setter
@@ -155,6 +189,11 @@ class DatabricksIO(ABC, IO):

     @property
     def buffer(self):
+        """Return the in-memory buffer, creating it if necessary.
+
+        Returns:
+            A BytesIO buffer for the file contents.
+        """
         if self._buffer is None:
             self._buffer = io.BytesIO()
             self._buffer.seek(self.position, io.SEEK_SET)
@@ -165,11 +204,24 @@ class DatabricksIO(ABC, IO):
         self._buffer = value

     def clear_buffer(self):
+        """Clear any cached in-memory buffer.
+
+        Returns:
+            None.
+        """
         if self._buffer is not None:
             self._buffer.close()
             self._buffer = None

     def clone_instance(self, **kwargs):
+        """Clone this IO instance with optional overrides.
+
+        Args:
+            **kwargs: Field overrides for the new instance.
+
+        Returns:
+            A cloned DatabricksIO instance.
+        """
         return self.__class__(
             path=kwargs.get("path", self.path),
             mode=kwargs.get("mode", self.mode),
@@ -181,9 +233,22 @@ class DatabricksIO(ABC, IO):

     @property
     def connected(self):
+        """Return True if the underlying path is connected.
+
+        Returns:
+            True if connected, otherwise False.
+        """
         return self.path.connected

     def connect(self, clone: bool = False) -> "DatabricksIO":
+        """Connect the underlying path and optionally return a clone.
+
+        Args:
+            clone: Whether to return a cloned instance.
+
+        Returns:
+            The connected DatabricksIO instance.
+        """
         path = self.path.connect(clone=clone)

         if clone:
@@ -193,23 +258,52 @@ class DatabricksIO(ABC, IO):
         return self

     def close(self):
+        """Flush pending writes and close the buffer.
+
+        Returns:
+            None.
+        """
         self.flush()
         if self._buffer is not None:
             self._buffer.close()

     def fileno(self):
+        """Return a pseudo file descriptor based on object hash.
+
+        Returns:
+            An integer file descriptor-like value.
+        """
         return hash(self)

     def isatty(self):
         return False

     def tell(self):
+        """Return the current cursor position.
+
+        Returns:
+            The current position in bytes.
+        """
         return self.position

     def seekable(self):
+        """Return True to indicate seek support.
+
+        Returns:
+            True.
+        """
         return True

     def seek(self, offset, whence=0, /):
+        """Move the cursor to a new position.
+
+        Args:
+            offset: Offset in bytes.
+            whence: Reference point (start, current, end).
+
+        Returns:
+            The new position in bytes.
+        """
         if whence == io.SEEK_SET:
             new_position = offset
         elif whence == io.SEEK_CUR:
@@ -230,21 +324,55 @@ class DatabricksIO(ABC, IO):
         return self.position

     def readable(self):
+        """Return True to indicate read support.
+
+        Returns:
+            True.
+        """
         return True

     def getvalue(self):
+        """Return the buffer contents, reading from remote if needed.
+
+        Returns:
+            File contents as bytes or str depending on mode.
+        """
         if self._buffer is not None:
             return self._buffer.getvalue()
         return self.read_all_bytes()

     def getbuffer(self):
+        """Return the underlying BytesIO buffer.
+
+        Returns:
+            The BytesIO buffer instance.
+        """
         return self.buffer

     @abstractmethod
     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read a byte range from the remote path.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            The bytes read from the remote path.
+        """
         pass

     def read_all_bytes(self, use_cache: bool = True, allow_not_found: bool = False) -> bytes:
+        """Read the full contents into memory, optionally caching.
+
+        Args:
+            use_cache: Whether to cache contents in memory.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            File contents as bytes.
+        """
         if use_cache and self._buffer is not None:
             buffer_value = self._buffer.getvalue()

@@ -266,6 +394,15 @@ class DatabricksIO(ABC, IO):
         return data

     def read(self, n=-1, use_cache: bool = True):
+        """Read up to ``n`` bytes/characters from the file.
+
+        Args:
+            n: Number of bytes/characters to read; -1 for all.
+            use_cache: Whether to use cached contents.
+
+        Returns:
+            The read bytes or string depending on mode.
+        """
         if not self.readable():
             raise IOError("File not open for reading")

@@ -285,6 +422,15 @@ class DatabricksIO(ABC, IO):
         return data

     def readline(self, limit=-1, use_cache: bool = True):
+        """Read a single line from the file.
+
+        Args:
+            limit: Max characters/bytes to read; -1 for no limit.
+            use_cache: Whether to use cached contents.
+
+        Returns:
+            The next line as bytes or string.
+        """
         if not self.readable():
             raise IOError("File not open for reading")

@@ -320,6 +466,15 @@ class DatabricksIO(ABC, IO):
         return bytes(line_bytes)

     def readlines(self, hint=-1, use_cache: bool = True):
+        """Read all lines from the file.
+
+        Args:
+            hint: Optional byte/char count hint; -1 for no hint.
+            use_cache: Whether to use cached contents.
+
+        Returns:
+            A list of lines.
+        """
         if not self.readable():
             raise IOError("File not open for reading")

@@ -338,16 +493,42 @@ class DatabricksIO(ABC, IO):
         return lines

     def appendable(self):
+        """Return True when the file is open in append mode.
+
+        Returns:
+            True if in append mode.
+        """
         return "a" in self.mode

     def writable(self):
+        """Return True to indicate write support.
+
+        Returns:
+            True.
+        """
         return True

     @abstractmethod
     def write_all_bytes(self, data: bytes):
+        """Write raw bytes to the remote path.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            None.
+        """
         pass

     def truncate(self, size=None, /):
+        """Resize the file to ``size`` bytes.
+
+        Args:
+            size: Target size in bytes (defaults to current position).
+
+        Returns:
+            The new size in bytes.
+        """
         if size is None:
             size = self.position

@@ -362,11 +543,24 @@ class DatabricksIO(ABC, IO):
         return size

     def flush(self):
+        """Flush buffered data to the remote path.
+
+        Returns:
+            None.
+        """
         if self._write_flag and self._buffer is not None:
             self.write_all_bytes(data=self._buffer.getvalue())
             self._write_flag = False

     def write(self, data: AnyStr) -> int:
+        """Write data to the buffer and mark for flush.
+
+        Args:
+            data: String or bytes to write.
+
+        Returns:
+            The number of bytes written.
+        """
         if not self.writable():
             raise IOError("File not open for writing")

@@ -382,6 +576,14 @@ class DatabricksIO(ABC, IO):
         return written

     def writelines(self, lines) -> None:
+        """Write multiple lines to the buffer.
+
+        Args:
+            lines: Iterable of lines to write.
+
+        Returns:
+            None.
+        """
         for line in lines:
             if isinstance(line, str):
                 line = line.encode(self.encoding or "utf-8")
@@ -394,12 +596,25 @@ class DatabricksIO(ABC, IO):
         self.write(data)

     def get_output_stream(self, *args, **kwargs):
+        """Return this instance for compatibility with Arrow APIs.
+
+        Returns:
+            The current DatabricksIO instance.
+        """
         return self

     def copy_to(
         self,
         dest: Union["DatabricksIO", "DatabricksPath", str]
     ) -> None:
+        """Copy the file contents to another Databricks IO/path.
+
+        Args:
+            dest: Destination IO, DatabricksPath, or path string.
+
+        Returns:
+            None.
+        """
         if not isinstance(dest, DatabricksIO):
             from .path import DatabricksPath

@@ -426,17 +641,29 @@ class DatabricksIO(ABC, IO):
     def write_table(
         self,
         table: Union[pa.Table, pa.RecordBatch, PolarsDataFrame, PandasDataFrame],
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a table-like object to the path using an inferred format.
+
+        Args:
+            table: Table-like object to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            The result of the specific write implementation.
+        """
         if isinstance(table, pa.Table):
-            return self.write_arrow_table(table, batch_size=batch_size, **kwargs)
+            return self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, pa.RecordBatch):
-            return self.write_arrow_batch(table, batch_size=batch_size, **kwargs)
+            return self.write_arrow_batch(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, PolarsDataFrame):
-            return self.write_polars(table, batch_size=batch_size, **kwargs)
+            return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
         elif isinstance(table, PandasDataFrame):
-            return self.write_pandas(table, batch_size=batch_size, **kwargs)
+            return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
         else:
             raise ValueError(f"Cannot write {type(table)} to {self.path}")

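The hunk above threads the new file_format keyword through every write_table dispatch branch, so an explicit format override now reaches the concrete writers instead of being dropped. A minimal usage sketch (the `dbx_file` object is a hypothetical stand-in for a connected DatabricksIO; CsvFileFormat is assumed to be the pyarrow.dataset format class this module imports):

    import pyarrow as pa
    from pyarrow.dataset import CsvFileFormat  # assumed import source

    table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # Without the override, the format is inferred from the path suffix
    # (path.file_format); with it, the table is written as CSV regardless.
    dbx_file.write_table(table, file_format=CsvFileFormat())
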
@@ -448,6 +675,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ) -> pa.Table:
+        """Read the file as an Arrow table.
+
+        Args:
+            file_format: Optional file format override.
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            An Arrow Table with the file contents.
+        """
         file_format = self.path.file_format if file_format is None else file_format
         self.seek(0)

@@ -465,6 +702,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write an Arrow table or record batch to the path.
+
+        Args:
+            table: Arrow table or batch to write.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         if not isinstance(table, pa.Table):
             table = convert(table, pa.Table)

@@ -481,11 +728,22 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write an Arrow table using the selected file format.
+
+        Args:
+            table: Arrow table to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         file_format = self.path.file_format if file_format is None else file_format
         buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            pq.write_table(table, buffer, **kwargs)
+            pq.write_table(table, buffer, write_batch_size=batch_size, **kwargs)

         elif isinstance(file_format, CsvFileFormat):
             pcsv.write_csv(table, buffer, **kwargs)
@@ -498,17 +756,38 @@ class DatabricksIO(ABC, IO):
     def write_arrow_batch(
         self,
         batch: pa.RecordBatch,
+        file_format: Optional[FileFormat] = None,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a single Arrow record batch.
+
+        Args:
+            batch: RecordBatch to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         table = pa.Table.from_batches([batch])
-        self.write_arrow_table(table, batch_size=batch_size, **kwargs)
+        self.write_arrow_table(table, file_format=file_format, batch_size=batch_size, **kwargs)

     def read_arrow_batches(
         self,
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Yield Arrow record batches from the file.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            An iterator over Arrow RecordBatch objects.
+        """
         return (
             self
             .read_arrow_table(batch_size=batch_size, **kwargs)
@@ -522,6 +801,15 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Read the file into a pandas DataFrame.
+
+        Args:
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            A pandas DataFrame with the file contents.
+        """
         return self.read_arrow_table(batch_size=batch_size, **kwargs).to_pandas()

     def write_pandas(
@@ -530,6 +818,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a pandas DataFrame to the file.
+
+        Args:
+            df: pandas DataFrame to write.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)

     # ---- Polars ----
@@ -540,6 +838,16 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Read the file into a polars DataFrame.
+
+        Args:
+            file_format: Optional file format override.
+            batch_size: Optional batch size for reads.
+            **kwargs: Format-specific options.
+
+        Returns:
+            A polars DataFrame with the file contents.
+        """
         import polars as pl

         file_format = self.path.file_format if file_format is None else file_format
@@ -560,22 +868,46 @@ class DatabricksIO(ABC, IO):
         batch_size: Optional[int] = None,
         **kwargs
     ):
+        """Write a polars DataFrame to the file.
+
+        Args:
+            df: polars DataFrame to write.
+            file_format: Optional file format override.
+            batch_size: Optional batch size for writes.
+            **kwargs: Format-specific options.
+
+        Returns:
+            None.
+        """
         file_format = self.path.file_format if file_format is None else FileFormat
-
+        buffer = io.BytesIO()

         if isinstance(file_format, ParquetFileFormat):
-            df.write_parquet(
+            df.write_parquet(buffer, **kwargs)

         elif isinstance(file_format, CsvFileFormat):
-            df.write_csv(
+            df.write_csv(buffer, **kwargs)

         else:
             raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")

+        self.write_all_bytes(data=buffer.getvalue())
+

 class DatabricksWorkspaceIO(DatabricksIO):
+    """IO adapter for Workspace files."""

     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read bytes from a Workspace file.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            Bytes read from the file.
+        """
         if length == 0:
             return b""

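The write_polars rewrite replaces the incomplete 0.1.31 calls with a buffer-then-upload pattern: serialize the DataFrame into an in-memory BytesIO, then push the complete payload through write_all_bytes in one call. (The unchanged context line still reads `else FileFormat`, falling back to the class rather than the file_format argument, as published.) The pattern itself, sketched with plain polars and no yggdrasil types:

    import io
    import polars as pl

    df = pl.DataFrame({"id": [1, 2], "value": ["a", "b"]})

    buffer = io.BytesIO()
    df.write_parquet(buffer)       # polars writes to any file-like object
    payload = buffer.getvalue()    # full serialized bytes, uploaded in one shot
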
@@ -597,6 +929,14 @@ class DatabricksWorkspaceIO(DatabricksIO):
         return data[start:end]

     def write_all_bytes(self, data: bytes):
+        """Write bytes to a Workspace file.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            The DatabricksWorkspaceIO instance.
+        """
         sdk = self.workspace.sdk()
         workspace_client = sdk.workspace
         full_path = self.path.workspace_full_path()
@@ -629,8 +969,19 @@


 class DatabricksVolumeIO(DatabricksIO):
+    """IO adapter for Unity Catalog volume files."""

     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read bytes from a volume file.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            Bytes read from the file.
+        """
         if length == 0:
             return b""

@@ -648,6 +999,14 @@ class DatabricksVolumeIO(DatabricksIO):
         return result

     def write_all_bytes(self, data: bytes):
+        """Write bytes to a volume file.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            The DatabricksVolumeIO instance.
+        """
         sdk = self.workspace.sdk()
         client = sdk.files
         full_path = self.path.files_full_path()
@@ -678,8 +1037,19 @@


 class DatabricksDBFSIO(DatabricksIO):
+    """IO adapter for DBFS files."""

     def read_byte_range(self, start: int, length: int, allow_not_found: bool = False) -> bytes:
+        """Read bytes from a DBFS file.
+
+        Args:
+            start: Starting byte offset.
+            length: Number of bytes to read.
+            allow_not_found: Whether to suppress missing-path errors.
+
+        Returns:
+            Bytes read from the file.
+        """
         if length == 0:
             return b""

@@ -714,6 +1084,14 @@ class DatabricksDBFSIO(DatabricksIO):
         return bytes(read_bytes)

     def write_all_bytes(self, data: bytes):
+        """Write bytes to a DBFS file.
+
+        Args:
+            data: Bytes to write.
+
+        Returns:
+            The DatabricksDBFSIO instance.
+        """
         sdk = self.workspace.sdk()
         client = sdk.dbfs
         full_path = self.path.dbfs_full_path()