ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +31 -34
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +148 -1
- yggdrasil/databricks/sql/types.py +49 -1
- yggdrasil/databricks/workspaces/__init__.py +4 -1
- yggdrasil/databricks/workspaces/filesytem.py +344 -0
- yggdrasil/databricks/workspaces/io.py +1123 -0
- yggdrasil/databricks/workspaces/path.py +1415 -0
- yggdrasil/databricks/workspaces/path_kind.py +13 -0
- yggdrasil/databricks/workspaces/workspace.py +298 -154
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +131 -0
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.30.dist-info/RECORD +0 -56
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/sql/statement_result.py

```diff
@@ -1,3 +1,5 @@
+"""Result wrapper for Databricks SQL statement execution."""
+
 import dataclasses
 import threading
 import time
@@ -49,6 +51,7 @@ __all__ = [
 
 @dataclasses.dataclass
 class StatementResult:
+    """Container for statement responses, data extraction, and conversions."""
     engine: "SQLEngine"
     statement_id: str
     disposition: "Disposition"
@@ -60,6 +63,11 @@ class StatementResult:
     _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
 
     def __getstate__(self):
+        """Serialize statement results, converting Spark dataframes to Arrow.
+
+        Returns:
+            A pickle-ready state dictionary.
+        """
         state = self.__dict__.copy()
 
         _spark_df = state.pop("_spark_df", None)
@@ -70,38 +78,71 @@ class StatementResult:
         return state
 
     def __setstate__(self, state):
+        """Restore statement result state, rehydrating cached data.
+
+        Args:
+            state: Serialized state dictionary.
+        """
         _spark_df = state.pop("_spark_df")
 
     def __iter__(self):
+        """Iterate over Arrow record batches."""
         return self.to_arrow_batches()
 
     @property
     def is_spark_sql(self):
+        """Return True when this result was produced by Spark SQL."""
         return self._spark_df is not None
 
     @property
     def response(self):
+        """Return the latest statement response, refreshing when needed.
+
+        Returns:
+            The current StatementResponse object.
+        """
         if self._response is None and not self.is_spark_sql:
             self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
         return self._response
 
     @response.setter
     def response(self, value: "StatementResponse"):
+        """Update the cached response and refresh timestamp.
+
+        Args:
+            value: StatementResponse to cache.
+        """
         self._response = value
         self._response_refresh_time = time.time()
 
         self.statement_id = self._response.statement_id
 
     def fresh_response(self, delay: float):
+        """Refresh the response if it is older than ``delay`` seconds.
+
+        Args:
+            delay: Minimum age in seconds before refreshing.
+
+        Returns:
+            The refreshed StatementResponse object.
+        """
         if self.is_spark_sql:
             return self._response
 
-        if
+        if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
             self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
 
         return self._response
 
     def result_data_at(self, chunk_index: int):
+        """Fetch a specific result chunk by index.
+
+        Args:
+            chunk_index: Result chunk index to retrieve.
+
+        Returns:
+            The SDK result chunk response.
+        """
         sdk = self.workspace.sdk()
 
         return sdk.statement_execution.get_statement_result_chunk_n(
@@ -111,10 +152,20 @@ class StatementResult:
 
     @property
     def workspace(self):
+        """Expose the underlying workspace from the engine.
+
+        Returns:
+            The Workspace instance backing this statement.
+        """
         return self.engine.workspace
 
     @property
     def status(self):
+        """Return the statement status, handling persisted data.
+
+        Returns:
+            A StatementStatus object.
+        """
         if self.persisted:
             return StatementStatus(
                 state=StatementState.SUCCEEDED
@@ -129,20 +180,40 @@ class StatementResult:
 
     @property
     def state(self):
+        """Return the statement state.
+
+        Returns:
+            The StatementState enum value.
+        """
         return self.status.state
 
     @property
     def manifest(self):
+        """Return the SQL result manifest, if available.
+
+        Returns:
+            The result manifest or None for Spark SQL results.
+        """
         if self.is_spark_sql:
             return None
         return self.response.manifest
 
     @property
     def result(self):
+        """Return the raw statement result object.
+
+        Returns:
+            The statement result payload from the API.
+        """
         return self.response.result
 
     @property
     def done(self):
+        """Return True when the statement is in a terminal state.
+
+        Returns:
+            True if the statement is done, otherwise False.
+        """
         if self.persisted:
             return True
 
@@ -155,6 +226,11 @@ class StatementResult:
 
     @property
     def failed(self):
+        """Return True when the statement failed or was cancelled.
+
+        Returns:
+            True if the statement failed or was cancelled.
+        """
         if self.persisted:
             return True
 
@@ -165,14 +241,29 @@ class StatementResult:
 
     @property
     def persisted(self):
+        """Return True when data is cached locally.
+
+        Returns:
+            True when cached Arrow or Spark data is present.
+        """
         return self._spark_df is not None or self._arrow_table is not None
 
     def persist(self):
+        """Cache the statement result locally as Arrow data.
+
+        Returns:
+            The current StatementResult instance.
+        """
         if not self.persisted:
             self._arrow_table = self.to_arrow_table()
         return self
 
     def external_links(self):
+        """Yield external result links for EXTERNAL_LINKS dispositions.
+
+        Yields:
+            External link objects in result order.
+        """
         assert self.disposition == Disposition.EXTERNAL_LINKS, "Cannot get from %s, disposition %s != %s" % (
             self, self.disposition, Disposition.EXTERNAL_LINKS
         )
@@ -222,6 +313,11 @@ class StatementResult:
         )
 
     def raise_for_status(self):
+        """Raise a ValueError if the statement failed.
+
+        Returns:
+            None.
+        """
         if self.failed:
             # grab error info if present
             err = self.status.error
@@ -244,6 +340,15 @@ class StatementResult:
         timeout: Optional[int] = None,
         poll_interval: Optional[float] = None
     ):
+        """Wait for statement completion with optional timeout.
+
+        Args:
+            timeout: Maximum seconds to wait.
+            poll_interval: Initial poll interval in seconds.
+
+        Returns:
+            The current StatementResult instance.
+        """
         if self.done:
             return self
 
@@ -265,6 +370,11 @@ class StatementResult:
             return current
 
     def arrow_schema(self):
+        """Return the Arrow schema for the result.
+
+        Returns:
+            An Arrow Schema instance.
+        """
         if self.persisted:
             if self._arrow_table is not None:
                 return self._arrow_table.schema
@@ -277,6 +387,14 @@ class StatementResult:
         return pa.schema(fields)
 
     def to_arrow_table(self, parallel_pool: Optional[int] = 4) -> pa.Table:
+        """Collect the statement result into a single Arrow table.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            An Arrow Table containing all rows.
+        """
         if self.persisted:
             if self._arrow_table:
                 return self._arrow_table
@@ -295,6 +413,14 @@ class StatementResult:
         self,
         parallel_pool: Optional[int] = 4
     ) -> Iterator[pa.RecordBatch]:
+        """Stream the result as Arrow record batches.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Yields:
+            Arrow RecordBatch objects.
+        """
         if self.persisted:
             if self._arrow_table is not None:
                 for batch in self._arrow_table.to_batches(max_chunksize=64 * 1024):
@@ -379,15 +505,36 @@ class StatementResult:
         self,
         parallel_pool: Optional[int] = 4
     ) -> "pandas.DataFrame":
+        """Return the result as a pandas DataFrame.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            A pandas DataFrame with the result rows.
+        """
         return self.to_arrow_table(parallel_pool=parallel_pool).to_pandas()
 
     def to_polars(
         self,
         parallel_pool: Optional[int] = 4
     ) -> "polars.DataFrame":
+        """Return the result as a polars DataFrame.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            A polars DataFrame with the result rows.
+        """
         return polars.from_arrow(self.to_arrow_table(parallel_pool=parallel_pool))
 
     def to_spark(self):
+        """Return the result as a Spark DataFrame, caching it locally.
+
+        Returns:
+            A Spark DataFrame with the result rows.
+        """
        if self._spark_df:
            return self._spark_df
 
```
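The docstrings added above describe a fetch, wait, convert flow. As a minimal sketch only: the `SQLEngine` construction and its `execute(...)` entry point are assumptions (they are not shown in this diff), while the `StatementResult` members used below are exactly the ones documented above.

```python
import time

# Assumed import: engine.py exists per the file list, but SQLEngine's
# construction and execute() signature are not visible in this diff.
from yggdrasil.databricks.sql.engine import SQLEngine

engine = SQLEngine(...)                     # constructor arguments intentionally elided
result = engine.execute("SELECT 1 AS x")    # assumed to return a StatementResult

while not result.done:                      # `done` reports a terminal statement state
    result.fresh_response(1.0)              # refresh the cached response at most once per second
    time.sleep(1.0)

result.raise_for_status()                   # raises ValueError if the statement failed

table = result.to_arrow_table(parallel_pool=4)   # all rows as a single Arrow table
pdf = result.to_pandas()                         # pandas conversion
pldf = result.to_polars()                        # polars conversion

result.persist()   # cache Arrow data locally so the object pickles without re-fetching
```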
yggdrasil/databricks/sql/types.py

```diff
@@ -1,3 +1,5 @@
+"""Type utilities for Databricks SQL metadata and Arrow."""
+
 import json
 import re
 from typing import Union
@@ -86,6 +88,14 @@ _struct_re = re.compile(r"^STRUCT\s*<\s*(.+)\s*>$", re.IGNORECASE)
 
 
 def _split_top_level_commas(s: str):
+    """Split a type string by commas, respecting nested angle brackets.
+
+    Args:
+        s: Type string to split.
+
+    Returns:
+        A list of top-level comma-separated parts.
+    """
     parts, cur, depth = [], [], 0
     for ch in s:
         if ch == '<':
@@ -102,12 +112,38 @@ def _split_top_level_commas(s: str):
     return parts
 
 
+def _safe_bytes(obj):
+    """Convert an object to UTF-8 bytes, with safe handling for None.
+
+    Args:
+        obj: Value to convert.
+
+    Returns:
+        UTF-8 encoded bytes.
+    """
+    if not isinstance(obj, bytes):
+        if not obj:
+            return b""
+
+        if not isinstance(obj, str):
+            obj = str(obj)
+
+        return obj.encode("utf-8")
+    return obj
+
+
 def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
     """
     Adapted parser that:
     - looks up base types in STRING_TYPE_MAP (expects uppercase keys)
     - supports DECIMAL(p,s), ARRAY<...>, MAP<k,v>, STRUCT<...> recursively
     - raises ValueError if it cannot map the provided type string
+
+    Args:
+        type_str: SQL type string to parse.
+
+    Returns:
+        The corresponding Arrow DataType.
     """
     if not type_str:
         raise ValueError("Empty type string")
@@ -165,11 +201,23 @@ def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
 
 
 def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
+    """Convert Databricks SQL/Catalog column info into an Arrow field.
+
+    Args:
+        col: ColumnInfo from SQL or Catalog APIs.
+
+    Returns:
+        An Arrow Field for the column.
+    """
     arrow_type = parse_sql_type_to_pa(col.type_text)
 
     if isinstance(col, CatalogColumnInfo):
         parsed = json.loads(col.type_json)
         md = parsed.get("metadata", {}) or {}
+        md = {
+            _safe_bytes(k): _safe_bytes(v)
+            for k, v in md.items()
+        }
         nullable = col.nullable
     elif isinstance(col, SQLColumnInfo):
         md = {}
@@ -182,4 +230,4 @@ def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
         arrow_type,
         nullable=nullable,
         metadata=md
-    )
+    )
```
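The metadata normalization added in `column_info_to_arrow_field` matters because pyarrow stores field metadata as bytes. A small illustrative sketch follows; the sample column name and metadata are made up, and importing `_safe_bytes` assumes the helper stays module-level in types.py.

```python
import pyarrow as pa

# Helper introduced in this diff (assumed importable from the module above).
from yggdrasil.databricks.sql.types import _safe_bytes

# Made-up metadata resembling what a Catalog column's type_json might carry.
md = {"comment": "trip distance in miles", "deprecated": None}

# Normalize keys and values to UTF-8 bytes; None becomes b"".
md_bytes = {_safe_bytes(k): _safe_bytes(v) for k, v in md.items()}

field = pa.field("trip_distance", pa.float64(), nullable=True, metadata=md_bytes)
print(field.metadata)  # {b'comment': b'trip distance in miles', b'deprecated': b''}
```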