ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0

yggdrasil/databricks/sql/engine.py
@@ -1,17 +1,18 @@
+ """Databricks SQL engine utilities and helpers."""
+
  import dataclasses
- import io
  import logging
  import random
  import string
  import time
- from typing import Optional, Union, Any, Dict, List
+ from typing import Optional, Union, Any, Dict, List, Literal

  import pyarrow as pa
  import pyarrow.parquet as pq

  from .statement_result import StatementResult
  from .types import column_info_to_arrow_field
- from .. import DatabricksPathKind
+ from .. import DatabricksPathKind, DatabricksPath
  from ..workspaces import WorkspaceService
  from ...libs.databrickslib import databricks_sdk
  from ...libs.sparklib import SparkSession, SparkDataFrame, pyspark
@@ -56,11 +57,12 @@ __all__ = [


  class SqlExecutionError(RuntimeError):
- pass
+ """Raised when a SQL statement execution fails."""


  @dataclasses.dataclass
  class SQLEngine(WorkspaceService):
+ """Execute SQL statements and manage tables via Databricks."""
  warehouse_id: Optional[str] = None
  catalog_name: Optional[str] = None
  schema_name: Optional[str] = None
@@ -72,6 +74,17 @@ class SQLEngine(WorkspaceService):
  table_name: Optional[str] = None,
  safe_chars: bool = True
  ):
+ """Build a fully qualified table name for the current catalog/schema.
+
+ Args:
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Table name to qualify.
+ safe_chars: Whether to wrap identifiers in backticks.
+
+ Returns:
+ The fully qualified table name.
+ """
  catalog_name = catalog_name or self.catalog_name
  schema_name = schema_name or self.schema_name

@@ -87,6 +100,14 @@ class SQLEngine(WorkspaceService):
  self,
  full_name: str,
  ):
+ """Parse a catalog.schema.table string into components.
+
+ Args:
+ full_name: A fully qualified name or partial name.
+
+ Returns:
+ A tuple of (catalog_name, schema_name, table_name).
+ """
  parts = [
  _.strip("`") for _ in full_name.split(".")
  ]
@@ -108,6 +129,14 @@ class SQLEngine(WorkspaceService):
  self,
  cluster_size: str = "Small"
  ):
+ """Return a default SQL warehouse matching the desired size.
+
+ Args:
+ cluster_size: Desired warehouse size filter.
+
+ Returns:
+ The matched warehouse object.
+ """
  wk = self.workspace.sdk()
  existing = list(wk.warehouses.list())
  first = None
@@ -131,6 +160,14 @@ class SQLEngine(WorkspaceService):
  self,
  cluster_size = "Small"
  ):
+ """Return the configured warehouse id or a default one.
+
+ Args:
+ cluster_size: Desired warehouse size filter.
+
+ Returns:
+ The warehouse id string.
+ """
  if not self.warehouse_id:
  dft = self._default_warehouse(cluster_size=cluster_size)

@@ -139,6 +176,14 @@ class SQLEngine(WorkspaceService):

  @staticmethod
  def _random_suffix(prefix: str = "") -> str:
+ """Generate a unique suffix for temporary resources.
+
+ Args:
+ prefix: Optional prefix to prepend.
+
+ Returns:
+ A unique suffix string.
+ """
  unique = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
  timestamp = int(time.time() * 1000)
  return f"{prefix}{timestamp}_{unique}"
@@ -147,6 +192,7 @@ class SQLEngine(WorkspaceService):
  self,
  statement: Optional[str] = None,
  *,
+ engine: Optional[Literal["spark", "api"]] = None,
  warehouse_id: Optional[str] = None,
  byte_limit: Optional[int] = None,
  disposition: Optional["Disposition"] = None,
@@ -158,6 +204,7 @@ class SQLEngine(WorkspaceService):
  catalog_name: Optional[str] = None,
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
+ wait_result: bool = True,
  **kwargs,
  ) -> "StatementResult":
  """
@@ -167,19 +214,46 @@ class SQLEngine(WorkspaceService):
  - On SUCCEEDED: return final statement object
  - On FAILED / CANCELED: raise SqlExecutionError
  - If wait=False: return initial execution handle without polling.
+
+ Args:
+ statement: SQL statement to execute. If omitted, selects from the table.
+ engine: Execution engine ("spark" or "api").
+ warehouse_id: Optional warehouse id override.
+ byte_limit: Optional byte limit for results.
+ disposition: Result disposition mode.
+ format: Result format for Databricks SQL API.
+ on_wait_timeout: Timeout behavior for waiting.
+ parameters: Optional statement parameters.
+ row_limit: Optional row limit.
+ wait_timeout: Optional API wait timeout.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ wait_result: Whether to block until completion.
+ **kwargs: Additional API parameters.
+
+ Returns:
+ A StatementResult wrapper for the execution.
  """
- if pyspark is not None:
+ if not engine:
+ if pyspark is not None:
+ spark_session = SparkSession.getActiveSession()
+
+ if spark_session is not None:
+ engine = "spark"
+
+ if engine == "spark":
  spark_session = SparkSession.getActiveSession()

- if spark_session is not None:
- result = spark_session.sql(statement)
+ if spark_session is None:
+ raise ValueError("No spark session found to run sql query")

- return StatementResult(
- engine=self,
- statement_id="sparksql",
- disposition=Disposition.EXTERNAL_LINKS,
- _spark_df=result
- )
+ return StatementResult(
+ engine=self,
+ statement_id="sparksql",
+ disposition=Disposition.EXTERNAL_LINKS,
+ _spark_df=spark_session.sql(statement)
+ )

  if format is None:
  format = Format.ARROW_STREAM
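
For orientation, a brief usage sketch of the two keywords added to execute() above; `sql_engine` stands for an already-configured SQLEngine instance and the statements are illustrative:

# Force the SQL Statement Execution API even when an active Spark session exists,
# and skip the blocking wait on completion.
sql_engine.execute(
    "SELECT current_timestamp() AS ts",
    engine="api",
    wait_result=False,
)

# Default behaviour: prefer Spark when a session is active, otherwise fall back to
# the API, and block until the statement finishes (FAILED/CANCELED statements
# surface as SqlExecutionError, per the docstring above).
result = sql_engine.execute("SELECT 1 AS one")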
@@ -217,7 +291,7 @@ class SQLEngine(WorkspaceService):
  disposition=disposition
  )

- return execution
+ return execution.wait() if wait_result else wait_result

  def spark_table(
  self,
@@ -226,6 +300,17 @@ class SQLEngine(WorkspaceService):
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
  ):
+ """Return a DeltaTable handle for a given table name.
+
+ Args:
+ full_name: Fully qualified table name.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+
+ Returns:
+ A Spark DeltaTable handle.
+ """
  if not full_name:
  full_name = self.table_full_name(
  catalog_name=catalog_name,
@@ -258,6 +343,27 @@ class SQLEngine(WorkspaceService):
  spark_session: Optional[SparkSession] = None,
  spark_options: Optional[Dict[str, Any]] = None
  ):
+ """Insert data into a table using Spark or Arrow paths.
+
+ Args:
+ data: Arrow or Spark data to insert.
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ mode: Insert mode ("auto", "append", "overwrite").
+ cast_options: Optional casting options.
+ overwrite_schema: Whether to overwrite schema (Spark).
+ match_by: Optional merge keys for upserts.
+ zorder_by: Optional Z-ORDER columns.
+ optimize_after_merge: Whether to run OPTIMIZE after merge.
+ vacuum_hours: Optional VACUUM retention window.
+ spark_session: Optional SparkSession override.
+ spark_options: Optional Spark write options.
+
+ Returns:
+ None for Arrow inserts, or the Spark insert result.
+ """
  # -------- existing logic you provided (kept intact) ----------
  if pyspark is not None:
  spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session
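
A hedged sketch of how the insert() parameters documented above combine for a keyed upsert; `sql_engine` and the table identifiers are placeholders:

import pyarrow as pa

changes = pa.table({"id": [1, 2], "value": ["new", "newer"]})

sql_engine.insert(
    changes,
    catalog_name="main",        # placeholder catalog/schema/table overrides
    schema_name="analytics",
    table_name="events",
    match_by=["id"],            # merge keys: upsert via MERGE instead of a plain INSERT
    zorder_by=["id"],           # optional Z-ORDER columns
    optimize_after_merge=True,  # run OPTIMIZE once the merge completes
    vacuum_hours=168,           # optional VACUUM retention window (7 days)
)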
@@ -310,8 +416,30 @@ class SQLEngine(WorkspaceService):
  zorder_by: list[str] = None,
  optimize_after_merge: bool = False,
  vacuum_hours: int | None = None, # e.g., 168 for 7 days
- existing_schema: pa.Schema | None = None
+ existing_schema: pa.Schema | None = None,
+ temp_volume_path: Optional[Union[str, DatabricksPath]] = None
  ):
+ """Insert Arrow data by staging to a temp volume and running SQL.
+
+ Args:
+ data: Arrow table/batch data to insert.
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ mode: Insert mode ("auto", "append", "overwrite").
+ cast_options: Optional casting options.
+ overwrite_schema: Whether to overwrite schema.
+ match_by: Optional merge keys for upserts.
+ zorder_by: Optional Z-ORDER columns.
+ optimize_after_merge: Whether to run OPTIMIZE after merge.
+ vacuum_hours: Optional VACUUM retention window.
+ existing_schema: Optional pre-fetched schema.
+ temp_volume_path: Optional temp volume path override.
+
+ Returns:
+ None.
+ """
  location, catalog_name, schema_name, table_name = self._check_location_params(
  location=location,
  catalog_name=catalog_name,
@@ -375,14 +503,14 @@ class SQLEngine(WorkspaceService):
  data = convert(data, pa.Table, options=cast_options, target_field=existing_schema)

  # Write in temp volume
- databricks_tmp_path = connected.dbfs_path(
+ temp_volume_path = connected.dbfs_path(
  kind=DatabricksPathKind.VOLUME,
- parts=[catalog_name, schema_name, "tmp", transaction_id, "data.parquet"]
- )
- databricks_tmp_folder = databricks_tmp_path.parent
+ parts=[catalog_name, schema_name, "tmp", "sql", transaction_id]
+ ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)
+
+ temp_volume_path.mkdir()

- with databricks_tmp_path.open(mode="wb") as f:
- pq.write_table(data, f, compression="snappy")
+ temp_volume_path.write_arrow_table(data)

  # get column list from arrow schema
  columns = [c for c in existing_schema.names]
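
The staging logic above relies on the new DatabricksPath helpers (workspaces/path.py and workspaces/io.py in the file list). A sketch of those calls in isolation, where `connected` stands for the connected engine from the surrounding code and the volume parts are invented:

import pyarrow as pa

data = pa.table({"id": [1, 2], "value": ["a", "b"]})

tmp_dir = connected.dbfs_path(
    kind=DatabricksPathKind.VOLUME,
    parts=["main", "analytics", "tmp", "sql", "manual_example"],
)
tmp_dir.mkdir()                    # ensure the temporary volume directory exists
tmp_dir.write_arrow_table(data)    # stage the Arrow data as parquet at this location
# ... MERGE / INSERT statements then read it back via parquet.`{tmp_dir}` ...
tmp_dir.rmdir(recursive=True)      # clean up the staging location afterwards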
@@ -412,7 +540,7 @@ class SQLEngine(WorkspaceService):

  merge_sql = f"""MERGE INTO {location} AS T
  USING (
- SELECT {cols_quoted} FROM parquet.`{databricks_tmp_folder}`
+ SELECT {cols_quoted} FROM parquet.`{temp_volume_path}`
  ) AS S
  ON {on_condition}
  {update_clause}
@@ -424,12 +552,12 @@ ON {on_condition}
  if mode.lower() in ("overwrite",):
  insert_sql = f"""INSERT OVERWRITE {location}
  SELECT {cols_quoted}
- FROM parquet.`{databricks_tmp_folder}`"""
+ FROM parquet.`{temp_volume_path}`"""
  else:
  # default: append
  insert_sql = f"""INSERT INTO {location} ({cols_quoted})
  SELECT {cols_quoted}
- FROM parquet.`{databricks_tmp_folder}`"""
+ FROM parquet.`{temp_volume_path}`"""
  statements.append(insert_sql)

  # Execute statements (use your existing execute helper)
@@ -439,7 +567,7 @@ FROM parquet.`{databricks_tmp_folder}`"""
  connected.execute(stmt.strip())
  finally:
  try:
- databricks_tmp_folder.rmdir(recursive=True)
+ temp_volume_path.rmdir(recursive=True)
  except Exception as e:
  logger.warning(e)

@@ -474,6 +602,26 @@ FROM parquet.`{databricks_tmp_folder}`"""
  vacuum_hours: int | None = None, # e.g., 168 for 7 days
  spark_options: Optional[Dict[str, Any]] = None,
  ):
+ """Insert a Spark DataFrame into a Delta table with optional merge semantics.
+
+ Args:
+ data: Spark DataFrame to insert.
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ mode: Insert mode ("auto", "append", "overwrite").
+ cast_options: Optional casting options.
+ overwrite_schema: Whether to overwrite schema.
+ match_by: Optional merge keys for upserts.
+ zorder_by: Optional Z-ORDER columns.
+ optimize_after_merge: Whether to run OPTIMIZE after merge.
+ vacuum_hours: Optional VACUUM retention window.
+ spark_options: Optional Spark write options.
+
+ Returns:
+ None.
+ """
  location, catalog_name, schema_name, table_name = self._check_location_params(
  location=location,
  catalog_name=catalog_name,
@@ -573,6 +721,17 @@ FROM parquet.`{databricks_tmp_folder}`"""
  table_name: Optional[str] = None,
  to_arrow_schema: bool = True
  ) -> Union[pa.Field, pa.Schema]:
+ """Fetch a table schema from Unity Catalog as Arrow types.
+
+ Args:
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ to_arrow_schema: Whether to return an Arrow schema or field.
+
+ Returns:
+ Arrow Schema or Field representing the table.
+ """
  full_name = self.table_full_name(
  catalog_name=catalog_name,
  schema_name=schema_name,
@@ -603,6 +762,17 @@ FROM parquet.`{databricks_tmp_folder}`"""
  schema_name: Optional[str] = None,
  table_name: Optional[str] = None,
  ):
+ """Drop a table if it exists.
+
+ Args:
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+
+ Returns:
+ The StatementResult from executing the drop statement.
+ """
  location, _, _, _ = self._check_location_params(
  location=location,
  catalog_name=catalog_name,
@@ -656,23 +826,22 @@ FROM parquet.`{databricks_tmp_folder}`"""
  safe_chars=True
  )

- # Create the DDL statement
- sql = [f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} ("]
-
- # Generate column definitions
- column_defs = []
-
  if pa.types.is_struct(field.type):
  children = list(field.type)
  else:
  children = [field]

- for child in children:
- column_def = self._field_to_ddl(child)
- column_defs.append(column_def)
+ # Create the DDL statement
+ column_definitions = [
+ self._field_to_ddl(child)
+ for child in children
+ ]

- sql.append(",\n ".join(column_defs))
- sql.append(")")
+ sql = [
+ f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
+ ",\n ".join(column_definitions),
+ ")"
+ ]

  # Add partition by clause if provided
  if partition_by and len(partition_by) > 0:
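
A standalone sketch of the list-based DDL assembly above; `field_to_ddl_stub` is a hypothetical stand-in for SQLEngine._field_to_ddl (whose real type mapping is outside this diff) and the table name is a placeholder:

import pyarrow as pa

def field_to_ddl_stub(child: pa.Field) -> str:
    # Hypothetical Arrow -> SQL type mapping, for illustration only.
    if pa.types.is_integer(child.type):
        sql_type = "BIGINT"
    elif pa.types.is_floating(child.type):
        sql_type = "DOUBLE"
    else:
        sql_type = "STRING"
    return f"`{child.name}` {sql_type}"

field = pa.field("row", pa.struct([
    pa.field("id", pa.int64()),
    pa.field("name", pa.string()),
]))

children = list(field.type) if pa.types.is_struct(field.type) else [field]
column_definitions = [field_to_ddl_stub(child) for child in children]
sql = [
    "CREATE TABLE IF NOT EXISTS `main`.`analytics`.`events` (",
    ",\n ".join(column_definitions),
    ")",
]
print("\n".join(sql))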
@@ -729,6 +898,18 @@ FROM parquet.`{databricks_tmp_folder}`"""
  table_name: Optional[str] = None,
  safe_chars: bool = True
  ):
+ """Resolve location/catalog/schema/table parameters to a full name.
+
+ Args:
+ location: Fully qualified table name override.
+ catalog_name: Optional catalog override.
+ schema_name: Optional schema override.
+ table_name: Optional table name override.
+ safe_chars: Whether to wrap identifiers in backticks.
+
+ Returns:
+ A tuple of (location, catalog_name, schema_name, table_name).
+ """
  if location:
  c, s, t = self._catalog_schema_table_names(location)
  catalog_name, schema_name, table_name = catalog_name or c, schema_name or s, table_name or t

yggdrasil/databricks/sql/exceptions.py
@@ -0,0 +1 @@
+ """Custom exceptions for Databricks SQL helpers."""