PyPI - ygg - Versions diffs - 0.1.57__py3-none-any.whl → 0.1.60__py3-none-any.whl - Mend

ygg 0.1.57__py3-none-any.whl → 0.1.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

{ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/METADATA +1 -1
ygg-0.1.60.dist-info/RECORD +74 -0
yggdrasil/ai/__init__.py +2 -0
yggdrasil/ai/session.py +89 -0
yggdrasil/ai/sql_session.py +310 -0
yggdrasil/databricks/__init__.py +0 -3
yggdrasil/databricks/compute/cluster.py +68 -113
yggdrasil/databricks/compute/command_execution.py +674 -0
yggdrasil/databricks/compute/exceptions.py +19 -0
yggdrasil/databricks/compute/execution_context.py +491 -282
yggdrasil/databricks/compute/remote.py +4 -14
yggdrasil/databricks/exceptions.py +10 -0
yggdrasil/databricks/sql/__init__.py +0 -4
yggdrasil/databricks/sql/engine.py +161 -173
yggdrasil/databricks/sql/exceptions.py +9 -1
yggdrasil/databricks/sql/statement_result.py +108 -120
yggdrasil/databricks/sql/warehouse.py +331 -92
yggdrasil/databricks/workspaces/io.py +89 -9
yggdrasil/databricks/workspaces/path.py +120 -72
yggdrasil/databricks/workspaces/workspace.py +214 -61
yggdrasil/exceptions.py +7 -0
yggdrasil/libs/databrickslib.py +23 -18
yggdrasil/libs/extensions/spark_extensions.py +1 -1
yggdrasil/libs/pandaslib.py +15 -6
yggdrasil/libs/polarslib.py +49 -13
yggdrasil/pyutils/__init__.py +1 -2
yggdrasil/pyutils/callable_serde.py +12 -19
yggdrasil/pyutils/exceptions.py +16 -0
yggdrasil/pyutils/python_env.py +14 -13
yggdrasil/pyutils/waiting_config.py +171 -0
yggdrasil/types/cast/arrow_cast.py +3 -0
yggdrasil/types/cast/pandas_cast.py +157 -169
yggdrasil/types/cast/polars_cast.py +11 -43
yggdrasil/types/dummy_class.py +81 -0
yggdrasil/version.py +1 -1
ygg-0.1.57.dist-info/RECORD +0 -66
yggdrasil/databricks/ai/loki.py +0 -53
{ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/WHEEL +0 -0
{ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/entry_points.txt +0 -0
{ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/licenses/LICENSE +0 -0
{ygg-0.1.57.dist-info → ygg-0.1.60.dist-info}/top_level.txt +0 -0
/yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0

yggdrasil/databricks/compute/command_execution.py ADDED Viewed

@@ -0,0 +1,674 @@
+import base64
+import gzip
+import io
+import json
+import logging
+import os
+import time
+from dataclasses import dataclass, field
+from json import JSONDecodeError
+from typing import TYPE_CHECKING, Optional, Any, Callable, Dict, Iterable, Union, Generator, Iterator
+import dill
+import pyarrow
+from .exceptions import ClientTerminatedSession
+from ...libs.databrickslib import databricks_sdk, DatabricksDummyClass
+from ...libs.pandaslib import PandasDataFrame
+from ...libs.polarslib import PolarsDataFrame
+from ...pyutils.exceptions import raise_parsed_traceback
+from ...pyutils.waiting_config import WaitingConfig, WaitingConfigArg
+if databricks_sdk is not None:
+    from databricks.sdk.errors import InternalError
+    from databricks.sdk.service.compute import (
+        Language, CommandExecutionAPI, CommandStatusResponse, CommandStatus, ResultType
+    )
+    DONE_STATES = {
+        CommandStatus.FINISHED, CommandStatus.CANCELLED, CommandStatus.ERROR
+    }
+    PENDING_STATES = {
+        CommandStatus.RUNNING, CommandStatus.QUEUED, CommandStatus.RUNNING
+    }
+    FAILED_STATES = {
+        CommandStatus.ERROR, CommandStatus.CANCELLED
+    }
+else:
+    InternalError = DatabricksDummyClass
+    Language = DatabricksDummyClass
+    CommandExecutionAPI = DatabricksDummyClass
+    ResultType = DatabricksDummyClass
+    DONE_STATES, PENDING_STATES, FAILED_STATES = set(), set(), set()
+if TYPE_CHECKING:
+    from .execution_context import ExecutionContext
+__all__ = [
+    "CommandExecution"
+]
+LOGGER = logging.getLogger(__name__)
+@dataclass
+class CommandExecution:
+    """Handle on one command submitted to a Databricks execution context.
+    The instance holds the command text and, once started, the command id
+    returned by the command execution API. Status is fetched lazily through
+    ``details`` and cached once the command reaches a terminal state.
+    """
+    # Execution context the command runs in; also gives access to the
+    # workspace client, the cluster and the context id.
+    context: "ExecutionContext"
+    # Identifier returned by the API when the command is started; None before.
+    command_id: Optional[str] = None
+    # Language passed to the API when the command is executed.
+    language: Optional[Language] = field(default=None, repr=False, compare=False, hash=False)
+    # Source text of the command sent to the cluster.
+    command: Optional[str] = field(default=None, repr=False, compare=False, hash=False)
+    # Environment variables forwarded to the remote command (name -> value).
+    environ: Optional[Dict[str, str]] = field(default=None, repr=False, compare=False, hash=False)
+    # Last status response fetched from the API; None until first fetched.
+    _details: Optional[CommandStatusResponse] = field(default=None, repr=False, compare=False, hash=False)
+    def __post_init__(self):
+        if self.environ:
+            if isinstance(self.environ, (list, tuple, set)):
+                self.environ = {
+                    k: os.getenv(k)
+                    for k in self.environ
+                }
+    def __call__(self, *args, **kwargs):
+        assert self.command, "Cannot call %s, missing command" % self
+        args_blob = dill.dumps([self.encode_object(_) for _ in args])
+        kwargs_blob = dill.dumps({k: self.encode_object(v) for k, v in kwargs.items()})
+        if self.environ:
+            env_blob = {
+                k: os.getenv(k) or v
+                for k, v in self.environ.items()
+                if os.getenv(k) or v
+            }
+        else:
+            env_blob = {}
+        args_b64 = base64.b64encode(args_blob).decode("ascii")
+        kwargs_b64 = base64.b64encode(kwargs_blob).decode("ascii")
+        command = (
+            self.command
+            .replace("__ARGS__", repr(args_b64))
+            .replace("__KWARGS__", repr(kwargs_b64))
+            .replace("__ENVIRON__", repr(env_blob))
+        )
+        run = (
+            self.create(
+                context=self.context,
+                command=command,
+                language=self.language
+            )
+            .start()
+        )
+        return run.wait(raise_error=True).result(raise_error=True)
+    def __bool__(self):
+        return self.done
+    def __repr__(self):
+        return "%s(url=%s)" % (
+            self.__class__.__name__,
+            self.url()
+        )
+    def __str__(self):
+        return self.url()
+    def url(self) -> str:
+        return "%s/command/%s" % (
+            self.context.url(),
+            self.command_id or "unknown"
+        )
+    def create(
+        self,
+        context: Optional["ExecutionContext"] = None,
+        func: Optional[Callable] = None,
+        command: Optional[str] = None,
+        language: Optional[Language] = None,
+        command_id: Optional[str] = None,
+        environ: Optional[Union[Iterable[str], Dict[str, str]]] = None,
+    ):
+        context = self.context if context is None else context
+        command = self.command if command is None else command
+        environ = self.environ if environ is None else environ
+        if environ is not None:
+            if not isinstance(environ, dict):
+                environ = {
+                    str(k): os.getenv(str(k))
+                    for k in environ
+                }
+            else:
+                environ = {
+                    str(k): str(v)
+                    for k, v in environ.items()
+                }
+        if not command:
+            if callable(func):
+                command = self.make_python_function_command(
+                    func=func,
+                )
+        if language is None:
+            language = context.language or Language.PYTHON
+        assert context is not None, "Missing context to execute command"
+        return CommandExecution(
+            context=context,
+            language=language,
+            command=command,
+            command_id=command_id,
+            environ=environ
+        )
+    def start(self, reset: bool = False):
+        if self.command_id:
+            if not reset:
+                return self
+            self._details = None
+            self.command_id = None
+        client = self.context.workspace_client().command_execution
+        assert self.command, "Missing command arg in %s" % self
+        try:
+            details = client.execute(
+                cluster_id=self.cluster_id,
+                context_id=self.context_id,
+                language=self.language,
+                command=self.command,
+            ).response
+        except Exception as e:
+            if "ontext" in str(e):  # context related
+                self.context = self.context.connect(reset=True)
+                details = client.execute(
+                    cluster_id=self.cluster_id,
+                    context_id=self.context_id,
+                    language=self.language,
+                    command=self.command,
+                ).response
+            else:
+                raise e
+        self.command_id = details.id
+        self._details = None
+        LOGGER.info("Started %s", self)
+        return self
+    @property
+    def workspace(self):
+        return self.context.workspace
+    @property
+    def cluster_id(self):
+        return self.context.cluster.cluster_id
+    @property
+    def context_id(self):
+        if not self.context.context_id:
+            self.context = self.context.connect()
+        return self.context.context_id
+    @property
+    def state(self):
+        return self.details.status
+    @property
+    def running(self):
+        return self.state in PENDING_STATES
+    @property
+    def done(self):
+        return self.state in DONE_STATES
+    @property
+    def details(self) -> CommandStatusResponse:
+        if self._details is None:
+            self._details = self.client().command_status(
+                cluster_id=self.cluster_id,
+                context_id=self.context_id,
+                command_id=self.command_id
+            )
+        elif self._details.status not in DONE_STATES:
+            self._details = self.client().command_status(
+                cluster_id=self.cluster_id,
+                context_id=self.context_id,
+                command_id=self.command_id
+            )
+        return self._details
+    @details.setter
+    def details(self, value: Optional[CommandStatusResponse]):
+        self._details = value
+        if value is not None:
+            assert isinstance(value, CommandStatusResponse), "%s.details must be CommandStatusResponse, got %s" %(
+                self,
+                type(value)
+            )
+            self.command_id = value.id
+    @property
+    def results_metadata(self):
+        return self.details.results
+    def client(self) -> CommandExecutionAPI:
+        return self.context.workspace_client().command_execution
+    def connect(self, reset: bool = False):
+        self.context = self.context.connect(language=self.language)
+        return self
+    def cancel(self, raise_error: bool = False):
+        if self.command_id:
+            try:
+                self.client().cancel_and_wait(
+                    cluster_id=self.cluster_id,
+                    command_id=self.command_id,
+                    context_id=self.context_id
+                )
+            except Exception as e:
+                if raise_error:
+                    raise e
+                LOGGER.exception(e)
+    def raise_for_status(self):
+        if self.state in FAILED_STATES:
+            raise_error_from_response(
+                response=self.details,
+                language=self.language
+            )
+        return self
+    def wait(
+        self,
+        wait: Optional[WaitingConfigArg] = True,
+        raise_error: bool = True
+    ):
+        if not self.command_id:
+            return self.start().wait(
+                wait=wait,
+                raise_error=raise_error
+            )
+        wait = WaitingConfig.check_arg(wait)
+        iteration, start = 0, time.time()
+        if wait.timeout:
+            while self.running:
+                wait.sleep(iteration=iteration, start=start)
+                iteration += 1
+        if raise_error:
+            try:
+                self.raise_for_status()
+            except ModuleNotFoundError as e:
+                module_name = e.name
+                if module_name and not module_name.startswith("ygg"):
+                    self.context.cluster.install_temporary_libraries(
+                        libraries=[module_name]
+                    )
+                    return (
+                        self
+                        .start(reset=True)
+                        .wait(wait=wait, raise_error=raise_error)
+                    )
+                else:
+                    raise e
+            except ClientTerminatedSession as e:
+                LOGGER.error(
+                    "%s aborted: %s",
+                    self,
+                    e
+                )
+                self.context = self.context.connect(reset=True)
+                return (
+                    self
+                    .start(reset=True)
+                    .wait(wait=wait, raise_error=raise_error)
+                )
+        return self
+    def encode_object(
+        self,
+        obj: Any,
+        byte_limit: int = 32 * 1024,
+        byref: Any = None,
+        recurse: Any = None,
+        compression: Optional[str] = None
+    ) -> str:
+        buffer = io.BytesIO()
+        if isinstance(obj, pyarrow.Table):
+            import pyarrow.parquet as pq
+            func = "pyarrow.parquet.read_table"
+            extension = "parquet"
+            pq.write_table(obj, buffer)
+            buffer.seek(0)
+            dbx_path = self.workspace.tmp_path(extension=extension)
+            dbx_path.write_bytes(buffer)
+            return json.dumps({
+                "func": func,
+                "file": dbx_path.full_path()
+            })
+        elif isinstance(obj, PolarsDataFrame):
+            func = "polars.read_parquet"
+            extension = "parquet"
+            obj.write_parquet(buffer)
+            buffer.seek(0)
+            dbx_path = self.workspace.tmp_path(extension=extension)
+            dbx_path.write_bytes(buffer)
+            return json.dumps({
+                "func": func,
+                "file": dbx_path.full_path()
+            })
+        elif isinstance(obj, PandasDataFrame):
+            try:
+                func = "pandas.read_parquet"
+                extension = "parquet"
+                obj.to_parquet(path=buffer)
+            except Exception as e:
+                LOGGER.warning(e)
+                compression = "gzip"
+                extension = "pkl.gz"
+                func = "pandas.read_pickle"
+                obj.to_pickle(path=buffer, compression=compression)
+            buffer.seek(0)
+            dbx_path = self.workspace.tmp_path(extension=extension)
+            dbx_path.write_bytes(buffer)
+            return json.dumps({
+                "func": func,
+                "cpr": compression,
+                "file": dbx_path.full_path()
+            })
+        elif isinstance(obj, (Generator, Iterator)):
+            return json.dumps({
+                "func": "generator",
+                "items": [
+                    self.encode_object(_, byte_limit=byte_limit, byref=byref, recurse=recurse, compression=compression)
+                    for _ in obj
+                ]
+            })
+        dill.dump(obj, buffer, byref=byref, recurse=recurse)
+        raw = buffer.getvalue()
+        if compression or len(raw) > byte_limit:
+            compression = compression or "gzip"
+            raw = gzip.compress(raw)
+        if len(raw) > byte_limit:
+            buffer.seek(0)
+            dbx_path = self.workspace.tmp_path(extension="bin")
+            dbx_path.write_bytes(buffer)
+            return json.dumps({
+                "func": "dill.load",
+                "cpr": compression,
+                "file": dbx_path.full_path()
+            })
+        return json.dumps({
+            "func": "dill.load",
+            "cpr": compression,
+            "b64": base64.b64encode(raw).decode("ascii")
+        })
+    def decode_payload(
+        self,
+        payload: Union[str, bytes, dict, list]
+    ):
+        if isinstance(payload, (str, bytes)):
+            try:
+                payload = json.loads(payload)
+            except JSONDecodeError:
+                return payload
+        if isinstance(payload, dict):
+            func, compression, b64, databricks_path = (
+                payload.get("func"), payload.get("cpr"),
+                payload.get("b64"), payload.get("file")
+            )
+            if isinstance(func, str) and func:
+                if b64:
+                    blob = base64.b64decode(b64.encode("ascii"))
+                elif databricks_path:
+                    blob = self.workspace.dbfs_path(databricks_path, temporary=True).read_bytes()
+                else:
+                    blob = None
+                if func == "dill.load":
+                    if compression == "gzip":
+                        import gzip
+                        blob = gzip.decompress(blob)
+                    return dill.loads(blob)
+                elif func.startswith("pyarrow."):
+                    import pyarrow.parquet as pq
+                    buff = io.BytesIO(blob)
+                    return pq.read_table(buff)
+                elif func.startswith("pandas."):
+                    import pandas
+                    buff = io.BytesIO(blob)
+                    if func == "pandas.read_parquet":
+                        return pandas.read_parquet(buff)
+                    elif func == "pandas.read_pickle":
+                        return pandas.read_pickle(buff, compression=compression)
+                    else:
+                        raise NotImplementedError
+                elif func == "generator":
+                    items = payload.get("items")
+                    def gen(it: Iterator = items):
+                        if it:
+                            for item in it:
+                                yield self.decode_payload(item)
+                    return gen()
+                elif func.startswith("polars."):
+                    import polars
+                    buff = io.BytesIO(blob)
+                    return polars.read_parquet(buff)
+                else:
+                    raise NotImplementedError
+        return payload
+    def make_python_function_command(
+        self,
+        func: Callable,
+        tag: str = "__CALL_RESULT__",
+        byref: Any = None,
+        recurse: Any = None,
+    ):
+        """Build the Python source that calls ``func`` remotely and prints its result.
+        The returned text still contains the ``__ARGS__``, ``__KWARGS__`` and
+        ``__ENVIRON__`` placeholders; ``__call__`` substitutes them before the
+        command is submitted. The generated script exports the environment,
+        rebuilds this command object, the function and its arguments, calls
+        the function and prints ``tag`` followed by the encoded result.
+        Args:
+            func: Callable to execute.
+            tag: Marker printed right before the encoded result.
+            byref: Forwarded to ``encode_object`` when encoding ``func``.
+            recurse: Forwarded to ``encode_object`` when encoding ``func``.
+        Returns:
+            The command source text.
+        """
+        # Serialize the command object (self) as ASCII-safe base64
+        # NOTE(review): this pickles every field of self, including ``context`` -
+        # confirm the context is safe and cheap to ship inside the command text.
+        command_bytes = dill.dumps(self)
+        command_b64 = base64.b64encode(command_bytes).decode("ascii")
+        # encode_object returns a JSON descriptor string for func (a "func" key
+        # plus inline "b64" data or a "file" path); it is embedded via repr().
+        serialized_func = self.encode_object(func, byref=byref, recurse=recurse)
+        cmd = f"""
+import base64, dill, os
+args_b64 = __ARGS__
+kwargs_b64 = __KWARGS__
+environ = __ENVIRON__
+if environ:
+    for k, v in environ.items():
+        if k and v:
+            os.environ[k] = v
+func_payload = {serialized_func!r}
+tag = {tag!r}
+command_b64 = {command_b64!r}
+command = dill.loads(base64.b64decode(command_b64.encode("ascii")))
+args = dill.loads(base64.b64decode(args_b64.encode("ascii")))
+kwargs = dill.loads(base64.b64decode(kwargs_b64.encode("ascii")))
+print(tag + command.encode_object(command.decode_payload(func_payload)(
+    *[command.decode_payload(x) for x in args],
+    **{{k: command.decode_payload(v) for k, v in kwargs.items()}}
+)))"""
+        return cmd
+    def decode_response(
+        self,
+        response: CommandStatusResponse,
+        language: Language,
+        raise_error: bool = True,
+        tag: str = "__CALL_RESULT__",
+        logger: bool = True,
+        unpickle: bool = True
+    ) -> Any:
+        """Mirror the old Cluster.execute_command result handling.
+        Args:
+            response: Raw command execution response.
+            language: Language executed
+            raise_error: Raise error if response is failed
+            tag: Result tag
+            logger: Print logs
+            unpickle: Unpickle
+        Returns:
+            The decoded output string.
+        """
+        raise_error_from_response(
+            response=response,
+            language=language,
+            raise_error=raise_error
+        )
+        results = response.results
+        # normal output
+        if results.result_type == ResultType.TEXT:
+            data = results.data or ""
+        else:
+            raise NotImplementedError(
+                "Cannot decode result form %s" % response
+            )
+        raw_result = data
+        if tag in raw_result:
+            logs_text, raw_result = raw_result.split(tag, 1)
+            try:
+                if logger:
+                    for line in logs_text.splitlines():
+                        stripped_log = line.strip()
+                        if stripped_log:
+                            print(stripped_log)
+            except Exception as e:
+                LOGGER.warning(
+                    "Cannot print logs from %s: %s",
+                    logs_text,
+                    e
+                )
+        if unpickle:
+            return self.decode_payload(payload=raw_result)
+        return raw_result
+    def result(
+        self,
+        raise_error: bool = True,
+        unpickle: bool = True
+    ) -> Any:
+        try:
+            self.wait(raise_error=raise_error)
+            obj = self.decode_response(
+                response=self.details,
+                language=self.language,
+                raise_error=raise_error,
+                unpickle=unpickle
+            )
+        except (InternalError, ClientTerminatedSession):
+            self.context = self.context.connect(reset=True)
+            return (
+                self
+                .start(reset=True)
+                .result(raise_error=raise_error, unpickle=unpickle)
+            )
+        except ModuleNotFoundError as e:
+            module_name = e.name
+            if module_name and not module_name.startswith("ygg"):
+                self.context.cluster.install_temporary_libraries(libraries=[module_name])
+                return (
+                    self
+                    .start(reset=True)
+                    .result(raise_error=raise_error, unpickle=unpickle)
+                )
+            else:
+                raise e
+        return obj
+def raise_error_from_response(
+    response: CommandStatusResponse,
+    language: Language,
+    raise_error: bool = True
+):
+    if raise_error:
+        results = response.results
+        if results.result_type == ResultType.ERROR:
+            message = results.cause or "Command execution failed"
+            if "client terminated the session" in message:
+                raise ClientTerminatedSession(message)
+            if language == Language.PYTHON:
+                raise_parsed_traceback(message)
+            raise RuntimeError(str(response))

yggdrasil/databricks/compute/exceptions.py ADDED Viewed

@@ -0,0 +1,19 @@
+from ...exceptions import YGGException
+__all__ = [
+    "ComputeException",
+    "ClientTerminatedSession",
+    "CommandExecutionException"
+]
+class ComputeException(YGGException):
+    pass
+class CommandExecutionException(ComputeException):
+    pass
+class ClientTerminatedSession(CommandExecutionException):
+    pass

ygg 0.1.57__py3-none-any.whl → 0.1.60__py3-none-any.whl

ygg 0.1.57__py3-none-any.whl → 0.1.60__py3-none-any.whl