flyteplugins-databricks 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ from flyteplugins.databricks.connector import DatabricksConnector
2
+ from flyteplugins.databricks.task import Databricks
3
+
4
+ __all__ = ["Databricks", "DatabricksConnector"]
@@ -0,0 +1,151 @@
1
+ import http
2
+ import json
3
+ import os
4
+ import typing
5
+ from dataclasses import dataclass
6
+ from typing import Optional
7
+
8
+ import aiohttp
9
+ from flyte import logger
10
+ from flyte.connectors import AsyncConnector, ConnectorRegistry, Resource, ResourceMeta
11
+ from flyte.connectors.utils import convert_to_flyte_phase
12
+ from flyteidl2.core.execution_pb2 import TaskExecution, TaskLog
13
+ from flyteidl2.core.tasks_pb2 import TaskTemplate
14
+ from google.protobuf.json_format import MessageToDict
15
+
16
+ DATABRICKS_API_ENDPOINT = "/api/2.1/jobs"
17
+ DEFAULT_DATABRICKS_INSTANCE_ENV_KEY = "FLYTE_DATABRICKS_INSTANCE"
18
+
19
+
20
@dataclass
class DatabricksJobMetadata(ResourceMeta):
    """Connector-side handle for a submitted Databricks job run.

    Instances are serialized between ``create``/``get``/``delete`` calls so the
    connector can locate the run again without holding any local state.
    """

    # Workspace hostname, e.g. "<account>.cloud.databricks.com" (no scheme).
    databricks_instance: str
    # Run id returned by the Jobs runs/submit endpoint, stored as a string.
    run_id: str
24
+
25
+
26
def _get_databricks_job_spec(task_template: TaskTemplate) -> dict:
    """Build the request body for the Databricks Jobs 2.1 ``runs/submit`` API.

    Starts from the user-supplied ``databricksConf`` in the task template's custom
    field, fills cluster defaults (container image, spark conf, env vars) when a
    new cluster is requested, and attaches the Flyte entrypoint script fetched
    from the pinned flytetools git commit.

    Args:
        task_template: Task template carrying the container spec and the
            Databricks/Spark configuration in its ``custom`` proto struct.

    Returns:
        A dict ready to be JSON-encoded and POSTed to ``runs/submit``.

    Raises:
        ValueError: If ``databricksConf`` is missing, or neither
            ``existing_cluster_id`` nor ``new_cluster`` is specified.
    """
    custom = MessageToDict(task_template.custom)
    container = task_template.container
    envs = container.env
    databricks_job = custom.get("databricksConf")
    if databricks_job is None:
        raise ValueError("Missing Databricks job configuration in task template.")
    if databricks_job.get("existing_cluster_id") is None:
        new_cluster = databricks_job.get("new_cluster")
        if new_cluster is None:
            raise ValueError("Either existing_cluster_id or new_cluster must be specified")
        # Default the cluster image to the task's container image so the run
        # executes in the same environment the task was built for.
        if not new_cluster.get("docker_image"):
            new_cluster["docker_image"] = {"url": container.image}
        if not new_cluster.get("spark_conf"):
            new_cluster["spark_conf"] = custom.get("sparkConf", {})
        # Propagate the task container's env vars into the Spark env; explicit
        # user-provided spark_env_vars take the base and are extended in place.
        container_env = {env.key: env.value for env in envs}
        if not new_cluster.get("spark_env_vars"):
            new_cluster["spark_env_vars"] = container_env
        else:
            new_cluster["spark_env_vars"].update(container_env)
    # https://docs.databricks.com/api/workspace/jobs/submit
    databricks_job["spark_python_task"] = {
        "python_file": "flyteplugins/databricks/entrypoint.py",
        "parameters": list(container.args),
        "source": "GIT",
    }
    # https://github.com/flyteorg/flytetools/blob/master/flyteplugins/databricks/entrypoint.py
    databricks_job["git_source"] = {
        "git_url": "https://github.com/flyteorg/flytetools",
        "git_provider": "gitHub",
        "git_commit": "194364210c47c49ce66c419e8fb68d6f9c06fd7e",
    }

    # Fix: the original passed the payload as an extra positional logging arg
    # with no %-placeholder, which makes the logging module raise a formatting
    # error when DEBUG is enabled. Use lazy %-style formatting instead.
    logger.debug("databricks_job spec: %s", databricks_job)
    return databricks_job
60
+
61
+
62
class DatabricksConnector(AsyncConnector):
    """Async Flyte connector that executes ``databricks`` tasks through the
    Databricks Jobs 2.1 REST API (submit / poll / cancel)."""

    name: str = "Databricks Connector"
    task_type_name: str = "databricks"
    metadata_type: type = DatabricksJobMetadata

    async def create(
        self,
        task_template: TaskTemplate,
        inputs: Optional[typing.Dict[str, typing.Any]] = None,
        databricks_token: Optional[str] = None,
        **kwargs,
    ) -> DatabricksJobMetadata:
        """Submit a one-time run and return metadata identifying it.

        The instance comes from the task config's ``databricksInstance``,
        falling back to the connector's environment variable.
        """
        payload = json.dumps(_get_databricks_job_spec(task_template))
        custom = MessageToDict(task_template.custom)
        databricks_instance = custom.get("databricksInstance", os.getenv(DEFAULT_DATABRICKS_INSTANCE_ENV_KEY))

        if not databricks_instance:
            raise ValueError(
                f"Missing databricks instance. Please set the value through the task config or"
                f" set the {DEFAULT_DATABRICKS_INSTANCE_ENV_KEY} environment variable in the connector."
            )

        databricks_url = f"https://{databricks_instance}{DATABRICKS_API_ENDPOINT}/runs/submit"

        async with aiohttp.ClientSession() as client:
            async with client.post(databricks_url, headers=get_header(databricks_token), data=payload) as resp:
                # Parse the body first: the error response carries the details.
                body = await resp.json()
                if resp.status != http.HTTPStatus.OK:
                    raise RuntimeError(f"Failed to create databricks job with error: {body}")

        return DatabricksJobMetadata(databricks_instance=databricks_instance, run_id=str(body["run_id"]))

    async def get(
        self, resource_meta: DatabricksJobMetadata, databricks_token: Optional[str] = None, **kwargs
    ) -> Resource:
        """Poll the run and map its state onto a Flyte task phase."""
        databricks_instance = resource_meta.databricks_instance
        databricks_url = (
            f"https://{databricks_instance}{DATABRICKS_API_ENDPOINT}/runs/get?run_id={resource_meta.run_id}"
        )

        async with aiohttp.ClientSession() as client:
            async with client.get(databricks_url, headers=get_header(databricks_token)) as resp:
                if resp.status != http.HTTPStatus.OK:
                    raise RuntimeError(f"Failed to get databricks job {resource_meta.run_id} with error: {resp.reason}")
                body = await resp.json()

        cur_phase = TaskExecution.UNDEFINED
        message = ""
        state = body.get("state")

        # The databricks job's state is determined by life_cycle_state and result_state.
        # https://docs.databricks.com/en/workflows/jobs/jobs-2.0-api.html#runresultstate
        if state:
            life_cycle_state = state.get("life_cycle_state")
            if result_state_is_available(life_cycle_state):
                cur_phase = convert_to_flyte_phase(state.get("result_state"))
            else:
                cur_phase = convert_to_flyte_phase(life_cycle_state)
            message = state.get("state_message")

        job_id = body.get("job_id")
        console_url = f"https://{databricks_instance}/#job/{job_id}/run/{resource_meta.run_id}"
        return Resource(
            phase=cur_phase,
            message=message,
            log_links=[TaskLog(uri=console_url, name="Databricks Console")],
        )

    async def delete(self, resource_meta: DatabricksJobMetadata, databricks_token: Optional[str] = None, **kwargs):
        """Request cancellation of the run via ``runs/cancel``."""
        databricks_url = f"https://{resource_meta.databricks_instance}{DATABRICKS_API_ENDPOINT}/runs/cancel"
        payload = json.dumps({"run_id": resource_meta.run_id})

        async with aiohttp.ClientSession() as client:
            async with client.post(databricks_url, headers=get_header(databricks_token), data=payload) as resp:
                if resp.status != http.HTTPStatus.OK:
                    raise RuntimeError(
                        f"Failed to cancel databricks job {resource_meta.run_id} with error: {resp.reason}"
                    )
                await resp.json()
141
+
142
+
143
def get_header(token: Optional[str]) -> typing.Dict[str, str]:
    """Build the HTTP headers for Databricks REST API calls.

    Args:
        token: Databricks access token. Annotated Optional (fix: the original
            hint said ``str``, but every call site passes ``Optional[str]``);
            a ``None`` token yields a header the server will reject with 403.

    Returns:
        Headers carrying bearer authorization and a JSON content type.
    """
    return {"Authorization": f"Bearer {token}", "content-type": "application/json"}
145
+
146
+
147
def result_state_is_available(life_cycle_state: str) -> bool:
    """Tell whether the run's ``result_state`` can be read.

    Databricks only populates ``result_state`` once the run's life-cycle state
    has reached its terminal value.
    """
    terminal_state = "TERMINATED"
    return life_cycle_state == terminal_state
149
+
150
+
151
+ ConnectorRegistry.register(DatabricksConnector())
@@ -0,0 +1,66 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, Optional, Union
3
+
4
+ from flyte._task_plugins import TaskPluginRegistry
5
+ from flyte.connectors import AsyncConnectorExecutorMixin
6
+ from flyte.models import SerializationContext
7
+ from flyteidl2.plugins.spark_pb2 import SparkApplication, SparkJob
8
+ from flyteplugins.spark import Spark
9
+ from flyteplugins.spark.task import PysparkFunctionTask
10
+ from google.protobuf.json_format import MessageToDict
11
+
12
+
13
@dataclass
class Databricks(Spark):
    """
    Use this to configure a Databricks task. Tasks marked with this config will
    automatically execute natively on the Databricks platform as a distributed
    Spark job.

    Args:
        databricks_conf: Databricks job configuration compliant with API version 2.1, supporting 2.0 use cases.
            For the configuration structure, see: https://docs.databricks.com/dev-tools/api/2.0/jobs.html#request-structure
            For updates in API 2.1, refer to: https://docs.databricks.com/en/workflows/jobs/jobs-api-updates.html
        databricks_instance: Domain name of your deployment. Use the form <account>.cloud.databricks.com.
        databricks_token: the name of the secret containing the Databricks token for authentication.
    """

    databricks_conf: Optional[Dict[str, Union[str, dict]]] = None
    databricks_instance: Optional[str] = None
    databricks_token: Optional[str] = None
30
+
31
+
32
class DatabricksFunctionTask(AsyncConnectorExecutorMixin, PysparkFunctionTask):
    """Plugin task that serializes local Python code for execution on
    Databricks via the Databricks connector, within a Spark context."""

    plugin_config: Databricks

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Route execution through the "databricks" connector rather than the
        # base Spark path.
        self.task_type = "databricks"

    def custom_config(self, sctx: SerializationContext) -> Dict[str, Any]:
        """Serialize the plugin config into the ``custom`` dict shipped with
        the task template."""
        cfg = self.plugin_config

        spark_job = SparkJob(
            sparkConf=cfg.spark_conf,
            hadoopConf=cfg.hadoop_conf,
            mainApplicationFile=cfg.applications_path or "local://" + sctx.get_entrypoint_path(),
            executorPath=cfg.executor_path or sctx.interpreter_path,
            mainClass="",
            applicationType=SparkApplication.PYTHON,
            driverPod=cfg.driver_pod.to_k8s_pod() if cfg.driver_pod else None,
            executorPod=cfg.executor_pod.to_k8s_pod() if cfg.executor_pod else None,
            databricksConf=cfg.databricks_conf,
            databricksInstance=cfg.databricks_instance,
        )

        custom = MessageToDict(spark_job)
        # The connector resolves this secret name to obtain the API token.
        custom["secrets"] = {"databricks_token": cfg.databricks_token}
        return custom
64
+
65
+
66
+ TaskPluginRegistry.register(Databricks, DatabricksFunctionTask)
@@ -0,0 +1,57 @@
1
+ Metadata-Version: 2.4
2
+ Name: flyteplugins-databricks
3
+ Version: 2.0.0
4
+ Summary: Databricks plugin for flyte
5
+ Author-email: Kevin Su <pingsutw@users.noreply.github.com>
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: flyte[connector]
9
+ Requires-Dist: aiohttp
10
+ Requires-Dist: nest-asyncio
11
+ Requires-Dist: flyteplugins-spark
12
+
13
+ # Databricks Plugin for Flyte
14
+
15
+ This plugin provides Databricks integration for Flyte, enabling you to run Spark jobs on Databricks as Flyte tasks.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install flyteplugins-databricks
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```python
26
+ from flyte import task
+ from flyteplugins.databricks import Databricks, DatabricksConnector
27
+
28
+ @task(task_config=Databricks(
29
+ databricks_conf={
30
+ "run_name": "flyte databricks plugin",
31
+ "new_cluster": {
32
+ "spark_version": "13.3.x-scala2.12",
33
+ "autoscale": {
34
+ "min_workers": 1,
35
+ "max_workers": 1,
36
+ },
37
+ "node_type_id": "m6i.large",
38
+ "num_workers": 1,
39
+ "aws_attributes": {
40
+ "availability": "SPOT_WITH_FALLBACK",
41
+ "instance_profile_arn": "arn:aws:iam::339713193121:instance-profile/databricks-demo",
42
+ "ebs_volume_type": "GENERAL_PURPOSE_SSD",
43
+ "ebs_volume_count": 1,
44
+ "ebs_volume_size": 100,
45
+ "first_on_demand": 1,
46
+ },
47
+ },
48
+ # "existing_cluster_id": "1113-204018-tb9vr2fm", # use existing cluster id if you want
49
+ "timeout_seconds": 3600,
50
+ "max_retries": 1,
51
+ },
52
+ databricks_instance="mycompany.cloud.databricks.com",
53
+ ))
54
+ def my_spark_task() -> int:
55
+ # Your Spark code here
56
+ return 42
57
+ ```
@@ -0,0 +1,8 @@
1
+ flyteplugins/databricks/__init__.py,sha256=FfSNiu0JuKflhA0_i0VrNuwE_SX08lY4CBH1nY8AyD0,167
2
+ flyteplugins/databricks/connector.py,sha256=ysg8ALL0Bmnmi32BszOTlpS0-kQWz843sX8qIwqXiaA,6576
3
+ flyteplugins/databricks/task.py,sha256=ZscXO3PoocpCiBa9PM3Joe0_KytV7mw2GvCl7B69rVg,2863
4
+ flyteplugins_databricks-2.0.0.dist-info/METADATA,sha256=WOsjPV9HE0Q_XbzkvOsXvDfnx8frVAF1cxLBiUpVB7E,1686
5
+ flyteplugins_databricks-2.0.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
6
+ flyteplugins_databricks-2.0.0.dist-info/entry_points.txt,sha256=TrAGjurydxqKpeLlNdhOv0Jc8WrmaQ6XIANTnWF9LCE,86
7
+ flyteplugins_databricks-2.0.0.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
8
+ flyteplugins_databricks-2.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [flyte.connectors]
2
+ databricks = flyteplugins.databricks.connector:DatabricksConnector
@@ -0,0 +1 @@
1
+ flyteplugins