apache-airflow-providers-databricks 6.5.0__tar.gz → 6.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apache-airflow-providers-databricks has been flagged as possibly problematic by the registry.

Files changed (22)
  1. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/PKG-INFO +17 -9
  2. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/README.rst +10 -6
  3. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/LICENSE +4 -4
  4. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/__init__.py +1 -1
  5. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/get_provider_info.py +18 -1
  6. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/hooks/databricks.py +20 -0
  7. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/hooks/databricks_base.py +8 -8
  8. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/operators/databricks.py +360 -109
  9. apache_airflow_providers_databricks-6.6.0/airflow/providers/databricks/operators/databricks_workflow.py +312 -0
  10. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/pyproject.toml +7 -3
  11. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/hooks/__init__.py +0 -0
  12. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/hooks/databricks_sql.py +0 -0
  13. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/operators/__init__.py +0 -0
  14. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/operators/databricks_repos.py +0 -0
  15. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/operators/databricks_sql.py +0 -0
  16. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/sensors/__init__.py +0 -0
  17. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/sensors/databricks_partition.py +0 -0
  18. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/sensors/databricks_sql.py +0 -0
  19. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/triggers/__init__.py +0 -0
  20. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/triggers/databricks.py +0 -0
  21. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/utils/__init__.py +0 -0
  22. {apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/utils/databricks.py +0 -0
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: apache-airflow-providers-databricks
- Version: 6.5.0
+ Version: 6.6.0
  Summary: Provider package apache-airflow-providers-databricks for Apache Airflow
  Keywords: airflow-provider,databricks,airflow,integration
  Author-email: Apache Software Foundation <dev@airflow.apache.org>
@@ -25,12 +25,16 @@ Requires-Dist: aiohttp>=3.9.2, <4
  Requires-Dist: apache-airflow-providers-common-sql>=1.10.0
  Requires-Dist: apache-airflow>=2.7.0
  Requires-Dist: databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0
+ Requires-Dist: mergedeep>=1.3.4
+ Requires-Dist: pandas>=1.5.3,<2.2;python_version<"3.9"
+ Requires-Dist: pandas>=2.1.2,<2.2;python_version>="3.9"
+ Requires-Dist: pyarrow>=14.0.1
  Requires-Dist: requests>=2.27.0,<3
  Requires-Dist: apache-airflow-providers-common-sql ; extra == "common.sql"
  Requires-Dist: databricks-sdk==0.10.0 ; extra == "sdk"
  Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
- Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0/changelog.html
- Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0
+ Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0/changelog.html
+ Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0
  Project-URL: Slack Chat, https://s.apache.org/airflow-slack
  Project-URL: Source Code, https://github.com/apache/airflow
  Project-URL: Twitter, https://twitter.com/ApacheAirflow
@@ -82,7 +86,7 @@ Provides-Extra: sdk
 
  Package ``apache-airflow-providers-databricks``
 
- Release: ``6.5.0``
+ Release: ``6.6.0``
 
 
  `Databricks <https://databricks.com/>`__
@@ -95,7 +99,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
  are in ``airflow.providers.databricks`` python package.
 
  You can find package information and changelog for the provider
- in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0/>`_.
+ in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0/>`_.
 
  Installation
  ------------
@@ -109,15 +113,19 @@ The package supports the following python versions: 3.8,3.9,3.10,3.11,3.12
  Requirements
  ------------
 
- ======================================= ==========================
+ ======================================= =========================================
  PIP package Version required
- ======================================= ==========================
+ ======================================= =========================================
  ``apache-airflow`` ``>=2.7.0``
  ``apache-airflow-providers-common-sql`` ``>=1.10.0``
  ``requests`` ``>=2.27.0,<3``
  ``databricks-sql-connector`` ``>=2.0.0,!=2.9.0,<3.0.0``
  ``aiohttp`` ``>=3.9.2,<4``
- ======================================= ==========================
+ ``mergedeep`` ``>=1.3.4``
+ ``pandas`` ``>=2.1.2,<2.2; python_version >= "3.9"``
+ ``pandas`` ``>=1.5.3,<2.2; python_version < "3.9"``
+ ``pyarrow`` ``>=14.0.1``
+ ======================================= =========================================
 
  Cross provider package dependencies
  -----------------------------------
@@ -139,4 +147,4 @@ Dependent package
  ============================================================================================================ ==============
 
  The changelog for the provider package can be found in the
- `changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0/changelog.html>`_.
+ `changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0/changelog.html>`_.
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/README.rst
@@ -42,7 +42,7 @@
 
  Package ``apache-airflow-providers-databricks``
 
- Release: ``6.5.0``
+ Release: ``6.6.0``
 
 
  `Databricks <https://databricks.com/>`__
@@ -55,7 +55,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
  are in ``airflow.providers.databricks`` python package.
 
  You can find package information and changelog for the provider
- in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0/>`_.
+ in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0/>`_.
 
  Installation
  ------------
@@ -69,15 +69,19 @@ The package supports the following python versions: 3.8,3.9,3.10,3.11,3.12
  Requirements
  ------------
 
- ======================================= ==========================
+ ======================================= =========================================
  PIP package Version required
- ======================================= ==========================
+ ======================================= =========================================
  ``apache-airflow`` ``>=2.7.0``
  ``apache-airflow-providers-common-sql`` ``>=1.10.0``
  ``requests`` ``>=2.27.0,<3``
  ``databricks-sql-connector`` ``>=2.0.0,!=2.9.0,<3.0.0``
  ``aiohttp`` ``>=3.9.2,<4``
- ======================================= ==========================
+ ``mergedeep`` ``>=1.3.4``
+ ``pandas`` ``>=2.1.2,<2.2; python_version >= "3.9"``
+ ``pandas`` ``>=1.5.3,<2.2; python_version < "3.9"``
+ ``pyarrow`` ``>=14.0.1``
+ ======================================= =========================================
 
  Cross provider package dependencies
  -----------------------------------
@@ -99,4 +103,4 @@ Dependent package
  ============================================================================================================ ==============
 
  The changelog for the provider package can be found in the
- `changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0/changelog.html>`_.
+ `changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0/changelog.html>`_.
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/LICENSE
@@ -215,7 +215,7 @@ Third party Apache 2.0 licenses
 
  The following components are provided under the Apache 2.0 License.
  See project link for details. The text of each license is also included
- at licenses/LICENSE-[project].txt.
+ at 3rd-party-licenses/LICENSE-[project].txt.
 
  (ALv2 License) hue v4.3.0 (https://github.com/cloudera/hue/)
  (ALv2 License) jqclock v2.3.0 (https://github.com/JohnRDOrazio/jQuery-Clock-Plugin)
@@ -227,7 +227,7 @@ MIT licenses
  ========================================================================
 
  The following components are provided under the MIT License. See project link for details.
- The text of each license is also included at licenses/LICENSE-[project].txt.
+ The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
 
  (MIT License) jquery v3.5.1 (https://jquery.org/license/)
  (MIT License) dagre-d3 v0.6.4 (https://github.com/cpettitt/dagre-d3)
@@ -243,11 +243,11 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
  BSD 3-Clause licenses
  ========================================================================
  The following components are provided under the BSD 3-Clause license. See project links for details.
- The text of each license is also included at licenses/LICENSE-[project].txt.
+ The text of each license is also included at 3rd-party-licenses/LICENSE-[project].txt.
 
  (BSD 3 License) d3 v5.16.0 (https://d3js.org)
  (BSD 3 License) d3-shape v2.1.0 (https://github.com/d3/d3-shape)
  (BSD 3 License) cgroupspy 0.2.1 (https://github.com/cloudsigma/cgroupspy)
 
  ========================================================================
- See licenses/LICENSES-ui.txt for packages used in `/airflow/www`
+ See 3rd-party-licenses/LICENSES-ui.txt for packages used in `/airflow/www`
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/__init__.py
@@ -29,7 +29,7 @@ from airflow import __version__ as airflow_version
 
  __all__ = ["__version__"]
 
- __version__ = "6.5.0"
+ __version__ = "6.6.0"
 
  if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
  "2.7.0"
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/get_provider_info.py
@@ -28,8 +28,9 @@ def get_provider_info():
  "name": "Databricks",
  "description": "`Databricks <https://databricks.com/>`__\n",
  "state": "ready",
- "source-date-epoch": 1716287262,
+ "source-date-epoch": 1718604145,
  "versions": [
+ "6.6.0",
  "6.5.0",
  "6.4.0",
  "6.3.0",
@@ -74,6 +75,10 @@ def get_provider_info():
  "requests>=2.27.0,<3",
  "databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0",
  "aiohttp>=3.9.2, <4",
+ "mergedeep>=1.3.4",
+ 'pandas>=2.1.2,<2.2;python_version>="3.9"',
+ 'pandas>=1.5.3,<2.2;python_version<"3.9"',
+ "pyarrow>=14.0.1",
  ],
  "additional-extras": [
  {
@@ -92,6 +97,7 @@ def get_provider_info():
  "/docs/apache-airflow-providers-databricks/operators/notebook.rst",
  "/docs/apache-airflow-providers-databricks/operators/submit_run.rst",
  "/docs/apache-airflow-providers-databricks/operators/run_now.rst",
+ "/docs/apache-airflow-providers-databricks/operators/task.rst",
  ],
  "logo": "/integration-logos/databricks/Databricks.png",
  "tags": ["service"],
@@ -117,6 +123,13 @@ def get_provider_info():
  "logo": "/integration-logos/databricks/Databricks.png",
  "tags": ["service"],
  },
+ {
+ "integration-name": "Databricks Workflow",
+ "external-doc-url": "https://docs.databricks.com/en/workflows/index.html",
+ "how-to-guide": ["/docs/apache-airflow-providers-databricks/operators/workflow.rst"],
+ "logo": "/integration-logos/databricks/Databricks.png",
+ "tags": ["service"],
+ },
  ],
  "operators": [
  {
@@ -131,6 +144,10 @@ def get_provider_info():
  "integration-name": "Databricks Repos",
  "python-modules": ["airflow.providers.databricks.operators.databricks_repos"],
  },
+ {
+ "integration-name": "Databricks Workflow",
+ "python-modules": ["airflow.providers.databricks.operators.databricks_workflow"],
+ },
  ],
  "hooks": [
  {
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/hooks/databricks.py
@@ -29,6 +29,7 @@ or the ``api/2.1/jobs/runs/submit``
  from __future__ import annotations
 
  import json
+ from enum import Enum
  from typing import Any
 
  from requests import exceptions as requests_exceptions
@@ -63,6 +64,23 @@ WORKSPACE_GET_STATUS_ENDPOINT = ("GET", "api/2.0/workspace/get-status")
  SPARK_VERSIONS_ENDPOINT = ("GET", "api/2.0/clusters/spark-versions")
 
 
+ class RunLifeCycleState(Enum):
+ """Enum for the run life cycle state concept of Databricks runs.
+
+ See more information at: https://docs.databricks.com/api/azure/workspace/jobs/listruns#runs-state-life_cycle_state
+ """
+
+ BLOCKED = "BLOCKED"
+ INTERNAL_ERROR = "INTERNAL_ERROR"
+ PENDING = "PENDING"
+ QUEUED = "QUEUED"
+ RUNNING = "RUNNING"
+ SKIPPED = "SKIPPED"
+ TERMINATED = "TERMINATED"
+ TERMINATING = "TERMINATING"
+ WAITING_FOR_RETRY = "WAITING_FOR_RETRY"
+
+
  class RunState:
  """Utility class for the run state concept of Databricks runs."""
 
@@ -238,6 +256,7 @@ class DatabricksHook(BaseDatabricksHook):
  expand_tasks: bool = False,
  job_name: str | None = None,
  page_token: str | None = None,
+ include_user_names: bool = False,
  ) -> list[dict[str, Any]]:
  """
  List the jobs in the Databricks Job Service.
@@ -257,6 +276,7 @@ class DatabricksHook(BaseDatabricksHook):
  payload: dict[str, Any] = {
  "limit": limit,
  "expand_tasks": expand_tasks,
+ "include_user_names": include_user_names,
  }
  payload["page_token"] = page_token
  if job_name:
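
Editor's note: the hunks above add a RunLifeCycleState enum and an include_user_names flag on DatabricksHook.list_jobs. A rough usage sketch follows; the connection id, job name, and run id are placeholders, not taken from the diff:

    from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState

    # Assumes a configured "databricks_default" Airflow connection (placeholder).
    hook = DatabricksHook(databricks_conn_id="databricks_default")

    # New in 6.6.0: include_user_names is forwarded in the jobs/list payload.
    jobs = hook.list_jobs(job_name="example_workflow_job", include_user_names=True)
    print([job["job_id"] for job in jobs])

    # RunLifeCycleState replaces bare strings such as "TERMINATED" when inspecting a run.
    run_state = hook.get_run_state(run_id=12345)  # placeholder run id
    if run_state.life_cycle_state == RunLifeCycleState.TERMINATED.value:
        print("Run finished with result:", run_state.result_state)
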
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/hooks/databricks_base.py
@@ -499,21 +499,21 @@ class BaseDatabricksHook(BaseHook):
  )
  return self.databricks_conn.extra_dejson["token"]
  elif not self.databricks_conn.login and self.databricks_conn.password:
- self.log.info("Using token auth.")
+ self.log.debug("Using token auth.")
  return self.databricks_conn.password
  elif "azure_tenant_id" in self.databricks_conn.extra_dejson:
  if self.databricks_conn.login == "" or self.databricks_conn.password == "":
  raise AirflowException("Azure SPN credentials aren't provided")
- self.log.info("Using AAD Token for SPN.")
+ self.log.debug("Using AAD Token for SPN.")
  return self._get_aad_token(DEFAULT_DATABRICKS_SCOPE)
  elif self.databricks_conn.extra_dejson.get("use_azure_managed_identity", False):
- self.log.info("Using AAD Token for managed identity.")
+ self.log.debug("Using AAD Token for managed identity.")
  self._check_azure_metadata_service()
  return self._get_aad_token(DEFAULT_DATABRICKS_SCOPE)
  elif self.databricks_conn.extra_dejson.get("service_principal_oauth", False):
  if self.databricks_conn.login == "" or self.databricks_conn.password == "":
  raise AirflowException("Service Principal credentials aren't provided")
- self.log.info("Using Service Principal Token.")
+ self.log.debug("Using Service Principal Token.")
  return self._get_sp_token(OIDC_TOKEN_SERVICE_URL.format(self.databricks_conn.host))
  elif raise_error:
  raise AirflowException("Token authentication isn't configured")
@@ -527,21 +527,21 @@ class BaseDatabricksHook(BaseHook):
  )
  return self.databricks_conn.extra_dejson["token"]
  elif not self.databricks_conn.login and self.databricks_conn.password:
- self.log.info("Using token auth.")
+ self.log.debug("Using token auth.")
  return self.databricks_conn.password
  elif "azure_tenant_id" in self.databricks_conn.extra_dejson:
  if self.databricks_conn.login == "" or self.databricks_conn.password == "":
  raise AirflowException("Azure SPN credentials aren't provided")
- self.log.info("Using AAD Token for SPN.")
+ self.log.debug("Using AAD Token for SPN.")
  return await self._a_get_aad_token(DEFAULT_DATABRICKS_SCOPE)
  elif self.databricks_conn.extra_dejson.get("use_azure_managed_identity", False):
- self.log.info("Using AAD Token for managed identity.")
+ self.log.debug("Using AAD Token for managed identity.")
  await self._a_check_azure_metadata_service()
  return await self._a_get_aad_token(DEFAULT_DATABRICKS_SCOPE)
  elif self.databricks_conn.extra_dejson.get("service_principal_oauth", False):
  if self.databricks_conn.login == "" or self.databricks_conn.password == "":
  raise AirflowException("Service Principal credentials aren't provided")
- self.log.info("Using Service Principal Token.")
+ self.log.debug("Using Service Principal Token.")
  return await self._a_get_sp_token(OIDC_TOKEN_SERVICE_URL.format(self.databricks_conn.host))
  elif raise_error:
  raise AirflowException("Token authentication isn't configured")
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/airflow/providers/databricks/operators/databricks.py
@@ -20,6 +20,7 @@
  from __future__ import annotations
 
  import time
+ from abc import ABC, abstractmethod
  from functools import cached_property
  from logging import Logger
  from typing import TYPE_CHECKING, Any, Sequence
@@ -29,13 +30,18 @@ from deprecated import deprecated
  from airflow.configuration import conf
  from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
  from airflow.models import BaseOperator, BaseOperatorLink, XCom
- from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunState
+ from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState, RunState
+ from airflow.providers.databricks.operators.databricks_workflow import (
+ DatabricksWorkflowTaskGroup,
+ WorkflowRunMetadata,
+ )
  from airflow.providers.databricks.triggers.databricks import DatabricksExecutionTrigger
  from airflow.providers.databricks.utils.databricks import normalise_json_content, validate_trigger_event
 
  if TYPE_CHECKING:
  from airflow.models.taskinstancekey import TaskInstanceKey
  from airflow.utils.context import Context
+ from airflow.utils.task_group import TaskGroup
 
  DEFER_METHOD_NAME = "execute_complete"
  XCOM_RUN_ID_KEY = "run_id"
@@ -894,79 +900,64 @@ class DatabricksRunNowDeferrableOperator(DatabricksRunNowOperator):
  super().__init__(deferrable=True, *args, **kwargs)
 
 
- class DatabricksNotebookOperator(BaseOperator):
+ class DatabricksTaskBaseOperator(BaseOperator, ABC):
  """
- Runs a notebook on Databricks using an Airflow operator.
-
- The DatabricksNotebookOperator allows users to launch and monitor notebook
- job runs on Databricks as Airflow tasks.
+ Base class for operators that are run as Databricks job tasks or tasks within a Databricks workflow.
 
- .. seealso::
- For more information on how to use this operator, take a look at the guide:
- :ref:`howto/operator:DatabricksNotebookOperator`
-
- :param notebook_path: The path to the notebook in Databricks.
- :param source: Optional location type of the notebook. When set to WORKSPACE, the notebook will be retrieved
- from the local Databricks workspace. When set to GIT, the notebook will be retrieved from a Git repository
- defined in git_source. If the value is empty, the task will use GIT if git_source is defined
- and WORKSPACE otherwise. For more information please visit
- https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsCreate
- :param notebook_params: A dict of key-value pairs to be passed as optional params to the notebook task.
- :param notebook_packages: A list of the Python libraries to be installed on the cluster running the
- notebook.
- :param new_cluster: Specs for a new cluster on which this task will be run.
+ :param caller: The name of the caller operator to be used in the logs.
+ :param databricks_conn_id: The name of the Airflow connection to use.
+ :param databricks_retry_args: An optional dictionary with arguments passed to ``tenacity.Retrying`` class.
+ :param databricks_retry_delay: Number of seconds to wait between retries.
+ :param databricks_retry_limit: Amount of times to retry if the Databricks backend is unreachable.
+ :param deferrable: Whether to run the operator in the deferrable mode.
  :param existing_cluster_id: ID for existing cluster on which to run this task.
  :param job_cluster_key: The key for the job cluster.
+ :param new_cluster: Specs for a new cluster on which this task will be run.
+ :param notebook_packages: A list of the Python libraries to be installed on the cluster running the
+ notebook.
+ :param notebook_params: A dict of key-value pairs to be passed as optional params to the notebook task.
  :param polling_period_seconds: Controls the rate which we poll for the result of this notebook job run.
- :param databricks_retry_limit: Amount of times to retry if the Databricks backend is unreachable.
- :param databricks_retry_delay: Number of seconds to wait between retries.
- :param databricks_retry_args: An optional dictionary with arguments passed to ``tenacity.Retrying`` class.
  :param wait_for_termination: if we should wait for termination of the job run. ``True`` by default.
- :param databricks_conn_id: The name of the Airflow connection to use.
- :param deferrable: Run operator in the deferrable mode.
+ :param workflow_run_metadata: Metadata for the workflow run. This is used when the operator is used within
+ a workflow. It is expected to be a dictionary containing the run_id and conn_id for the workflow.
  """
 
- template_fields = ("notebook_params",)
- CALLER = "DatabricksNotebookOperator"
-
  def __init__(
  self,
- notebook_path: str,
- source: str,
- notebook_params: dict | None = None,
- notebook_packages: list[dict[str, Any]] | None = None,
- new_cluster: dict[str, Any] | None = None,
+ caller: str = "DatabricksTaskBaseOperator",
+ databricks_conn_id: str = "databricks_default",
+ databricks_retry_args: dict[Any, Any] | None = None,
+ databricks_retry_delay: int = 1,
+ databricks_retry_limit: int = 3,
+ deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
  existing_cluster_id: str = "",
  job_cluster_key: str = "",
+ new_cluster: dict[str, Any] | None = None,
  polling_period_seconds: int = 5,
- databricks_retry_limit: int = 3,
- databricks_retry_delay: int = 1,
- databricks_retry_args: dict[Any, Any] | None = None,
  wait_for_termination: bool = True,
- databricks_conn_id: str = "databricks_default",
- deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+ workflow_run_metadata: dict[str, Any] | None = None,
  **kwargs: Any,
  ):
- self.notebook_path = notebook_path
- self.source = source
- self.notebook_params = notebook_params or {}
- self.notebook_packages = notebook_packages or []
- self.new_cluster = new_cluster or {}
+ self.caller = caller
+ self.databricks_conn_id = databricks_conn_id
+ self.databricks_retry_args = databricks_retry_args
+ self.databricks_retry_delay = databricks_retry_delay
+ self.databricks_retry_limit = databricks_retry_limit
+ self.deferrable = deferrable
  self.existing_cluster_id = existing_cluster_id
  self.job_cluster_key = job_cluster_key
+ self.new_cluster = new_cluster or {}
  self.polling_period_seconds = polling_period_seconds
- self.databricks_retry_limit = databricks_retry_limit
- self.databricks_retry_delay = databricks_retry_delay
- self.databricks_retry_args = databricks_retry_args
  self.wait_for_termination = wait_for_termination
- self.databricks_conn_id = databricks_conn_id
+ self.workflow_run_metadata = workflow_run_metadata
+
  self.databricks_run_id: int | None = None
- self.deferrable = deferrable
+
  super().__init__(**kwargs)
 
  @cached_property
  def _hook(self) -> DatabricksHook:
- return self._get_hook(caller=self.CALLER)
+ return self._get_hook(caller=self.caller)
 
  def _get_hook(self, caller: str) -> DatabricksHook:
  return DatabricksHook(
@@ -974,47 +965,37 @@ class DatabricksNotebookOperator(BaseOperator):
  retry_limit=self.databricks_retry_limit,
  retry_delay=self.databricks_retry_delay,
  retry_args=self.databricks_retry_args,
- caller=self.CALLER,
+ caller=caller,
  )
 
- def _get_task_timeout_seconds(self) -> int:
+ def _get_databricks_task_id(self, task_id: str) -> str:
+ """Get the databricks task ID using dag_id and task_id. Removes illegal characters."""
+ return f"{self.dag_id}__{task_id.replace('.', '__')}"
+
+ @property
+ def _databricks_workflow_task_group(self) -> DatabricksWorkflowTaskGroup | None:
  """
- Get the timeout seconds value for the Databricks job based on the execution timeout value provided for the Airflow task.
+ Traverse up parent TaskGroups until the `is_databricks` flag associated with the root DatabricksWorkflowTaskGroup is found.
 
- By default, tasks in Airflow have an execution_timeout set to None. In Airflow, when
- execution_timeout is not defined, the task continues to run indefinitely. Therefore,
- to mirror this behavior in the Databricks Jobs API, we set the timeout to 0, indicating
- that the job should run indefinitely. This aligns with the default behavior of Databricks jobs,
- where a timeout seconds value of 0 signifies an indefinite run duration.
- More details can be found in the Databricks documentation:
- See https://docs.databricks.com/api/workspace/jobs/submit#timeout_seconds
+ If found, returns the task group. Otherwise, return None.
  """
- if self.execution_timeout is None:
- return 0
- execution_timeout_seconds = int(self.execution_timeout.total_seconds())
- if execution_timeout_seconds == 0:
- raise ValueError(
- "If you've set an `execution_timeout` for the task, ensure it's not `0`. Set it instead to "
- "`None` if you desire the task to run indefinitely."
- )
- return execution_timeout_seconds
+ parent_tg: TaskGroup | DatabricksWorkflowTaskGroup | None = self.task_group
 
- def _get_task_base_json(self) -> dict[str, Any]:
- """Get task base json to be used for task submissions."""
- return {
- "timeout_seconds": self._get_task_timeout_seconds(),
- "email_notifications": {},
- "notebook_task": {
- "notebook_path": self.notebook_path,
- "source": self.source,
- "base_parameters": self.notebook_params,
- },
- "libraries": self.notebook_packages,
- }
+ while parent_tg:
+ if getattr(parent_tg, "is_databricks", False):
+ return parent_tg  # type: ignore[return-value]
 
- def _get_databricks_task_id(self, task_id: str) -> str:
- """Get the databricks task ID using dag_id and task_id. Removes illegal characters."""
- return f"{self.dag_id}__{task_id.replace('.', '__')}"
+ if getattr(parent_tg, "task_group", None):
+ parent_tg = parent_tg.task_group
+ else:
+ return None
+
+ return None
+
+ @abstractmethod
+ def _get_task_base_json(self) -> dict[str, Any]:
+ """Get the base json for the task."""
+ raise NotImplementedError()
 
  def _get_run_json(self) -> dict[str, Any]:
  """Get run json to be used for task submissions."""
@@ -1032,65 +1013,335 @@ class DatabricksNotebookOperator(BaseOperator):
  raise ValueError("Must specify either existing_cluster_id or new_cluster.")
  return run_json
 
- def launch_notebook_job(self) -> int:
+ def _launch_job(self) -> int:
+ """Launch the job on Databricks."""
  run_json = self._get_run_json()
  self.databricks_run_id = self._hook.submit_run(run_json)
  url = self._hook.get_run_page_url(self.databricks_run_id)
  self.log.info("Check the job run in Databricks: %s", url)
  return self.databricks_run_id
 
+ def _handle_terminal_run_state(self, run_state: RunState) -> None:
+ """Handle the terminal state of the run."""
+ if run_state.life_cycle_state != RunLifeCycleState.TERMINATED.value:
+ raise AirflowException(
+ f"Databricks job failed with state {run_state.life_cycle_state}. Message: {run_state.state_message}"
+ )
+ if not run_state.is_successful:
+ raise AirflowException(
+ f"Task failed. Final state {run_state.result_state}. Reason: {run_state.state_message}"
+ )
+ self.log.info("Task succeeded. Final state %s.", run_state.result_state)
+
+ def _get_current_databricks_task(self) -> dict[str, Any]:
+ """Retrieve the Databricks task corresponding to the current Airflow task."""
+ if self.databricks_run_id is None:
+ raise ValueError("Databricks job not yet launched. Please run launch_notebook_job first.")
+ return {task["task_key"]: task for task in self._hook.get_run(self.databricks_run_id)["tasks"]}[
+ self._get_databricks_task_id(self.task_id)
+ ]
+
+ def _convert_to_databricks_workflow_task(
+ self, relevant_upstreams: list[BaseOperator], context: Context | None = None
+ ) -> dict[str, object]:
+ """Convert the operator to a Databricks workflow task that can be a task in a workflow."""
+ base_task_json = self._get_task_base_json()
+ result = {
+ "task_key": self._get_databricks_task_id(self.task_id),
+ "depends_on": [
+ {"task_key": self._get_databricks_task_id(task_id)}
+ for task_id in self.upstream_task_ids
+ if task_id in relevant_upstreams
+ ],
+ **base_task_json,
+ }
+
+ if self.existing_cluster_id and self.job_cluster_key:
+ raise ValueError(
+ "Both existing_cluster_id and job_cluster_key are set. Only one can be set per task."
+ )
+ if self.existing_cluster_id:
+ result["existing_cluster_id"] = self.existing_cluster_id
+ elif self.job_cluster_key:
+ result["job_cluster_key"] = self.job_cluster_key
+
+ return result
+
  def monitor_databricks_job(self) -> None:
+ """
+ Monitor the Databricks job.
+
+ Wait for the job to terminate. If deferrable, defer the task.
+ """
  if self.databricks_run_id is None:
  raise ValueError("Databricks job not yet launched. Please run launch_notebook_job first.")
- run = self._hook.get_run(self.databricks_run_id)
+ current_task_run_id = self._get_current_databricks_task()["run_id"]
+ run = self._hook.get_run(current_task_run_id)
+ run_page_url = run["run_page_url"]
+ self.log.info("Check the task run in Databricks: %s", run_page_url)
  run_state = RunState(**run["state"])
- self.log.info("Current state of the job: %s", run_state.life_cycle_state)
+ self.log.info(
+ "Current state of the the databricks task %s is %s",
+ self._get_databricks_task_id(self.task_id),
+ run_state.life_cycle_state,
+ )
  if self.deferrable and not run_state.is_terminal:
  self.defer(
  trigger=DatabricksExecutionTrigger(
- run_id=self.databricks_run_id,
+ run_id=current_task_run_id,
  databricks_conn_id=self.databricks_conn_id,
  polling_period_seconds=self.polling_period_seconds,
  retry_limit=self.databricks_retry_limit,
  retry_delay=self.databricks_retry_delay,
  retry_args=self.databricks_retry_args,
- caller=self.CALLER,
+ caller=self.caller,
  ),
  method_name=DEFER_METHOD_NAME,
  )
  while not run_state.is_terminal:
  time.sleep(self.polling_period_seconds)
- run = self._hook.get_run(self.databricks_run_id)
+ run = self._hook.get_run(current_task_run_id)
  run_state = RunState(**run["state"])
  self.log.info(
- "task %s %s", self._get_databricks_task_id(self.task_id), run_state.life_cycle_state
- )
- self.log.info("Current state of the job: %s", run_state.life_cycle_state)
- if run_state.life_cycle_state != "TERMINATED":
- raise AirflowException(
- f"Databricks job failed with state {run_state.life_cycle_state}. "
- f"Message: {run_state.state_message}"
+ "Current state of the databricks task %s is %s",
+ self._get_databricks_task_id(self.task_id),
+ run_state.life_cycle_state,
  )
- if not run_state.is_successful:
- raise AirflowException(
- f"Task failed. Final state {run_state.result_state}. Reason: {run_state.state_message}"
- )
- self.log.info("Task succeeded. Final state %s.", run_state.result_state)
+ self._handle_terminal_run_state(run_state)
 
  def execute(self, context: Context) -> None:
- self.launch_notebook_job()
+ """Execute the operator. Launch the job and monitor it if wait_for_termination is set to True."""
+ if self._databricks_workflow_task_group:
+ # If we are in a DatabricksWorkflowTaskGroup, we should have an upstream task launched.
+ if not self.workflow_run_metadata:
+ launch_task_id = next(task for task in self.upstream_task_ids if task.endswith(".launch"))
+ self.workflow_run_metadata = context["ti"].xcom_pull(task_ids=launch_task_id)
+ workflow_run_metadata = WorkflowRunMetadata(  # type: ignore[arg-type]
+ **self.workflow_run_metadata
+ )
+ self.databricks_run_id = workflow_run_metadata.run_id
+ self.databricks_conn_id = workflow_run_metadata.conn_id
+ else:
+ self._launch_job()
  if self.wait_for_termination:
  self.monitor_databricks_job()
 
  def execute_complete(self, context: dict | None, event: dict) -> None:
  run_state = RunState.from_json(event["run_state"])
- if run_state.life_cycle_state != "TERMINATED":
- raise AirflowException(
- f"Databricks job failed with state {run_state.life_cycle_state}. "
- f"Message: {run_state.state_message}"
+ self._handle_terminal_run_state(run_state)
+
+
+ class DatabricksNotebookOperator(DatabricksTaskBaseOperator):
+ """
+ Runs a notebook on Databricks using an Airflow operator.
+
+ The DatabricksNotebookOperator allows users to launch and monitor notebook job runs on Databricks as
+ Airflow tasks. It can be used as a part of a DatabricksWorkflowTaskGroup to take advantage of job
+ clusters, which allows users to run their tasks on cheaper clusters that can be shared between tasks.
+
+ .. seealso::
+ For more information on how to use this operator, take a look at the guide:
+ :ref:`howto/operator:DatabricksNotebookOperator`
+
+ :param notebook_path: The path to the notebook in Databricks.
+ :param source: Optional location type of the notebook. When set to WORKSPACE, the notebook will be retrieved
+ from the local Databricks workspace. When set to GIT, the notebook will be retrieved from a Git repository
+ defined in git_source. If the value is empty, the task will use GIT if git_source is defined
+ and WORKSPACE otherwise. For more information please visit
+ https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsCreate
+ :param databricks_conn_id: The name of the Airflow connection to use.
+ :param databricks_retry_args: An optional dictionary with arguments passed to ``tenacity.Retrying`` class.
+ :param databricks_retry_delay: Number of seconds to wait between retries.
+ :param databricks_retry_limit: Amount of times to retry if the Databricks backend is unreachable.
+ :param deferrable: Whether to run the operator in the deferrable mode.
+ :param existing_cluster_id: ID for existing cluster on which to run this task.
+ :param job_cluster_key: The key for the job cluster.
+ :param new_cluster: Specs for a new cluster on which this task will be run.
+ :param notebook_packages: A list of the Python libraries to be installed on the cluster running the
+ notebook.
+ :param notebook_params: A dict of key-value pairs to be passed as optional params to the notebook task.
+ :param polling_period_seconds: Controls the rate which we poll for the result of this notebook job run.
+ :param wait_for_termination: if we should wait for termination of the job run. ``True`` by default.
+ :param workflow_run_metadata: Metadata for the workflow run. This is used when the operator is used within
+ a workflow. It is expected to be a dictionary containing the run_id and conn_id for the workflow.
+ """
+
+ template_fields = (
+ "notebook_params",
+ "workflow_run_metadata",
+ )
+ CALLER = "DatabricksNotebookOperator"
+
+ def __init__(
+ self,
+ notebook_path: str,
+ source: str,
+ databricks_conn_id: str = "databricks_default",
+ databricks_retry_args: dict[Any, Any] | None = None,
+ databricks_retry_delay: int = 1,
+ databricks_retry_limit: int = 3,
+ deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+ existing_cluster_id: str = "",
+ job_cluster_key: str = "",
+ new_cluster: dict[str, Any] | None = None,
+ notebook_packages: list[dict[str, Any]] | None = None,
+ notebook_params: dict | None = None,
+ polling_period_seconds: int = 5,
+ wait_for_termination: bool = True,
+ workflow_run_metadata: dict | None = None,
+ **kwargs: Any,
+ ):
+ self.notebook_path = notebook_path
+ self.source = source
+ self.notebook_packages = notebook_packages or []
+ self.notebook_params = notebook_params or {}
+
+ super().__init__(
+ caller=self.CALLER,
+ databricks_conn_id=databricks_conn_id,
+ databricks_retry_args=databricks_retry_args,
+ databricks_retry_delay=databricks_retry_delay,
+ databricks_retry_limit=databricks_retry_limit,
+ deferrable=deferrable,
+ existing_cluster_id=existing_cluster_id,
+ job_cluster_key=job_cluster_key,
+ new_cluster=new_cluster,
+ polling_period_seconds=polling_period_seconds,
+ wait_for_termination=wait_for_termination,
+ workflow_run_metadata=workflow_run_metadata,
+ **kwargs,
+ )
+
+ def _get_task_timeout_seconds(self) -> int:
+ """
+ Get the timeout seconds value for the Databricks job based on the execution timeout value provided for the Airflow task.
+
+ By default, tasks in Airflow have an execution_timeout set to None. In Airflow, when
+ execution_timeout is not defined, the task continues to run indefinitely. Therefore,
+ to mirror this behavior in the Databricks Jobs API, we set the timeout to 0, indicating
+ that the job should run indefinitely. This aligns with the default behavior of Databricks jobs,
+ where a timeout seconds value of 0 signifies an indefinite run duration.
+ More details can be found in the Databricks documentation:
+ See https://docs.databricks.com/api/workspace/jobs/submit#timeout_seconds
+ """
+ if self.execution_timeout is None:
+ return 0
+ execution_timeout_seconds = int(self.execution_timeout.total_seconds())
+ if execution_timeout_seconds == 0:
+ raise ValueError(
+ "If you've set an `execution_timeout` for the task, ensure it's not `0`. Set it instead to "
+ "`None` if you desire the task to run indefinitely."
  )
- if not run_state.is_successful:
+ return execution_timeout_seconds
+
+ def _get_task_base_json(self) -> dict[str, Any]:
+ """Get task base json to be used for task submissions."""
+ return {
+ "timeout_seconds": self._get_task_timeout_seconds(),
+ "email_notifications": {},
+ "notebook_task": {
+ "notebook_path": self.notebook_path,
+ "source": self.source,
+ "base_parameters": self.notebook_params,
+ },
+ "libraries": self.notebook_packages,
+ }
+
+ def _extend_workflow_notebook_packages(
+ self, databricks_workflow_task_group: DatabricksWorkflowTaskGroup
+ ) -> None:
+ """Extend the task group packages into the notebook's packages, without adding any duplicates."""
+ for task_group_package in databricks_workflow_task_group.notebook_packages:
+ exists = any(
+ task_group_package == existing_package for existing_package in self.notebook_packages
+ )
+ if not exists:
+ self.notebook_packages.append(task_group_package)
+
+ def _convert_to_databricks_workflow_task(
+ self, relevant_upstreams: list[BaseOperator], context: Context | None = None
+ ) -> dict[str, object]:
+ """Convert the operator to a Databricks workflow task that can be a task in a workflow."""
+ databricks_workflow_task_group = self._databricks_workflow_task_group
+ if not databricks_workflow_task_group:
  raise AirflowException(
- f"Task failed. Final state {run_state.result_state}. Reason: {run_state.state_message}"
+ "Calling `_convert_to_databricks_workflow_task` without a parent TaskGroup."
  )
- self.log.info("Task succeeded. Final state %s.", run_state.result_state)
+
+ if hasattr(databricks_workflow_task_group, "notebook_packages"):
+ self._extend_workflow_notebook_packages(databricks_workflow_task_group)
+
+ if hasattr(databricks_workflow_task_group, "notebook_params"):
+ self.notebook_params = {
+ **self.notebook_params,
+ **databricks_workflow_task_group.notebook_params,
+ }
+
+ return super()._convert_to_databricks_workflow_task(relevant_upstreams, context=context)
+
+
+ class DatabricksTaskOperator(DatabricksTaskBaseOperator):
+ """
+ Runs a task on Databricks using an Airflow operator.
+
+ The DatabricksTaskOperator allows users to launch and monitor task job runs on Databricks as Airflow
+ tasks. It can be used as a part of a DatabricksWorkflowTaskGroup to take advantage of job clusters, which
+ allows users to run their tasks on cheaper clusters that can be shared between tasks.
+
+ .. seealso::
+ For more information on how to use this operator, take a look at the guide:
+ :ref:`howto/operator:DatabricksTaskOperator`
+
+ :param task_config: The configuration of the task to be run on Databricks.
+ :param databricks_conn_id: The name of the Airflow connection to use.
+ :param databricks_retry_args: An optional dictionary with arguments passed to ``tenacity.Retrying`` class.
+ :param databricks_retry_delay: Number of seconds to wait between retries.
+ :param databricks_retry_limit: Amount of times to retry if the Databricks backend is unreachable.
+ :param deferrable: Whether to run the operator in the deferrable mode.
+ :param existing_cluster_id: ID for existing cluster on which to run this task.
+ :param job_cluster_key: The key for the job cluster.
+ :param new_cluster: Specs for a new cluster on which this task will be run.
+ :param polling_period_seconds: Controls the rate which we poll for the result of this notebook job run.
+ :param wait_for_termination: if we should wait for termination of the job run. ``True`` by default.
+ """
+
+ CALLER = "DatabricksTaskOperator"
+ template_fields = ("workflow_run_metadata",)
+
+ def __init__(
+ self,
+ task_config: dict,
+ databricks_conn_id: str = "databricks_default",
+ databricks_retry_args: dict[Any, Any] | None = None,
+ databricks_retry_delay: int = 1,
+ databricks_retry_limit: int = 3,
+ deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+ existing_cluster_id: str = "",
+ job_cluster_key: str = "",
+ new_cluster: dict[str, Any] | None = None,
+ polling_period_seconds: int = 5,
+ wait_for_termination: bool = True,
+ workflow_run_metadata: dict | None = None,
+ **kwargs,
+ ):
+ self.task_config = task_config
+
+ super().__init__(
+ caller=self.CALLER,
+ databricks_conn_id=databricks_conn_id,
+ databricks_retry_args=databricks_retry_args,
+ databricks_retry_delay=databricks_retry_delay,
+ databricks_retry_limit=databricks_retry_limit,
+ deferrable=deferrable,
+ existing_cluster_id=existing_cluster_id,
+ job_cluster_key=job_cluster_key,
+ new_cluster=new_cluster,
+ polling_period_seconds=polling_period_seconds,
+ wait_for_termination=wait_for_termination,
+ workflow_run_metadata=workflow_run_metadata,
+ **kwargs,
+ )
+
+ def _get_task_base_json(self) -> dict[str, Any]:
+ """Get task base json to be used for task submissions."""
+ return self.task_config
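
Editor's note: the diff above also introduces DatabricksTaskOperator, which wraps an arbitrary Databricks Jobs API task definition. A minimal standalone sketch (connection id, cluster id, and notebook path are placeholders, not taken from the diff):

    from airflow.providers.databricks.operators.databricks import DatabricksTaskOperator

    # task_config mirrors one task entry of the Databricks Jobs 2.1 API; when used outside a
    # DatabricksWorkflowTaskGroup, an existing_cluster_id or new_cluster spec is required.
    example_task = DatabricksTaskOperator(
        task_id="example_notebook_task",
        databricks_conn_id="databricks_default",
        existing_cluster_id="1234-567890-abcde123",  # placeholder cluster id
        task_config={
            "notebook_task": {
                "notebook_path": "/Shared/example_notebook",
                "source": "WORKSPACE",
            },
        },
    )
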
apache_airflow_providers_databricks-6.6.0/airflow/providers/databricks/operators/databricks_workflow.py (new file)
@@ -0,0 +1,312 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+
+ from __future__ import annotations
+
+ import json
+ import time
+ from dataclasses import dataclass
+ from functools import cached_property
+ from typing import TYPE_CHECKING, Any
+
+ from mergedeep import merge
+
+ from airflow.exceptions import AirflowException
+ from airflow.models import BaseOperator
+ from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState
+ from airflow.utils.task_group import TaskGroup
+
+ if TYPE_CHECKING:
+ from types import TracebackType
+
+ from airflow.models.taskmixin import DAGNode
+ from airflow.utils.context import Context
+
+
+ @dataclass
+ class WorkflowRunMetadata:
+ """
+ Metadata for a Databricks workflow run.
+
+ :param run_id: The ID of the Databricks workflow run.
+ :param job_id: The ID of the Databricks workflow job.
+ :param conn_id: The connection ID used to connect to Databricks.
+ """
+
+ conn_id: str
+ job_id: str
+ run_id: int
+
+
+ def _flatten_node(
+ node: TaskGroup | BaseOperator | DAGNode, tasks: list[BaseOperator] | None = None
+ ) -> list[BaseOperator]:
+ """Flatten a node (either a TaskGroup or Operator) to a list of nodes."""
+ if tasks is None:
+ tasks = []
+ if isinstance(node, BaseOperator):
+ return [node]
+
+ if isinstance(node, TaskGroup):
+ new_tasks = []
+ for _, child in node.children.items():
+ new_tasks += _flatten_node(child, tasks)
+
+ return tasks + new_tasks
+
+ return tasks
+
+
+ class _CreateDatabricksWorkflowOperator(BaseOperator):
+ """
+ Creates a Databricks workflow from a DatabricksWorkflowTaskGroup specified in a DAG.
+
+ :param task_id: The task_id of the operator
+ :param databricks_conn_id: The connection ID to use when connecting to Databricks.
+ :param existing_clusters: A list of existing clusters to use for the workflow.
+ :param extra_job_params: A dictionary of extra properties which will override the default Databricks
+ Workflow Job definitions.
+ :param job_clusters: A list of job clusters to use for the workflow.
+ :param max_concurrent_runs: The maximum number of concurrent runs for the workflow.
+ :param notebook_params: A dictionary of notebook parameters to pass to the workflow. These parameters
+ will be passed to all notebooks in the workflow.
+ :param tasks_to_convert: A list of tasks to convert to a Databricks workflow. This list can also be
+ populated after instantiation using the `add_task` method.
+ """
+
+ template_fields = ("notebook_params",)
+ caller = "_CreateDatabricksWorkflowOperator"
+
+ def __init__(
+ self,
+ task_id: str,
+ databricks_conn_id: str,
+ existing_clusters: list[str] | None = None,
+ extra_job_params: dict[str, Any] | None = None,
+ job_clusters: list[dict[str, object]] | None = None,
+ max_concurrent_runs: int = 1,
+ notebook_params: dict | None = None,
+ tasks_to_convert: list[BaseOperator] | None = None,
+ **kwargs,
+ ):
+ self.databricks_conn_id = databricks_conn_id
+ self.existing_clusters = existing_clusters or []
+ self.extra_job_params = extra_job_params or {}
+ self.job_clusters = job_clusters or []
+ self.max_concurrent_runs = max_concurrent_runs
+ self.notebook_params = notebook_params or {}
+ self.tasks_to_convert = tasks_to_convert or []
+ self.relevant_upstreams = [task_id]
+ super().__init__(task_id=task_id, **kwargs)
+
+ def _get_hook(self, caller: str) -> DatabricksHook:
+ return DatabricksHook(
+ self.databricks_conn_id,
+ caller=caller,
+ )
+
+ @cached_property
+ def _hook(self) -> DatabricksHook:
+ return self._get_hook(caller=self.caller)
+
+ def add_task(self, task: BaseOperator) -> None:
+ """Add a task to the list of tasks to convert to a Databricks workflow."""
+ self.tasks_to_convert.append(task)
+
+ @property
+ def job_name(self) -> str:
+ if not self.task_group:
+ raise AirflowException("Task group must be set before accessing job_name")
+ return f"{self.dag_id}.{self.task_group.group_id}"
+
+ def create_workflow_json(self, context: Context | None = None) -> dict[str, object]:
+ """Create a workflow json to be used in the Databricks API."""
+ task_json = [
+ task._convert_to_databricks_workflow_task(  # type: ignore[attr-defined]
+ relevant_upstreams=self.relevant_upstreams, context=context
+ )
+ for task in self.tasks_to_convert
+ ]
+
+ default_json = {
+ "name": self.job_name,
+ "email_notifications": {"no_alert_for_skipped_runs": False},
+ "timeout_seconds": 0,
+ "tasks": task_json,
+ "format": "MULTI_TASK",
+ "job_clusters": self.job_clusters,
+ "max_concurrent_runs": self.max_concurrent_runs,
+ }
+ return merge(default_json, self.extra_job_params)
+
+ def _create_or_reset_job(self, context: Context) -> int:
+ job_spec = self.create_workflow_json(context=context)
+ existing_jobs = self._hook.list_jobs(job_name=self.job_name)
+ job_id = existing_jobs[0]["job_id"] if existing_jobs else None
+ if job_id:
+ self.log.info(
+ "Updating existing Databricks workflow job %s with spec %s",
+ self.job_name,
+ json.dumps(job_spec, indent=2),
+ )
+ self._hook.reset_job(job_id, job_spec)
+ else:
+ self.log.info(
+ "Creating new Databricks workflow job %s with spec %s",
+ self.job_name,
+ json.dumps(job_spec, indent=2),
+ )
+ job_id = self._hook.create_job(job_spec)
+ return job_id
+
+ def _wait_for_job_to_start(self, run_id: int) -> None:
+ run_url = self._hook.get_run_page_url(run_id)
+ self.log.info("Check the progress of the Databricks job at %s", run_url)
+ life_cycle_state = self._hook.get_run_state(run_id).life_cycle_state
+ if life_cycle_state not in (
+ RunLifeCycleState.PENDING.value,
+ RunLifeCycleState.RUNNING.value,
+ RunLifeCycleState.BLOCKED.value,
+ ):
+ raise AirflowException(f"Could not start the workflow job. State: {life_cycle_state}")
+ while life_cycle_state in (RunLifeCycleState.PENDING.value, RunLifeCycleState.BLOCKED.value):
+ self.log.info("Waiting for the Databricks job to start running")
+ time.sleep(5)
+ life_cycle_state = self._hook.get_run_state(run_id).life_cycle_state
+ self.log.info("Databricks job started. State: %s", life_cycle_state)
+
+ def execute(self, context: Context) -> Any:
+ if not isinstance(self.task_group, DatabricksWorkflowTaskGroup):
+ raise AirflowException("Task group must be a DatabricksWorkflowTaskGroup")
+
+ job_id = self._create_or_reset_job(context)
+
+ run_id = self._hook.run_now(
+ {
+ "job_id": job_id,
+ "jar_params": self.task_group.jar_params,
+ "notebook_params": self.notebook_params,
+ "python_params": self.task_group.python_params,
+ "spark_submit_params": self.task_group.spark_submit_params,
+ }
+ )
+
+ self._wait_for_job_to_start(run_id)
+
+ return {
+ "conn_id": self.databricks_conn_id,
+ "job_id": job_id,
+ "run_id": run_id,
+ }
+
+
+ class DatabricksWorkflowTaskGroup(TaskGroup):
+ """
+ A task group that takes a list of tasks and creates a databricks workflow.
+
+ The DatabricksWorkflowTaskGroup takes a list of tasks and creates a databricks workflow
+ based on the metadata produced by those tasks. For a task to be eligible for this
+ TaskGroup, it must contain the ``_convert_to_databricks_workflow_task`` method. If any tasks
+ do not contain this method then the Taskgroup will raise an error at parse time.
+
+ .. seealso::
+ For more information on how to use this operator, take a look at the guide:
+ :ref:`howto/operator:DatabricksWorkflowTaskGroup`
+
+ :param databricks_conn_id: The name of the databricks connection to use.
+ :param existing_clusters: A list of existing clusters to use for this workflow.
+ :param extra_job_params: A dictionary containing properties which will override the default
+ Databricks Workflow Job definitions.
+ :param jar_params: A list of jar parameters to pass to the workflow. These parameters will be passed to all jar
+ tasks in the workflow.
+ :param job_clusters: A list of job clusters to use for this workflow.
+ :param max_concurrent_runs: The maximum number of concurrent runs for this workflow.
+ :param notebook_packages: A list of dictionary of Python packages to be installed. Packages defined
+ at the workflow task group level are installed for each of the notebook tasks under it. And
+ packages defined at the notebook task level are installed specific for the notebook task.
+ :param notebook_params: A dictionary of notebook parameters to pass to the workflow. These parameters
+ will be passed to all notebook tasks in the workflow.
+ :param python_params: A list of python parameters to pass to the workflow. These parameters will be passed to
+ all python tasks in the workflow.
+ :param spark_submit_params: A list of spark submit parameters to pass to the workflow. These parameters
+ will be passed to all spark submit tasks.
+ """
+
+ is_databricks = True
+
+ def __init__(
+ self,
+ databricks_conn_id: str,
+ existing_clusters: list[str] | None = None,
+ extra_job_params: dict[str, Any] | None = None,
+ jar_params: list[str] | None = None,
+ job_clusters: list[dict] | None = None,
+ max_concurrent_runs: int = 1,
+ notebook_packages: list[dict[str, Any]] | None = None,
+ notebook_params: dict | None = None,
+ python_params: list | None = None,
+ spark_submit_params: list | None = None,
+ **kwargs,
+ ):
+ self.databricks_conn_id = databricks_conn_id
+ self.existing_clusters = existing_clusters or []
+ self.extra_job_params = extra_job_params or {}
+ self.jar_params = jar_params or []
+ self.job_clusters = job_clusters or []
+ self.max_concurrent_runs = max_concurrent_runs
+ self.notebook_packages = notebook_packages or []
+ self.notebook_params = notebook_params or {}
+ self.python_params = python_params or []
+ self.spark_submit_params = spark_submit_params or []
+ super().__init__(**kwargs)
+
+ def __exit__(
+ self, _type: type[BaseException] | None, _value: BaseException | None, _tb: TracebackType | None
+ ) -> None:
+ """Exit the context manager and add tasks to a single ``_CreateDatabricksWorkflowOperator``."""
+ roots = list(self.get_roots())
+ tasks = _flatten_node(self)
+
+ create_databricks_workflow_task = _CreateDatabricksWorkflowOperator(
+ dag=self.dag,
+ task_group=self,
+ task_id="launch",
+ databricks_conn_id=self.databricks_conn_id,
+ existing_clusters=self.existing_clusters,
+ extra_job_params=self.extra_job_params,
+ job_clusters=self.job_clusters,
+ max_concurrent_runs=self.max_concurrent_runs,
+ notebook_params=self.notebook_params,
+ )
+
+ for task in tasks:
+ if not (
+ hasattr(task, "_convert_to_databricks_workflow_task")
+ and callable(task._convert_to_databricks_workflow_task)
+ ):
+ raise AirflowException(
+ f"Task {task.task_id} does not support conversion to databricks workflow task."
+ )
+
+ task.workflow_run_metadata = create_databricks_workflow_task.output
+ create_databricks_workflow_task.relevant_upstreams.append(task.task_id)
+ create_databricks_workflow_task.add_task(task)
+
+ for root_task in roots:
+ root_task.set_upstream(create_databricks_workflow_task)
+
+ super().__exit__(_type, _value, _tb)
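
Editor's note: taken together with the operator changes above, this new module lets Airflow tasks share Databricks job clusters. A rough sketch of how the task group might be used (connection id, cluster spec, and notebook paths are placeholders; the provider's workflow.rst how-to guide is the authoritative example):

    from __future__ import annotations

    from datetime import datetime

    from airflow.models.dag import DAG
    from airflow.providers.databricks.operators.databricks import DatabricksNotebookOperator
    from airflow.providers.databricks.operators.databricks_workflow import DatabricksWorkflowTaskGroup

    # Placeholder job cluster spec; any valid Databricks cluster spec works here.
    job_cluster_spec = [
        {
            "job_cluster_key": "shared_cluster",
            "new_cluster": {
                "spark_version": "14.3.x-scala2.12",
                "node_type_id": "i3.xlarge",
                "num_workers": 2,
            },
        }
    ]

    with DAG("example_databricks_workflow", start_date=datetime(2024, 1, 1), schedule=None) as dag:
        with DatabricksWorkflowTaskGroup(
            group_id="databricks_workflow",
            databricks_conn_id="databricks_default",
            job_clusters=job_cluster_spec,
            notebook_params={"environment": "dev"},
        ) as workflow:
            notebook_1 = DatabricksNotebookOperator(
                task_id="notebook_1",
                notebook_path="/Shared/notebook_1",
                source="WORKSPACE",
                job_cluster_key="shared_cluster",
            )
            notebook_2 = DatabricksNotebookOperator(
                task_id="notebook_2",
                notebook_path="/Shared/notebook_2",
                source="WORKSPACE",
                job_cluster_key="shared_cluster",
            )
            notebook_1 >> notebook_2

On __exit__ the task group injects a "launch" task that creates or resets the Databricks job and triggers it; the notebook tasks then only monitor their own task runs inside that job, as shown in the operators/databricks.py changes above.
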
{apache_airflow_providers_databricks-6.5.0 → apache_airflow_providers_databricks-6.6.0}/pyproject.toml
@@ -28,7 +28,7 @@ build-backend = "flit_core.buildapi"
 
  [project]
  name = "apache-airflow-providers-databricks"
- version = "6.5.0"
+ version = "6.6.0"
  description = "Provider package apache-airflow-providers-databricks for Apache Airflow"
  readme = "README.rst"
  authors = [
@@ -60,12 +60,16 @@ dependencies = [
  "apache-airflow-providers-common-sql>=1.10.0",
  "apache-airflow>=2.7.0",
  "databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0",
+ "mergedeep>=1.3.4",
+ "pandas>=1.5.3,<2.2;python_version<\"3.9\"",
+ "pandas>=2.1.2,<2.2;python_version>=\"3.9\"",
+ "pyarrow>=14.0.1",
  "requests>=2.27.0,<3",
  ]
 
  [project.urls]
- "Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0"
- "Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.5.0/changelog.html"
+ "Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0"
+ "Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.6.0/changelog.html"
  "Bug Tracker" = "https://github.com/apache/airflow/issues"
  "Source Code" = "https://github.com/apache/airflow"
  "Slack Chat" = "https://s.apache.org/airflow-slack"