apache-airflow-providers-databricks 7.5.0__py3-none-any.whl → 7.6.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- airflow/providers/databricks/__init__.py +1 -1
- airflow/providers/databricks/hooks/databricks.py +35 -31
- airflow/providers/databricks/hooks/databricks_base.py +16 -9
- airflow/providers/databricks/hooks/databricks_sql.py +13 -14
- airflow/providers/databricks/operators/databricks.py +21 -6
- airflow/providers/databricks/operators/databricks_repos.py +1 -1
- airflow/providers/databricks/operators/databricks_sql.py +1 -1
- airflow/providers/databricks/operators/databricks_workflow.py +21 -2
- airflow/providers/databricks/plugins/databricks_workflow.py +118 -57
- airflow/providers/databricks/sensors/databricks.py +2 -2
- airflow/providers/databricks/sensors/databricks_partition.py +8 -3
- airflow/providers/databricks/sensors/databricks_sql.py +8 -3
- airflow/providers/databricks/utils/openlineage.py +85 -58
- airflow/providers/databricks/version_compat.py +10 -0
- {apache_airflow_providers_databricks-7.5.0.dist-info → apache_airflow_providers_databricks-7.6.0.dist-info}/METADATA +17 -16
- {apache_airflow_providers_databricks-7.5.0.dist-info → apache_airflow_providers_databricks-7.6.0.dist-info}/RECORD +18 -18
- {apache_airflow_providers_databricks-7.5.0.dist-info → apache_airflow_providers_databricks-7.6.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_databricks-7.5.0.dist-info → apache_airflow_providers_databricks-7.6.0.dist-info}/entry_points.txt +0 -0
airflow/providers/databricks/__init__.py

@@ -29,7 +29,7 @@ from airflow import __version__ as airflow_version

 __all__ = ["__version__"]

-__version__ = "7.5.0"
+__version__ = "7.6.0"

 if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
     "2.10.0"
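The hunk above sits next to the provider's minimum-Airflow guard. A minimal sketch of the same comparison, with an illustrative version string (the `packaging` calls are the ones used in the file; `base_version` strips local suffixes before comparing):

    import packaging.version

    airflow_version = "2.10.5+custom.1"  # illustrative version string
    base = packaging.version.parse(packaging.version.parse(airflow_version).base_version)
    print(base >= packaging.version.parse("2.10.0"))  # True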
airflow/providers/databricks/hooks/databricks.py

@@ -37,33 +37,36 @@ from requests import exceptions as requests_exceptions
 from airflow.exceptions import AirflowException
 from airflow.providers.databricks.hooks.databricks_base import BaseDatabricksHook

-GET_CLUSTER_ENDPOINT = ("GET", "api/2.0/clusters/get")
-RESTART_CLUSTER_ENDPOINT = ("POST", "api/2.0/clusters/restart")
-START_CLUSTER_ENDPOINT = ("POST", "api/2.0/clusters/start")
-TERMINATE_CLUSTER_ENDPOINT = ("POST", "api/2.0/clusters/delete")
-
-CREATE_ENDPOINT = ("POST", "api/2.1/jobs/create")
-RESET_ENDPOINT = ("POST", "api/2.1/jobs/reset")
-UPDATE_ENDPOINT = ("POST", "api/2.1/jobs/update")
-RUN_NOW_ENDPOINT = ("POST", "api/2.1/jobs/run-now")
-SUBMIT_RUN_ENDPOINT = ("POST", "api/2.1/jobs/runs/submit")
-GET_RUN_ENDPOINT = ("GET", "api/2.1/jobs/runs/get")
-CANCEL_RUN_ENDPOINT = ("POST", "api/2.1/jobs/runs/cancel")
-DELETE_RUN_ENDPOINT = ("POST", "api/2.1/jobs/runs/delete")
-REPAIR_RUN_ENDPOINT = ("POST", "api/2.1/jobs/runs/repair")
-OUTPUT_RUNS_JOB_ENDPOINT = ("GET", "api/2.1/jobs/runs/get-output")
-CANCEL_ALL_RUNS_ENDPOINT = ("POST", "api/2.1/jobs/runs/cancel-all")
-
-INSTALL_LIBS_ENDPOINT = ("POST", "api/2.0/libraries/install")
-UNINSTALL_LIBS_ENDPOINT = ("POST", "api/2.0/libraries/uninstall")
-
-LIST_JOBS_ENDPOINT = ("GET", "api/2.1/jobs/list")
-LIST_PIPELINES_ENDPOINT = ("GET", "api/2.0/pipelines")
-
-WORKSPACE_GET_STATUS_ENDPOINT = ("GET", "api/2.0/workspace/get-status")
-
-SPARK_VERSIONS_ENDPOINT = ("GET", "api/2.0/clusters/spark-versions")
-SQL_STATEMENTS_ENDPOINT = "api/2.0/sql/statements"
+GET_CLUSTER_ENDPOINT = ("GET", "2.0/clusters/get")
+RESTART_CLUSTER_ENDPOINT = ("POST", "2.0/clusters/restart")
+START_CLUSTER_ENDPOINT = ("POST", "2.0/clusters/start")
+TERMINATE_CLUSTER_ENDPOINT = ("POST", "2.0/clusters/delete")
+
+CREATE_ENDPOINT = ("POST", "2.1/jobs/create")
+RESET_ENDPOINT = ("POST", "2.1/jobs/reset")
+UPDATE_ENDPOINT = ("POST", "2.1/jobs/update")
+RUN_NOW_ENDPOINT = ("POST", "2.1/jobs/run-now")
+SUBMIT_RUN_ENDPOINT = ("POST", "2.1/jobs/runs/submit")
+GET_RUN_ENDPOINT = ("GET", "2.1/jobs/runs/get")
+CANCEL_RUN_ENDPOINT = ("POST", "2.1/jobs/runs/cancel")
+DELETE_RUN_ENDPOINT = ("POST", "2.1/jobs/runs/delete")
+REPAIR_RUN_ENDPOINT = ("POST", "2.1/jobs/runs/repair")
+OUTPUT_RUNS_JOB_ENDPOINT = ("GET", "2.1/jobs/runs/get-output")
+CANCEL_ALL_RUNS_ENDPOINT = ("POST", "2.1/jobs/runs/cancel-all")
+
+INSTALL_LIBS_ENDPOINT = ("POST", "2.0/libraries/install")
+UNINSTALL_LIBS_ENDPOINT = ("POST", "2.0/libraries/uninstall")
+
+UPDATE_REPO_ENDPOINT = ("PATCH", "2.0/repos/")
+DELETE_REPO_ENDPOINT = ("DELETE", "2.0/repos/")
+CREATE_REPO_ENDPOINT = ("POST", "2.0/repos")
+
+LIST_JOBS_ENDPOINT = ("GET", "2.1/jobs/list")
+LIST_PIPELINES_ENDPOINT = ("GET", "2.0/pipelines")
+
+WORKSPACE_GET_STATUS_ENDPOINT = ("GET", "2.0/workspace/get-status")
+
+SPARK_VERSIONS_ENDPOINT = ("GET", "2.0/clusters/spark-versions")
+SQL_STATEMENTS_ENDPOINT = "2.0/sql/statements"


 class RunLifeCycleState(Enum):

@@ -718,7 +721,8 @@ class DatabricksHook(BaseDatabricksHook):
         :param json: payload
         :return: metadata from update
         """
-        repos_endpoint = ("PATCH", f"api/2.0/repos/{repo_id}")
+        method, base_path = UPDATE_REPO_ENDPOINT
+        repos_endpoint = (method, f"{base_path}/{repo_id}")
         return self._do_api_call(repos_endpoint, json)

     def delete_repo(self, repo_id: str):

@@ -728,7 +732,8 @@ class DatabricksHook(BaseDatabricksHook):
         :param repo_id: ID of Databricks Repos
         :return:
         """
-        repos_endpoint = ("DELETE", f"api/2.0/repos/{repo_id}")
+        method, base_path = DELETE_REPO_ENDPOINT
+        repos_endpoint = (method, f"{base_path}/{repo_id}")
         self._do_api_call(repos_endpoint)

     def create_repo(self, json: dict[str, Any]) -> dict:

@@ -738,8 +743,7 @@ class DatabricksHook(BaseDatabricksHook):
         :param json: payload
         :return:
         """
-        repos_endpoint = ("POST", "api/2.0/repos")
-        return self._do_api_call(repos_endpoint, json)
+        return self._do_api_call(CREATE_REPO_ENDPOINT, json)

     def get_repo_by_path(self, path: str) -> str | None:
         """
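With the `api/` prefix now added centrally (see the databricks_base.py hunks below), callers keep using the constants unchanged. A minimal sketch of the resulting call, with an assumed connection id; `_do_api_call` is a private helper, shown here only to illustrate the URL expansion:

    from airflow.providers.databricks.hooks.databricks import GET_RUN_ENDPOINT, DatabricksHook

    hook = DatabricksHook(databricks_conn_id="databricks_default")
    # The hook expands ("GET", "2.1/jobs/runs/get") to https://<host>/api/2.1/jobs/runs/get
    run = hook._do_api_call(GET_RUN_ENDPOINT, {"run_id": 42})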
airflow/providers/databricks/hooks/databricks_base.py

@@ -50,9 +50,13 @@ from tenacity import (

 from airflow import __version__
 from airflow.exceptions import AirflowException, AirflowOptionalProviderFeatureException
-from airflow.hooks.base import BaseHook
 from airflow.providers_manager import ProvidersManager

+try:
+    from airflow.sdk import BaseHook
+except ImportError:
+    from airflow.hooks.base import BaseHook as BaseHook  # type: ignore
+
 if TYPE_CHECKING:
     from airflow.models import Connection

@@ -135,7 +139,7 @@ class BaseDatabricksHook(BaseHook):

     @cached_property
     def databricks_conn(self) -> Connection:
-        return self.get_connection(self.databricks_conn_id)
+        return self.get_connection(self.databricks_conn_id)  # type: ignore[return-value]

     def get_conn(self) -> Connection:
         return self.databricks_conn

@@ -353,14 +357,15 @@ class BaseDatabricksHook(BaseHook):
         async for attempt in self._a_get_retry_object():
             with attempt:
                 if self.databricks_conn.extra_dejson.get("use_azure_managed_identity", False):
-                    credential = AsyncManagedIdentityCredential()
+                    async with AsyncManagedIdentityCredential() as credential:
+                        token = await credential.get_token(f"{resource}/.default")
                 else:
-                    credential = AsyncClientSecretCredential(
+                    async with AsyncClientSecretCredential(
                         client_id=self.databricks_conn.login,
                         client_secret=self.databricks_conn.password,
                         tenant_id=self.databricks_conn.extra_dejson["azure_tenant_id"],
-                    )
-                token = await credential.get_token(f"{resource}/.default")
+                    ) as credential:
+                        token = await credential.get_token(f"{resource}/.default")
                 jsn = {
                     "access_token": token.token,
                     "token_type": "Bearer",

@@ -636,8 +641,9 @@ class BaseDatabricksHook(BaseHook):
         """
         method, endpoint = endpoint_info

-        # ...
-        url = self._endpoint_url(endpoint)
+        # Automatically prepend 'api/' prefix to all endpoint paths
+        full_endpoint = f"api/{endpoint}"
+        url = self._endpoint_url(full_endpoint)

         aad_headers = self._get_aad_headers()
         headers = {**self.user_agent_header, **aad_headers}

@@ -703,7 +709,8 @@ class BaseDatabricksHook(BaseHook):
         """
         method, endpoint = endpoint_info

-        url = self._endpoint_url(endpoint)
+        full_endpoint = f"api/{endpoint}"
+        url = self._endpoint_url(full_endpoint)

         aad_headers = await self._a_get_aad_headers()
         headers = {**self.user_agent_header, **aad_headers}
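The Azure credential hunk above switches to async context managers, so each credential's underlying HTTP session is closed once the token is fetched. A standalone sketch of the same pattern, assuming `azure-identity` is installed; the tenant/client/resource values are placeholders:

    import asyncio

    from azure.identity.aio import ClientSecretCredential

    async def fetch_token() -> str:
        # Entering/exiting the credential as a context manager closes its transport session.
        async with ClientSecretCredential(
            tenant_id="<tenant-id>",
            client_id="<client-id>",
            client_secret="<client-secret>",
        ) as credential:
            token = await credential.get_token("<resource-id>/.default")
            return token.token

    # asyncio.run(fetch_token())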
airflow/providers/databricks/hooks/databricks_sql.py

@@ -18,14 +18,13 @@ from __future__ import annotations

 import threading
 from collections import namedtuple
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from contextlib import closing
 from copy import copy
 from datetime import timedelta
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     TypeVar,
     cast,
     overload,

@@ -345,10 +344,9 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):

     def get_openlineage_database_specific_lineage(self, task_instance) -> OperatorLineage | None:
         """
-        ...
+        Emit separate OpenLineage events for each Databricks query, based on executed query IDs.

-        If a single query ID is present, ...
-        If multiple query IDs are present, emits separate OpenLineage events for each query instead.
+        If a single query ID is present, also add an `ExternalQueryRunFacet` to the returned lineage metadata.

         Note that `get_openlineage_database_specific_lineage` is usually called after task's execution,
         so if multiple query IDs are present, both START and COMPLETE event for each query will be emitted

@@ -369,13 +367,22 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):
         from airflow.providers.openlineage.sqlparser import SQLParser

         if not self.query_ids:
-            self.log.debug(...)
+            self.log.info("OpenLineage could not find databricks query ids.")
             return None

         self.log.debug("openlineage: getting connection to get database info")
         connection = self.get_connection(self.get_conn_id())
         namespace = SQLParser.create_namespace(self.get_openlineage_database_info(connection))

+        self.log.info("Separate OpenLineage events will be emitted for each Databricks query_id.")
+        emit_openlineage_events_for_databricks_queries(
+            task_instance=task_instance,
+            hook=self,
+            query_ids=self.query_ids,
+            query_for_extra_metadata=True,
+            query_source_namespace=namespace,
+        )
+
         if len(self.query_ids) == 1:
             self.log.debug("Attaching ExternalQueryRunFacet with single query_id to OpenLineage event.")
             return OperatorLineage(

@@ -386,12 +393,4 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):
             }
         )

-        self.log.info("Multiple query_ids found. Separate OpenLineage event will be emitted for each query.")
-        emit_openlineage_events_for_databricks_queries(
-            query_ids=self.query_ids,
-            query_source_namespace=namespace,
-            task_instance=task_instance,
-            hook=self,
-        )
-
         return None
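For context, `query_ids` is populated by the hook as statements execute, so after this change every executed statement gets its own OpenLineage event pair. A hedged usage sketch (connection id assumed; the lineage call itself happens via the OpenLineage listener after task execution):

    from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

    hook = DatabricksSqlHook(databricks_conn_id="databricks_default")
    hook.run("SELECT 1", handler=lambda cur: cur.fetchall())
    # hook.query_ids now holds the executed statement ids; when
    # get_openlineage_database_specific_lineage runs, one START/COMPLETE pair is
    # emitted per id, plus an ExternalQueryRunFacet if there is exactly one id.
    print(hook.query_ids)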
airflow/providers/databricks/operators/databricks.py

@@ -29,7 +29,6 @@ from typing import TYPE_CHECKING, Any

 from airflow.configuration import conf
 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator
 from airflow.providers.databricks.hooks.databricks import (
     DatabricksHook,
     RunLifeCycleState,

@@ -42,13 +41,14 @@ from airflow.providers.databricks.operators.databricks_workflow import (
 from airflow.providers.databricks.plugins.databricks_workflow import (
     WorkflowJobRepairSingleTaskLink,
     WorkflowJobRunLink,
+    store_databricks_job_run_link,
 )
 from airflow.providers.databricks.triggers.databricks import (
     DatabricksExecutionTrigger,
 )
 from airflow.providers.databricks.utils.databricks import normalise_json_content, validate_trigger_event
 from airflow.providers.databricks.utils.mixins import DatabricksSQLStatementsMixin
-from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS
+from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS, BaseOperator

 if TYPE_CHECKING:
     from airflow.models.taskinstancekey import TaskInstanceKey

@@ -1214,10 +1214,16 @@ class DatabricksTaskBaseOperator(BaseOperator, ABC):
         super().__init__(**kwargs)

         if self._databricks_workflow_task_group is not None:
-            self.operator_extra_links = (
-                WorkflowJobRunLink(),
-                WorkflowJobRepairSingleTaskLink(),
-            )
+            # Conditionally set operator_extra_links based on Airflow version. In Airflow 3, only show the job run link.
+            # In Airflow 2, show the job run link and the repair link.
+            # TODO: Once we expand the plugin functionality in Airflow 3.1, this can be re-evaluated on how to handle the repair link.
+            if AIRFLOW_V_3_0_PLUS:
+                self.operator_extra_links = (WorkflowJobRunLink(),)
+            else:
+                self.operator_extra_links = (
+                    WorkflowJobRunLink(),
+                    WorkflowJobRepairSingleTaskLink(),
+                )
         else:
             # Databricks does not support repair for non-workflow tasks, hence do not show the repair link.
             self.operator_extra_links = (DatabricksJobRunLink(),)

@@ -1427,6 +1433,15 @@ class DatabricksTaskBaseOperator(BaseOperator, ABC):
             )
             self.databricks_run_id = workflow_run_metadata.run_id
             self.databricks_conn_id = workflow_run_metadata.conn_id
+
+            # Store operator links in XCom for Airflow 3 compatibility
+            if AIRFLOW_V_3_0_PLUS:
+                # Store the job run link
+                store_databricks_job_run_link(
+                    context=context,
+                    metadata=workflow_run_metadata,
+                    logger=self.log,
+                )
         else:
             self._launch_job(context=context)
         if self.wait_for_termination:
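The `operator_extra_links` assignments above happen per instance at `__init__` time because the link set depends both on the Airflow major version and on whether the task lives inside a workflow task group. A minimal standalone sketch of that shape; `_RunLink`, `_RepairLink`, and `MyDatabricksTask` are illustrative stand-ins, not provider classes:

    from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS, BaseOperator

    class _RunLink:  # stand-in for a BaseOperatorLink subclass
        name = "See Job Run"

    class _RepairLink:  # stand-in; repair links only make sense on Airflow 2.x
        name = "Repair Task"

    class MyDatabricksTask(BaseOperator):
        def __init__(self, *, in_workflow: bool = False, **kwargs):
            super().__init__(**kwargs)
            # Same shape as the diff above: the repair link exists only on Airflow 2.x.
            if in_workflow and not AIRFLOW_V_3_0_PLUS:
                self.operator_extra_links = (_RunLink(), _RepairLink())
            else:
                self.operator_extra_links = (_RunLink(),)

        def execute(self, context):
            return None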
airflow/providers/databricks/operators/databricks_repos.py

@@ -26,8 +26,8 @@ from typing import TYPE_CHECKING
 from urllib.parse import urlsplit

 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator
 from airflow.providers.databricks.hooks.databricks import DatabricksHook
+from airflow.providers.databricks.version_compat import BaseOperator

 if TYPE_CHECKING:
     try:
airflow/providers/databricks/operators/databricks_sql.py

@@ -28,9 +28,9 @@ from typing import TYPE_CHECKING, Any, ClassVar
 from databricks.sql.utils import ParamEscaper

 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator
 from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
 from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook
+from airflow.providers.databricks.version_compat import BaseOperator

 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/databricks/operators/databricks_workflow.py

@@ -26,12 +26,13 @@ from typing import TYPE_CHECKING, Any
 from mergedeep import merge

 from airflow.exceptions import AirflowException
-from airflow.models import BaseOperator
 from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState
 from airflow.providers.databricks.plugins.databricks_workflow import (
     WorkflowJobRepairAllFailedLink,
     WorkflowJobRunLink,
+    store_databricks_job_run_link,
 )
+from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS, BaseOperator
 from airflow.utils.task_group import TaskGroup

 if TYPE_CHECKING:

@@ -92,9 +93,18 @@ class _CreateDatabricksWorkflowOperator(BaseOperator):
     populated after instantiation using the `add_task` method.
     """

-    operator_extra_links = (WorkflowJobRunLink(), WorkflowJobRepairAllFailedLink())
     template_fields = ("notebook_params", "job_clusters")
     caller = "_CreateDatabricksWorkflowOperator"
+    # Conditionally set operator_extra_links based on Airflow version
+    if AIRFLOW_V_3_0_PLUS:
+        # In Airflow 3, disable "Repair All Failed Tasks" since we can't pre-determine failed tasks
+        operator_extra_links = (WorkflowJobRunLink(),)
+    else:
+        # In Airflow 2.x, keep both links
+        operator_extra_links = (  # type: ignore[assignment]
+            WorkflowJobRunLink(),
+            WorkflowJobRepairAllFailedLink(),
+        )

     def __init__(
         self,

@@ -219,6 +229,15 @@ class _CreateDatabricksWorkflowOperator(BaseOperator):
             run_id,
         )

+        # Store operator links in XCom for Airflow 3 compatibility
+        if AIRFLOW_V_3_0_PLUS:
+            # Store the job run link
+            store_databricks_job_run_link(
+                context=context,
+                metadata=self.workflow_run_metadata,
+                logger=self.log,
+            )
+
         return {
             "conn_id": self.databricks_conn_id,
             "job_id": job_id,
airflow/providers/databricks/plugins/databricks_workflow.py

@@ -40,7 +40,6 @@ if AIRFLOW_V_3_0_PLUS:
 else:
     from airflow.www import auth  # type: ignore
 from airflow.utils.log.logging_mixin import LoggingMixin
-from airflow.utils.session import NEW_SESSION, provide_session
 from airflow.utils.state import TaskInstanceState
 from airflow.utils.task_group import TaskGroup

@@ -49,6 +48,7 @@ if TYPE_CHECKING:

     from airflow.models import BaseOperator
     from airflow.providers.databricks.operators.databricks import DatabricksTaskBaseOperator
+    from airflow.utils.context import Context

 if AIRFLOW_V_3_0_PLUS:
     from airflow.sdk import BaseOperatorLink

@@ -93,32 +93,56 @@ def get_databricks_task_ids(
     return task_ids


-@provide_session
-def _get_dagrun(dag: DAG, run_id: str, session: Session | None = None) -> DagRun:
-    """
-    Retrieve the DagRun object associated with the specified DAG and run_id.
-
-    :param dag: The DAG object associated with the DagRun to retrieve.
-    :param run_id: The run_id associated with the DagRun to retrieve.
-    :param session: The SQLAlchemy session to use for the query. If None, uses the default session.
-    :return: The DagRun object associated with the specified DAG and run_id.
-    """
-    if not session:
-        raise AirflowException("Session not provided.")
-
-    return session.query(DagRun).filter(DagRun.dag_id == dag.dag_id, DagRun.run_id == run_id).first()
-
-
-@provide_session
-def _clear_task_instances(
-    dag_id: str, run_id: str, task_ids: list[str], log: logging.Logger, session: Session | None = None
-) -> None:
-    dag_bag = DagBag(read_dags_from_db=True)
-    dag = dag_bag.get_dag(dag_id)
-    log.debug("task_ids %s to clear", str(task_ids))
-    dr: DagRun = _get_dagrun(dag, run_id, session=session)
-    tis_to_clear = [ti for ti in dr.get_task_instances() if ti.databricks_task_key in task_ids]
-    clear_task_instances(tis_to_clear, session)
+# TODO: Need to re-think on how to support the currently unavailable repair functionality in Airflow 3. Probably a
+# good time to re-evaluate this would be once the plugin functionality is expanded in Airflow 3.1.
+if not AIRFLOW_V_3_0_PLUS:
+    from airflow.utils.session import NEW_SESSION, provide_session
+
+    @provide_session
+    def _get_dagrun(dag: DAG, run_id: str, session: Session | None = None) -> DagRun:
+        """
+        Retrieve the DagRun object associated with the specified DAG and run_id.
+
+        :param dag: The DAG object associated with the DagRun to retrieve.
+        :param run_id: The run_id associated with the DagRun to retrieve.
+        :param session: The SQLAlchemy session to use for the query. If None, uses the default session.
+        :return: The DagRun object associated with the specified DAG and run_id.
+        """
+        if not session:
+            raise AirflowException("Session not provided.")
+
+        return session.query(DagRun).filter(DagRun.dag_id == dag.dag_id, DagRun.run_id == run_id).first()
+
+    @provide_session
+    def _clear_task_instances(
+        dag_id: str, run_id: str, task_ids: list[str], log: logging.Logger, session: Session | None = None
+    ) -> None:
+        dag_bag = DagBag(read_dags_from_db=True)
+        dag = dag_bag.get_dag(dag_id)
+        log.debug("task_ids %s to clear", str(task_ids))
+        dr: DagRun = _get_dagrun(dag, run_id, session=session)
+        tis_to_clear = [ti for ti in dr.get_task_instances() if ti.databricks_task_key in task_ids]
+        clear_task_instances(tis_to_clear, session)
+
+    @provide_session
+    def get_task_instance(operator: BaseOperator, dttm, session: Session = NEW_SESSION) -> TaskInstance:
+        dag_id = operator.dag.dag_id
+        if hasattr(DagRun, "execution_date"):  # Airflow 2.x.
+            dag_run = DagRun.find(dag_id, execution_date=dttm)[0]  # type: ignore[call-arg]
+        else:
+            dag_run = DagRun.find(dag_id, logical_date=dttm)[0]
+        ti = (
+            session.query(TaskInstance)
+            .filter(
+                TaskInstance.dag_id == dag_id,
+                TaskInstance.run_id == dag_run.run_id,
+                TaskInstance.task_id == operator.task_id,
+            )
+            .one_or_none()
+        )
+        if not ti:
+            raise TaskInstanceNotFound("Task instance not found")
+        return ti


 def _repair_task(

@@ -201,27 +225,6 @@ def _get_launch_task_key(current_task_key: TaskInstanceKey, task_id: str) -> Tas
     return current_task_key


-@provide_session
-def get_task_instance(operator: BaseOperator, dttm, session: Session = NEW_SESSION) -> TaskInstance:
-    dag_id = operator.dag.dag_id
-    if hasattr(DagRun, "execution_date"):  # Airflow 2.x.
-        dag_run = DagRun.find(dag_id, execution_date=dttm)[0]  # type: ignore[call-arg]
-    else:
-        dag_run = DagRun.find(dag_id, logical_date=dttm)[0]
-    ti = (
-        session.query(TaskInstance)
-        .filter(
-            TaskInstance.dag_id == dag_id,
-            TaskInstance.run_id == dag_run.run_id,
-            TaskInstance.task_id == operator.task_id,
-        )
-        .one_or_none()
-    )
-    if not ti:
-        raise TaskInstanceNotFound("Task instance not found")
-    return ti
-
-
 def get_xcom_result(
     ti_key: TaskInstanceKey,
     key: str,

@@ -240,6 +243,11 @@ class WorkflowJobRunLink(BaseOperatorLink, LoggingMixin):

     name = "See Databricks Job Run"

+    @property
+    def xcom_key(self) -> str:
+        """XCom key where the link is stored during task execution."""
+        return "databricks_job_run_link"
+
     def get_link(
         self,
         operator: BaseOperator,

@@ -247,6 +255,29 @@ class WorkflowJobRunLink(BaseOperatorLink, LoggingMixin):
         *,
         ti_key: TaskInstanceKey | None = None,
     ) -> str:
+        if AIRFLOW_V_3_0_PLUS:
+            # Use public XCom API to get the pre-computed link
+            try:
+                link = XCom.get_value(
+                    ti_key=ti_key,
+                    key=self.xcom_key,
+                )
+                return link if link else ""
+            except Exception as e:
+                self.log.warning("Failed to retrieve Databricks job run link from XCom: %s", e)
+                return ""
+        else:
+            # Airflow 2.x - keep original implementation
+            return self._get_link_legacy(operator, dttm, ti_key=ti_key)
+
+    def _get_link_legacy(
+        self,
+        operator: BaseOperator,
+        dttm=None,
+        *,
+        ti_key: TaskInstanceKey | None = None,
+    ) -> str:
+        """Legacy implementation for Airflow 2.x."""
         if not ti_key:
             ti = get_task_instance(operator, dttm)
             ti_key = ti.key

@@ -269,6 +300,30 @@ class WorkflowJobRunLink(BaseOperatorLink, LoggingMixin):
         return f"https://{hook.host}/#job/{metadata.job_id}/run/{metadata.run_id}"


+def store_databricks_job_run_link(
+    context: Context,
+    metadata: Any,
+    logger: logging.Logger,
+) -> None:
+    """
+    Store the Databricks job run link in XCom during task execution.
+
+    This should be called by Databricks operators during their execution.
+    """
+    if not AIRFLOW_V_3_0_PLUS:
+        return  # Only needed for Airflow 3
+
+    try:
+        hook = DatabricksHook(metadata.conn_id)
+        link = f"https://{hook.host}/#job/{metadata.job_id}/run/{metadata.run_id}"
+
+        # Store the link in XCom for the UI to retrieve as extra link
+        context["ti"].xcom_push(key="databricks_job_run_link", value=link)
+        logger.info("Stored Databricks job run link in XCom: %s", link)
+    except Exception as e:
+        logger.warning("Failed to store Databricks job run link: %s", e)
+
+
 class WorkflowJobRepairAllFailedLink(BaseOperatorLink, LoggingMixin):
     """Constructs a link to send a request to repair all failed tasks in the Databricks workflow."""

@@ -455,13 +510,6 @@ class RepairDatabricksTasks(BaseView, LoggingMixin):
         return url_for("Airflow.grid", dag_id=dag_id, dag_run_id=run_id)


-repair_databricks_view = RepairDatabricksTasks()
-
-repair_databricks_package = {
-    "view": repair_databricks_view,
-}
-
-
 class DatabricksWorkflowPlugin(AirflowPlugin):
     """
     Databricks Workflows plugin for Airflow.

@@ -472,9 +520,22 @@ class DatabricksWorkflowPlugin(AirflowPlugin):
     """

     name = "databricks_workflow"
-    operator_extra_links = [
-        WorkflowJobRepairAllFailedLink(),
-        WorkflowJobRepairSingleTaskLink(),
-        WorkflowJobRunLink(),
-    ]
-    appbuilder_views = [repair_databricks_package]
+
+    # Conditionally set operator_extra_links based on Airflow version
+    if AIRFLOW_V_3_0_PLUS:
+        # In Airflow 3, disable the links for repair functionality until it is figured out it can be supported
+        operator_extra_links = [
+            WorkflowJobRunLink(),
+        ]
+    else:
+        # In Airflow 2.x, keep all links including repair all failed tasks
+        operator_extra_links = [
+            WorkflowJobRepairAllFailedLink(),
+            WorkflowJobRepairSingleTaskLink(),
+            WorkflowJobRunLink(),
+        ]
+        repair_databricks_view = RepairDatabricksTasks()
+        repair_databricks_package = {
+            "view": repair_databricks_view,
+        }
+        appbuilder_views = [repair_databricks_package]
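The plugin changes above replace DB-session-based link computation with an XCom round trip on Airflow 3: the operator pushes the URL during execution, and the operator link later reads it back by task-instance key. A condensed sketch of that flow; `LINK_KEY` and `LinkMixin` are illustrative, and the provider's real classes add version guards and error handling:

    from airflow.models import XCom

    LINK_KEY = "databricks_job_run_link"

    class LinkMixin:
        def push_link(self, context, host: str, job_id: int, run_id: int) -> None:
            # Runs inside the task: persist the URL where the webserver can read it.
            context["ti"].xcom_push(key=LINK_KEY, value=f"https://{host}/#job/{job_id}/run/{run_id}")

        def read_link(self, ti_key) -> str:
            # Runs in the UI process: no DAG parsing or DB session handling needed.
            return XCom.get_value(ti_key=ti_key, key=LINK_KEY) or ""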
airflow/providers/databricks/sensors/databricks.py

@@ -30,9 +30,9 @@ from airflow.providers.databricks.utils.mixins import DatabricksSQLStatementsMix
 from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS

 if AIRFLOW_V_3_0_PLUS:
-    from airflow.sdk import BaseSensorOperator
+    from airflow.sdk.bases.sensor import BaseSensorOperator
 else:
-    from airflow.sensors.base import BaseSensorOperator
+    from airflow.sensors.base import BaseSensorOperator  # type: ignore[no-redef]

 if TYPE_CHECKING:
     from airflow.utils.context import Context
airflow/providers/databricks/sensors/databricks_partition.py

@@ -20,17 +20,22 @@

 from __future__ import annotations

-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from datetime import datetime
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any

 from databricks.sql.utils import ParamEscaper

 from airflow.exceptions import AirflowException
 from airflow.providers.common.sql.hooks.handlers import fetch_all_handler
 from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook
-from airflow.sensors.base import BaseSensorOperator
+from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS
+
+if AIRFLOW_V_3_0_PLUS:
+    from airflow.sdk import BaseSensorOperator
+else:
+    from airflow.sensors.base import BaseSensorOperator  # type: ignore[no-redef]

 if TYPE_CHECKING:
     try:
airflow/providers/databricks/sensors/databricks_sql.py

@@ -20,14 +20,19 @@

 from __future__ import annotations

-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Sequence
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any

 from airflow.exceptions import AirflowException
 from airflow.providers.common.sql.hooks.handlers import fetch_all_handler
 from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook
-from airflow.sensors.base import BaseSensorOperator
+from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS
+
+if AIRFLOW_V_3_0_PLUS:
+    from airflow.sdk import BaseSensorOperator
+else:
+    from airflow.sensors.base import BaseSensorOperator  # type: ignore[no-redef]

 if TYPE_CHECKING:
     try:
airflow/providers/databricks/utils/openlineage.py

@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from openlineage.client.event_v2 import RunEvent
     from openlineage.client.facet_v2 import JobFacet

+    from airflow.providers.databricks.hooks.databricks import DatabricksHook
     from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

 log = logging.getLogger(__name__)

@@ -121,20 +122,18 @@ def _get_parent_run_facet(task_instance):
     )


-def _run_api_call(hook: DatabricksSqlHook, query_ids: list[str]) -> list[dict]:
+def _run_api_call(hook: DatabricksSqlHook | DatabricksHook, query_ids: list[str]) -> list[dict]:
     """Retrieve execution details for specific queries from Databricks's query history API."""
-    if not hook._token:
-        # This has logic for token initialization
-        hook.get_conn()
-
-    # https://docs.databricks.com/api/azure/workspace/queryhistory/list
     try:
+        token = hook._get_token(raise_error=True)
+        # https://docs.databricks.com/api/azure/workspace/queryhistory/list
         response = requests.get(
             url=f"https://{hook.host}/api/2.0/sql/history/queries",
-            headers={"Authorization": f"Bearer {hook._token}"},
+            headers={"Authorization": f"Bearer {token}"},
             data=json.dumps({"filter_by": {"statement_ids": query_ids}}),
             timeout=2,
         )
+        response.raise_for_status()
     except Exception as e:
         log.warning(
             "OpenLineage could not retrieve Databricks queries details. Error received: `%s`.",

@@ -142,48 +141,42 @@ def _run_api_call(hook: DatabricksSqlHook, query_ids: list[str]) -> list[dict]:
         )
         return []

-    if response.status_code != 200:
-        log.warning(
-            "OpenLineage could not retrieve Databricks queries details. API error received: `%s`: `%s`",
-            response.status_code,
-            response.text,
-        )
-        return []
-
     return response.json()["res"]


+def _process_data_from_api(data: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Convert timestamp fields to UTC datetime objects."""
+    for row in data:
+        for key in ("query_start_time_ms", "query_end_time_ms"):
+            row[key] = datetime.datetime.fromtimestamp(row[key] / 1000, tz=datetime.timezone.utc)
+
+    return data
+
+
 def _get_queries_details_from_databricks(
-    hook: DatabricksSqlHook, query_ids: list[str]
+    hook: DatabricksSqlHook | DatabricksHook, query_ids: list[str]
 ) -> dict[str, dict[str, Any]]:
     if not query_ids:
         return {}

-    queries_info_from_api = _run_api_call(hook=hook, query_ids=query_ids)
-
     query_details = {}
-    for query_info in queries_info_from_api:
-        ...
-        query_details[query_info["query_id"]] = {
-            "status": query_info.get("status"),
-            "start_time": q_start_time,
-            "end_time": q_end_time,
-            "query_text": query_info.get("query_text"),
-            "error_message": query_info.get("error_message"),
+    try:
+        queries_info_from_api = _run_api_call(hook=hook, query_ids=query_ids)
+        queries_info_from_api = _process_data_from_api(queries_info_from_api)
+
+        query_details = {
+            query_info["query_id"]: {
+                "status": query_info.get("status"),
+                "start_time": query_info.get("query_start_time_ms"),
+                "end_time": query_info.get("query_end_time_ms"),
+                "query_text": query_info.get("query_text"),
+                "error_message": query_info.get("error_message"),
+            }
+            for query_info in queries_info_from_api
+            if query_info["query_id"]
         }
+    except Exception as e:
+        log.warning("OpenLineage could not retrieve extra metadata from Databricks. Error encountered: %s", e)

     return query_details

@@ -221,17 +214,18 @@ def _create_ol_event_pair(

 @require_openlineage_version(provider_min_version="2.3.0")
 def emit_openlineage_events_for_databricks_queries(
-    query_ids: list[str],
-    query_source_namespace: str,
     task_instance,
-    hook: DatabricksSqlHook | None = None,
+    hook: DatabricksSqlHook | DatabricksHook | None = None,
+    query_ids: list[str] | None = None,
+    query_source_namespace: str | None = None,
+    query_for_extra_metadata: bool = False,
     additional_run_facets: dict | None = None,
     additional_job_facets: dict | None = None,
 ) -> None:
     """
     Emit OpenLineage events for executed Databricks queries.

-    Metadata retrieval from Databricks is attempted only if `hook` is provided.
+    Metadata retrieval from Databricks is attempted only if `get_extra_metadata` is True and hook is provided.
     If metadata is available, execution details such as start time, end time, execution status,
     error messages, and SQL text are included in the events. If no metadata is found, the function
     defaults to using the Airflow task instance's state and the current timestamp.

@@ -241,10 +235,16 @@ def emit_openlineage_events_for_databricks_queries(
     will correspond to actual query execution times.

     Args:
-        query_ids: A list of Databricks query IDs to emit events for.
-        query_source_namespace: The namespace to be included in ExternalQueryRunFacet.
         task_instance: The Airflow task instance that run these queries.
-        hook: A hook instance used to retrieve query metadata if available.
+        hook: A supported Databricks hook instance used to retrieve query metadata if available.
+            If omitted, `query_ids` and `query_source_namespace` must be provided explicitly and
+            `query_for_extra_metadata` must be `False`.
+        query_ids: A list of Databricks query IDs to emit events for, can only be None if `hook` is provided
+            and `hook.query_ids` are present (DatabricksHook does not store query_ids).
+        query_source_namespace: The namespace to be included in ExternalQueryRunFacet,
+            can be `None` only if hook is provided.
+        query_for_extra_metadata: Whether to query Databricks for additional metadata about queries.
+            Must be `False` if `hook` is not provided.
         additional_run_facets: Additional run facets to include in OpenLineage events.
         additional_job_facets: Additional job facets to include in OpenLineage events.
     """

@@ -259,25 +259,52 @@ def emit_openlineage_events_for_databricks_queries(
     from airflow.providers.openlineage.conf import namespace
     from airflow.providers.openlineage.plugins.listener import get_openlineage_listener

-    if not query_ids:
-        log.debug("No Databricks query IDs provided; skipping OpenLineage event emission.")
-        return
-
-    query_ids = [q for q in query_ids]  # Make a copy to make sure it does not change
     log.info("OpenLineage will emit events for Databricks queries.")

     if hook:
+        if not query_ids:
+            log.debug("No Databricks query IDs provided; Checking `hook.query_ids` property.")
+            query_ids = getattr(hook, "query_ids", [])
+            if not query_ids:
+                raise ValueError("No Databricks query IDs provided and `hook.query_ids` are not present.")
+
+        if not query_source_namespace:
+            log.debug("No Databricks query namespace provided; Creating one from scratch.")
+
+            if hasattr(hook, "get_openlineage_database_info") and hasattr(hook, "get_conn_id"):
+                from airflow.providers.openlineage.sqlparser import SQLParser
+
+                query_source_namespace = SQLParser.create_namespace(
+                    hook.get_openlineage_database_info(hook.get_connection(hook.get_conn_id()))
+                )
+            else:
+                query_source_namespace = f"databricks://{hook.host}" if hook.host else "databricks"
+    else:
+        if not query_ids:
+            raise ValueError("If 'hook' is not provided, 'query_ids' must be set.")
+        if not query_source_namespace:
+            raise ValueError("If 'hook' is not provided, 'query_source_namespace' must be set.")
+        if query_for_extra_metadata:
+            raise ValueError("If 'hook' is not provided, 'query_for_extra_metadata' must be False.")
+
+    query_ids = [q for q in query_ids]  # Make a copy to make sure we do not change hook's attribute
+
+    if query_for_extra_metadata and hook:
         log.debug("Retrieving metadata for %s queries from Databricks.", len(query_ids))
         databricks_metadata = _get_queries_details_from_databricks(hook, query_ids)
     else:
-        log.debug("...")
+        log.debug("`query_for_extra_metadata` is False. No extra metadata fill be fetched from Databricks.")
         databricks_metadata = {}

     # If real metadata is unavailable, we send events with eventTime=now
     default_event_time = timezone.utcnow()
-    # ...
+    # ti.state has no `value` attr (AF2) when task it's still running, in AF3 we get 'running', in that case
+    # assuming it's user call and query succeeded, so we replace it with success.
     # Adjust state for DBX logic, where "finished" means "success"
-    default_state = task_instance.state.value if hasattr(task_instance, "state") else ""
-    default_state = "finished" if default_state == "success" else default_state
+    default_state = (
+        getattr(task_instance.state, "value", "running") if hasattr(task_instance, "state") else ""
+    )
+    default_state = "finished" if default_state in ("running", "success") else default_state

     log.debug("Generating OpenLineage facets")
     common_run_facets = {"parent": _get_parent_run_facet(task_instance)}

@@ -318,10 +345,10 @@ def emit_openlineage_events_for_databricks_queries(
         event_batch = _create_ol_event_pair(
             job_namespace=namespace(),
             job_name=f"{task_instance.dag_id}.{task_instance.task_id}.query.{counter}",
-            start_time=query_metadata.get("start_time", default_event_time),
-            end_time=query_metadata.get("end_time", default_event_time),
+            start_time=query_metadata.get("start_time") or default_event_time,  # type: ignore[arg-type]
+            end_time=query_metadata.get("end_time") or default_event_time,  # type: ignore[arg-type]
             # Only finished status means it completed without failures
-            is_successful=query_metadata.get("status", default_state).lower() == "finished",
+            is_successful=(query_metadata.get("status") or default_state).lower() == "finished",
             run_facets={**query_specific_run_facets, **common_run_facets, **additional_run_facets},
             job_facets={**query_specific_job_facets, **common_job_facets, **additional_job_facets},
         )
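A hedged usage sketch of the reworked signature above: with no hook available, `query_ids` and `query_source_namespace` must be passed explicitly and `query_for_extra_metadata` must stay False, or the function raises. The task instance and id values below are assumed for illustration:

    from airflow.providers.databricks.utils.openlineage import (
        emit_openlineage_events_for_databricks_queries,
    )

    emit_openlineage_events_for_databricks_queries(
        task_instance=ti,  # an Airflow TaskInstance, assumed in scope
        query_ids=["statement-id-1", "statement-id-2"],  # illustrative ids
        query_source_namespace="databricks://my-workspace.cloud.databricks.com",
        query_for_extra_metadata=False,
    )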
airflow/providers/databricks/version_compat.py

@@ -33,3 +33,13 @@ def get_base_airflow_version_tuple() -> tuple[int, int, int]:


 AIRFLOW_V_3_0_PLUS = get_base_airflow_version_tuple() >= (3, 0, 0)
+
+if AIRFLOW_V_3_0_PLUS:
+    from airflow.sdk import BaseOperator
+else:
+    from airflow.models import BaseOperator
+
+__all__ = [
+    "AIRFLOW_V_3_0_PLUS",
+    "BaseOperator",
+]
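With this shim, downstream modules import `BaseOperator` from `version_compat` and transparently get the `airflow.sdk` class on Airflow 3 or the `airflow.models` class on Airflow 2, as the operator hunks above now do. A small illustrative sketch (the operator class is not part of the provider):

    from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS, BaseOperator

    class ExampleOperator(BaseOperator):  # illustrative only
        def execute(self, context):
            self.log.info("Running on Airflow %s", "3.x" if AIRFLOW_V_3_0_PLUS else "2.x")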
{apache_airflow_providers_databricks-7.5.0.dist-info → apache_airflow_providers_databricks-7.6.0.dist-info}/METADATA

@@ -1,11 +1,11 @@
 Metadata-Version: 2.4
 Name: apache-airflow-providers-databricks
-Version: 7.5.0
+Version: 7.6.0
 Summary: Provider package apache-airflow-providers-databricks for Apache Airflow
 Keywords: airflow-provider,databricks,airflow,integration
 Author-email: Apache Software Foundation <dev@airflow.apache.org>
 Maintainer-email: Apache Software Foundation <dev@airflow.apache.org>
-Requires-Python: ~=3.9
+Requires-Python: ~=3.10
 Description-Content-Type: text/x-rst
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console

@@ -15,7 +15,6 @@ Classifier: Intended Audience :: System Administrators
 Classifier: Framework :: Apache Airflow
 Classifier: Framework :: Apache Airflow :: Provider
 Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12

@@ -28,16 +27,17 @@ Requires-Dist: databricks-sql-connector>=3.0.0
 Requires-Dist: databricks-sqlalchemy>=1.0.2
 Requires-Dist: aiohttp>=3.9.2, <4
 Requires-Dist: mergedeep>=1.3.4
-Requires-Dist: pandas>=2.1.2
-Requires-Dist: pyarrow>=...
+Requires-Dist: pandas>=2.1.2; python_version <"3.13"
+Requires-Dist: pandas>=2.2.3; python_version >="3.13"
+Requires-Dist: pyarrow>=16.1.0
 Requires-Dist: azure-identity>=1.3.1 ; extra == "azure-identity"
 Requires-Dist: apache-airflow-providers-fab ; extra == "fab"
 Requires-Dist: apache-airflow-providers-openlineage>=2.3.0 ; extra == "openlineage"
 Requires-Dist: databricks-sdk==0.10.0 ; extra == "sdk"
 Requires-Dist: apache-airflow-providers-standard ; extra == "standard"
 Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
-Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.5.0/changelog.html
-Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.5.0
+Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.6.0/changelog.html
+Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.6.0
 Project-URL: Mastodon, https://fosstodon.org/@airflow
 Project-URL: Slack Chat, https://s.apache.org/airflow-slack
 Project-URL: Source Code, https://github.com/apache/airflow

@@ -73,7 +73,7 @@ Provides-Extra: standard

 Package ``apache-airflow-providers-databricks``

-Release: ``7.5.0``
+Release: ``7.6.0``

 `Databricks <https://databricks.com/>`__

@@ -86,7 +86,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
 are in ``airflow.providers.databricks`` python package.

 You can find package information and changelog for the provider
-in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.5.0/>`_.
+in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.6.0/>`_.

 Installation
 ------------

@@ -95,14 +95,14 @@ You can install this package on top of an existing Airflow 2 installation (see `
 for the minimum Airflow version supported) via
 ``pip install apache-airflow-providers-databricks``

-The package supports the following python versions: 3.9,3.10,3.11,3.12
+The package supports the following python versions: 3.10,3.11,3.12

 Requirements
 ------------

-========================================== ==================
+========================================== =====================================
 PIP package                                Version required
-========================================== ==================
+========================================== =====================================
 ``apache-airflow``                         ``>=2.10.0``
 ``apache-airflow-providers-common-compat`` ``>=1.6.0``
 ``apache-airflow-providers-common-sql``    ``>=1.27.0``

@@ -111,9 +111,10 @@ PIP package                                Version required
 ``databricks-sqlalchemy``                  ``>=1.0.2``
 ``aiohttp``                                ``>=3.9.2,<4``
 ``mergedeep``                              ``>=1.3.4``
-``pandas``                                 ``>=2.1.2``
-``pyarrow``                                ``>=...``
-========================================== ==================
+``pandas``                                 ``>=2.1.2; python_version < "3.13"``
+``pandas``                                 ``>=2.2.3; python_version >= "3.13"``
+``pyarrow``                                ``>=16.1.0``
+========================================== =====================================

 Cross provider package dependencies
 -----------------------------------

@@ -138,5 +139,5 @@ Dependent package
 ================================================================================================================== =================

 The changelog for the provider package can be found in the
-`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.5.0/changelog.html>`_.
+`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.6.0/changelog.html>`_.
{apache_airflow_providers_databricks-7.5.0.dist-info → apache_airflow_providers_databricks-7.6.0.dist-info}/RECORD

@@ -1,30 +1,30 @@
 airflow/providers/databricks/LICENSE,sha256=gXPVwptPlW1TJ4HSuG5OMPg-a3h43OGMkZRR1rpwfJA,10850
-airflow/providers/databricks/__init__.py,sha256=...
+airflow/providers/databricks/__init__.py,sha256=VqfLw47pSDD_4wxEIV-X0N2IYBRr2gjQ6HUgapVYbuA,1499
 airflow/providers/databricks/exceptions.py,sha256=85RklmLOI_PnTzfXNIUd5fAu2aMMUhelwumQAX0wANE,1261
 airflow/providers/databricks/get_provider_info.py,sha256=NZ-rY6k6ctDZN7rDngN7mAzq7RMhLag5NwfnuBNcKuw,5644
-airflow/providers/databricks/version_compat.py,sha256=...
+airflow/providers/databricks/version_compat.py,sha256=7RHBehpYMeNSBtmJiPUeJHA0c7l-Eqsdy546kW3RFa4,1712
 airflow/providers/databricks/hooks/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
-airflow/providers/databricks/hooks/databricks.py,sha256=...
-airflow/providers/databricks/hooks/databricks_base.py,sha256=...
-airflow/providers/databricks/hooks/databricks_sql.py,sha256=...
+airflow/providers/databricks/hooks/databricks.py,sha256=p_oAxWRohaVvhtlmQ3C67JUsi1fRQRW84QDr7uQ7rxk,28965
+airflow/providers/databricks/hooks/databricks_base.py,sha256=gish0H2rHEzPqI5ZpU3BPFCUaycHMEYGYev0ufJMzzI,35167
+airflow/providers/databricks/hooks/databricks_sql.py,sha256=r6LoYSk70DfzJ1kbNaJpM-oTYJm5mCSP600iC8pIY-E,16905
 airflow/providers/databricks/operators/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
-airflow/providers/databricks/operators/databricks.py,sha256=...
-airflow/providers/databricks/operators/databricks_repos.py,sha256=...
-airflow/providers/databricks/operators/databricks_sql.py,sha256=...
-airflow/providers/databricks/operators/databricks_workflow.py,sha256=...
+airflow/providers/databricks/operators/databricks.py,sha256=6rZiBdnbrs04EK-faP5DxNAHSHL3Is_q23da6N2fY7w,80047
+airflow/providers/databricks/operators/databricks_repos.py,sha256=NLigItgvQOpxhDhttkU2Jhrcu1gODXQME2i5f8w7gYk,13311
+airflow/providers/databricks/operators/databricks_sql.py,sha256=QmFUM83jY0pvnG4K-iM7Kuc4H48ORIx2jgGoOdAtEJw,21836
+airflow/providers/databricks/operators/databricks_workflow.py,sha256=Gwrtf_EYrkYEMFIwzuA2h9IFyQyk_q4mC1cdJrf8U8Q,14994
 airflow/providers/databricks/plugins/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
-airflow/providers/databricks/plugins/databricks_workflow.py,sha256=...
+airflow/providers/databricks/plugins/databricks_workflow.py,sha256=iJGrG_uhFFhuGXecBFLWxLhm4zdAj-IzPsjA3EL-cpQ,20110
 airflow/providers/databricks/sensors/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
-airflow/providers/databricks/sensors/databricks.py,sha256=...
-airflow/providers/databricks/sensors/databricks_partition.py,sha256=...
-airflow/providers/databricks/sensors/databricks_sql.py,sha256=...
+airflow/providers/databricks/sensors/databricks.py,sha256=AVSqvHDr7iDXL1WZ46MTN3KUnVSIOc_g5JEViA1MeVE,6428
+airflow/providers/databricks/sensors/databricks_partition.py,sha256=1PZo-rdRo6E7yBa30ISFjgQ-iaFdqPYm0gnN5tXgxCU,10205
+airflow/providers/databricks/sensors/databricks_sql.py,sha256=cbPKia5eH2no_sl-LltjBA-1qM64lurmB8lT9QR9eGk,5948
 airflow/providers/databricks/triggers/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
 airflow/providers/databricks/triggers/databricks.py,sha256=dSogx6GlcJfZ4CFhtlMeWs9sYFEYthP82S_U8-tM2Tk,9240
 airflow/providers/databricks/utils/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
 airflow/providers/databricks/utils/databricks.py,sha256=s0qEr_DsFhKW4uUiq2VQbtqcj52isYIplPZsUcxGPrI,2862
 airflow/providers/databricks/utils/mixins.py,sha256=WUmkt3AmXalmV6zOUIJZWbTldxYunAZOstddDhKCC94,7407
-airflow/providers/databricks/utils/openlineage.py,sha256=...
-apache_airflow_providers_databricks-7.5.0.dist-info/entry_points.txt,sha256=...
-apache_airflow_providers_databricks-7.5.0.dist-info/WHEEL,sha256=...
-apache_airflow_providers_databricks-7.5.0.dist-info/METADATA,sha256=...
-apache_airflow_providers_databricks-7.5.0.dist-info/RECORD,,
+airflow/providers/databricks/utils/openlineage.py,sha256=DVgmT4I5-mhwMwo6j_qEvF4WUP35ZmZFwc1YqL-pMMA,15230
+apache_airflow_providers_databricks-7.6.0.dist-info/entry_points.txt,sha256=hjmZm3ab2cteTR4t9eE28oKixHwNIKtLCThd6sx3XRQ,227
+apache_airflow_providers_databricks-7.6.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+apache_airflow_providers_databricks-7.6.0.dist-info/METADATA,sha256=ThoHqv3qHIIStqI9vGH84F2Er1n3GPTmD_SxMvqi_bM,6939
+apache_airflow_providers_databricks-7.6.0.dist-info/RECORD,,

WHEEL and entry_points.txt: no content changes.