apache-airflow-providers-databricks 7.8.0rc1__py3-none-any.whl → 7.9.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- airflow/providers/databricks/__init__.py +1 -1
- airflow/providers/databricks/exceptions.py +1 -1
- airflow/providers/databricks/hooks/databricks.py +30 -30
- airflow/providers/databricks/hooks/databricks_base.py +1 -1
- airflow/providers/databricks/hooks/databricks_sql.py +10 -2
- airflow/providers/databricks/operators/databricks.py +8 -10
- airflow/providers/databricks/operators/databricks_repos.py +1 -2
- airflow/providers/databricks/operators/databricks_sql.py +164 -30
- airflow/providers/databricks/operators/databricks_workflow.py +1 -2
- airflow/providers/databricks/plugins/databricks_workflow.py +32 -12
- airflow/providers/databricks/sensors/databricks.py +1 -3
- airflow/providers/databricks/sensors/databricks_partition.py +1 -2
- airflow/providers/databricks/sensors/databricks_sql.py +1 -2
- airflow/providers/databricks/utils/databricks.py +1 -1
- airflow/providers/databricks/utils/mixins.py +3 -7
- airflow/providers/databricks/utils/openlineage.py +25 -63
- {apache_airflow_providers_databricks-7.8.0rc1.dist-info → apache_airflow_providers_databricks-7.9.0.dist-info}/METADATA +23 -14
- apache_airflow_providers_databricks-7.9.0.dist-info/RECORD +31 -0
- {apache_airflow_providers_databricks-7.8.0rc1.dist-info → apache_airflow_providers_databricks-7.9.0.dist-info}/licenses/NOTICE +1 -1
- apache_airflow_providers_databricks-7.8.0rc1.dist-info/RECORD +0 -31
- {apache_airflow_providers_databricks-7.8.0rc1.dist-info → apache_airflow_providers_databricks-7.9.0.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_databricks-7.8.0rc1.dist-info → apache_airflow_providers_databricks-7.9.0.dist-info}/entry_points.txt +0 -0
- {apache_airflow_providers_databricks-7.8.0rc1.dist-info → apache_airflow_providers_databricks-7.9.0.dist-info}/licenses/LICENSE +0 -0
airflow/providers/databricks/__init__.py

@@ -29,7 +29,7 @@ from airflow import __version__ as airflow_version

 __all__ = ["__version__"]

-__version__ = "7.
+__version__ = "7.9.0"

 if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
 "2.11.0"

airflow/providers/databricks/hooks/databricks.py

@@ -20,9 +20,9 @@ Databricks hook.

 This hook enable the submitting and running of jobs to the Databricks platform. Internally the
 operators talk to the
-``api/2.
+``api/2.2/jobs/run-now``
 `endpoint <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow>`_
-or the ``api/2.
+or the ``api/2.2/jobs/runs/submit``
 `endpoint <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit>`_.
 """

@@ -34,25 +34,25 @@ from typing import Any

 from requests import exceptions as requests_exceptions

-from airflow.
+from airflow.providers.common.compat.sdk import AirflowException
 from airflow.providers.databricks.hooks.databricks_base import BaseDatabricksHook

-GET_CLUSTER_ENDPOINT = ("GET", "2.
-RESTART_CLUSTER_ENDPOINT = ("POST", "2.
-START_CLUSTER_ENDPOINT = ("POST", "2.
-TERMINATE_CLUSTER_ENDPOINT = ("POST", "2.
-
-CREATE_ENDPOINT = ("POST", "2.
-RESET_ENDPOINT = ("POST", "2.
-UPDATE_ENDPOINT = ("POST", "2.
-RUN_NOW_ENDPOINT = ("POST", "2.
-SUBMIT_RUN_ENDPOINT = ("POST", "2.
-GET_RUN_ENDPOINT = ("GET", "2.
-CANCEL_RUN_ENDPOINT = ("POST", "2.
-DELETE_RUN_ENDPOINT = ("POST", "2.
-REPAIR_RUN_ENDPOINT = ("POST", "2.
-OUTPUT_RUNS_JOB_ENDPOINT = ("GET", "2.
-CANCEL_ALL_RUNS_ENDPOINT = ("POST", "2.
+GET_CLUSTER_ENDPOINT = ("GET", "2.1/clusters/get")
+RESTART_CLUSTER_ENDPOINT = ("POST", "2.1/clusters/restart")
+START_CLUSTER_ENDPOINT = ("POST", "2.1/clusters/start")
+TERMINATE_CLUSTER_ENDPOINT = ("POST", "2.1/clusters/delete")
+
+CREATE_ENDPOINT = ("POST", "2.2/jobs/create")
+RESET_ENDPOINT = ("POST", "2.2/jobs/reset")
+UPDATE_ENDPOINT = ("POST", "2.2/jobs/update")
+RUN_NOW_ENDPOINT = ("POST", "2.2/jobs/run-now")
+SUBMIT_RUN_ENDPOINT = ("POST", "2.2/jobs/runs/submit")
+GET_RUN_ENDPOINT = ("GET", "2.2/jobs/runs/get")
+CANCEL_RUN_ENDPOINT = ("POST", "2.2/jobs/runs/cancel")
+DELETE_RUN_ENDPOINT = ("POST", "2.2/jobs/runs/delete")
+REPAIR_RUN_ENDPOINT = ("POST", "2.2/jobs/runs/repair")
+OUTPUT_RUNS_JOB_ENDPOINT = ("GET", "2.2/jobs/runs/get-output")
+CANCEL_ALL_RUNS_ENDPOINT = ("POST", "2.2/jobs/runs/cancel-all")

 INSTALL_LIBS_ENDPOINT = ("POST", "2.0/libraries/install")
 UNINSTALL_LIBS_ENDPOINT = ("POST", "2.0/libraries/uninstall")

@@ -60,13 +60,13 @@ UPDATE_REPO_ENDPOINT = ("PATCH", "2.0/repos/")
 DELETE_REPO_ENDPOINT = ("DELETE", "2.0/repos/")
 CREATE_REPO_ENDPOINT = ("POST", "2.0/repos")

-LIST_JOBS_ENDPOINT = ("GET", "2.
+LIST_JOBS_ENDPOINT = ("GET", "2.2/jobs/list")
 LIST_PIPELINES_ENDPOINT = ("GET", "2.0/pipelines")
-LIST_SQL_ENDPOINTS_ENDPOINT = ("GET", "2.0/sql/
+LIST_SQL_ENDPOINTS_ENDPOINT = ("GET", "2.0/sql/warehouses")

 WORKSPACE_GET_STATUS_ENDPOINT = ("GET", "2.0/workspace/get-status")

-SPARK_VERSIONS_ENDPOINT = ("GET", "2.
+SPARK_VERSIONS_ENDPOINT = ("GET", "2.1/clusters/spark-versions")
 SQL_STATEMENTS_ENDPOINT = "2.0/sql/statements"


@@ -293,7 +293,7 @@ class DatabricksHook(BaseDatabricksHook):

 def create_job(self, json: dict) -> int:
 """
-Call the ``api/2.
+Call the ``api/2.2/jobs/create`` endpoint.

 :param json: The data used in the body of the request to the ``create`` endpoint.
 :return: the job_id as an int

@@ -303,7 +303,7 @@ class DatabricksHook(BaseDatabricksHook):

 def reset_job(self, job_id: str, json: dict) -> None:
 """
-Call the ``api/2.
+Call the ``api/2.2/jobs/reset`` endpoint.

 :param json: The data used in the new_settings of the request to the ``reset`` endpoint.
 """

@@ -321,7 +321,7 @@ class DatabricksHook(BaseDatabricksHook):

 def update_job(self, job_id: str, json: dict) -> None:
 """
-Call the ``api/2.
+Call the ``api/2.2/jobs/update`` endpoint.

 :param job_id: The id of the job to update.
 :param json: The data used in the new_settings of the request to the ``update`` endpoint.

@@ -330,7 +330,7 @@ class DatabricksHook(BaseDatabricksHook):

 def run_now(self, json: dict) -> int:
 """
-Call the ``api/2.
+Call the ``api/2.2/jobs/run-now`` endpoint.

 :param json: The data used in the body of the request to the ``run-now`` endpoint.
 :return: the run_id as an int

@@ -340,7 +340,7 @@ class DatabricksHook(BaseDatabricksHook):

 def submit_run(self, json: dict) -> int:
 """
-Call the ``api/2.
+Call the ``api/2.2/jobs/runs/submit`` endpoint.

 :param json: The data used in the body of the request to the ``submit`` endpoint.
 :return: the run_id as an int

@@ -385,9 +385,9 @@ class DatabricksHook(BaseDatabricksHook):
 all_jobs += [j for j in jobs if j["settings"]["name"] == job_name]
 else:
 all_jobs += jobs
-
-
-
+# issue-59189: API v2.2 removes "has_more" field
+page_token = response.get("next_page_token", "")
+has_more = bool(page_token)

 return all_jobs

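The pagination change in the ``@@ -385,9 +385,9`` hunk is easier to read in context: Jobs API 2.2 no longer returns a ``has_more`` flag, so the loop has to treat an empty ``next_page_token`` as the last page. A minimal sketch of the surrounding loop, assuming the hook's ``_do_api_call`` helper and the payload shape shown below (both are assumptions, not the exact provider code):

```python
def list_all_jobs(hook, job_name=None):
    """Collect every job page; API 2.2 drops "has_more", so stop when next_page_token is empty."""
    all_jobs: list[dict] = []
    page_token = ""
    has_more = True
    while has_more:
        payload = {"limit": 25, "page_token": page_token}  # assumed payload shape
        response = hook._do_api_call(("GET", "2.2/jobs/list"), payload)  # assumed helper
        jobs = response.get("jobs", [])
        if job_name:
            all_jobs += [j for j in jobs if j["settings"]["name"] == job_name]
        else:
            all_jobs += jobs
        # issue-59189: API v2.2 removes the "has_more" field
        page_token = response.get("next_page_token", "")
        has_more = bool(page_token)
    return all_jobs
```
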
airflow/providers/databricks/hooks/databricks_base.py

@@ -49,7 +49,7 @@ from tenacity import (
 )

 from airflow import __version__
-from airflow.
+from airflow.providers.common.compat.sdk import AirflowException, AirflowOptionalProviderFeatureException
 from airflow.providers_manager import ProvidersManager

 try:

airflow/providers/databricks/hooks/databricks_sql.py

@@ -32,9 +32,8 @@ from typing import (

 from databricks import sql
 from databricks.sql.types import Row
-from sqlalchemy.engine import URL

-from airflow.
+from airflow.providers.common.compat.sdk import AirflowException, AirflowOptionalProviderFeatureException
 from airflow.providers.common.sql.hooks.handlers import return_single_query_results
 from airflow.providers.common.sql.hooks.sql import DbApiHook
 from airflow.providers.databricks.exceptions import DatabricksSqlExecutionError, DatabricksSqlExecutionTimeout

@@ -43,6 +42,7 @@ from airflow.providers.databricks.hooks.databricks_base import BaseDatabricksHoo

 if TYPE_CHECKING:
 from databricks.sql.client import Connection
+from sqlalchemy.engine import URL

 from airflow.models.connection import Connection as AirflowConnection
 from airflow.providers.openlineage.extractors import OperatorLineage

@@ -179,6 +179,14 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):

 :return: the extracted sqlalchemy.engine.URL object.
 """
+try:
+from sqlalchemy.engine import URL
+except ImportError:
+raise AirflowOptionalProviderFeatureException(
+"sqlalchemy is required to generate the connection URL. "
+"Install it with: pip install 'apache-airflow-providers-databricks[sqlalchemy]'"
+)
+
 url_query = {
 "http_path": self._http_path,
 "catalog": self.catalog,

airflow/providers/databricks/operators/databricks.py

@@ -26,9 +26,7 @@ from collections.abc import Sequence
 from functools import cached_property
 from typing import TYPE_CHECKING, Any

-from airflow.
-from airflow.exceptions import AirflowException
-from airflow.providers.common.compat.sdk import BaseOperator, BaseOperatorLink, XCom
+from airflow.providers.common.compat.sdk import AirflowException, BaseOperator, BaseOperatorLink, XCom, conf
 from airflow.providers.databricks.hooks.databricks import (
 DatabricksHook,
 RunLifeCycleState,

@@ -55,7 +53,7 @@ from airflow.providers.databricks.utils.mixins import DatabricksSQLStatementsMix
 from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS

 if TYPE_CHECKING:
-from airflow.
+from airflow.providers.common.compat.sdk import TaskInstanceKey
 from airflow.providers.databricks.operators.databricks_workflow import (
 DatabricksWorkflowTaskGroup,
 )

@@ -261,7 +259,7 @@ class DatabricksCreateJobsOperator(BaseOperator):
 https://docs.databricks.com/api/workspace/jobs/reset

 :param json: A JSON object containing API parameters which will be passed
-directly to the ``api/2.
+directly to the ``api/2.2/jobs/create`` endpoint. The other named parameters
 (i.e. ``name``, ``tags``, ``tasks``, etc.) to this operator will
 be merged with this json dictionary if they are provided.
 If there are conflicts during the merge, the named parameters will

@@ -392,7 +390,7 @@ class DatabricksCreateJobsOperator(BaseOperator):

 class DatabricksSubmitRunOperator(BaseOperator):
 """
-Submits a Spark job run to Databricks using the api/2.
+Submits a Spark job run to Databricks using the api/2.2/jobs/runs/submit API endpoint.

 See: https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit

@@ -407,7 +405,7 @@ class DatabricksSubmitRunOperator(BaseOperator):
 .. seealso::
 https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit
 :param json: A JSON object containing API parameters which will be passed
-directly to the ``api/2.
+directly to the ``api/2.2/jobs/runs/submit`` endpoint. The other named parameters
 (i.e. ``spark_jar_task``, ``notebook_task``..) to this operator will
 be merged with this json dictionary if they are provided.
 If there are conflicts during the merge, the named parameters will

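As the updated docstring describes, ``DatabricksSubmitRunOperator`` passes the ``json`` payload straight to ``api/2.2/jobs/runs/submit`` and merges any named parameters over it. A hedged usage sketch; the cluster spec, notebook path, and connection id below are placeholders, not values from this release:

```python
from airflow.providers.databricks.operators.databricks import DatabricksSubmitRunOperator

# The whole run specification can be handed over as a raw Jobs API payload via `json`;
# named operator parameters, if also given, win on conflict during the merge.
submit_run = DatabricksSubmitRunOperator(
    task_id="submit_run",
    databricks_conn_id="databricks_default",  # placeholder connection id
    json={
        "run_name": "example-submit-run",
        "new_cluster": {
            "spark_version": "15.4.x-scala2.12",  # placeholder cluster spec
            "node_type_id": "i3.xlarge",
            "num_workers": 2,
        },
        "notebook_task": {"notebook_path": "/Shared/prepare_data"},  # placeholder path
    },
)
```
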
@@ -645,14 +643,14 @@ class DatabricksSubmitRunOperator(BaseOperator):

 class DatabricksRunNowOperator(BaseOperator):
 """
-Runs an existing Spark job run to Databricks using the api/2.
+Runs an existing Spark job run to Databricks using the api/2.2/jobs/run-now API endpoint.

 See: https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunNow

 There are two ways to instantiate this operator.

 In the first way, you can take the JSON payload that you typically use
-to call the ``api/2.
+to call the ``api/2.2/jobs/run-now`` endpoint and pass it directly
 to our ``DatabricksRunNowOperator`` through the ``json`` parameter.
 For example ::

@@ -730,7 +728,7 @@ class DatabricksRunNowOperator(BaseOperator):
 https://docs.databricks.com/en/workflows/jobs/settings.html#add-parameters-for-all-job-tasks

 :param json: A JSON object containing API parameters which will be passed
-directly to the ``api/2.
+directly to the ``api/2.2/jobs/run-now`` endpoint. The other named parameters
 (i.e. ``notebook_params``, ``spark_submit_params``..) to this operator will
 be merged with this json dictionary if they are provided.
 If there are conflicts during the merge, the named parameters will

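``DatabricksRunNowOperator`` follows the same pattern against ``api/2.2/jobs/run-now``: hand over the raw payload via ``json`` or use the named parameters, which take precedence on conflict. A short sketch with a placeholder job id and parameters:

```python
from airflow.providers.databricks.operators.databricks import DatabricksRunNowOperator

run_now = DatabricksRunNowOperator(
    task_id="run_now",
    databricks_conn_id="databricks_default",  # placeholder connection id
    job_id=42,  # placeholder id of an existing Databricks job
    notebook_params={"dry-run": "true"},  # named parameters are merged over the `json` payload
)
```
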
airflow/providers/databricks/operators/databricks_repos.py

@@ -25,8 +25,7 @@ from functools import cached_property
 from typing import TYPE_CHECKING
 from urllib.parse import urlsplit

-from airflow.
-from airflow.providers.common.compat.sdk import BaseOperator
+from airflow.providers.common.compat.sdk import AirflowException, BaseOperator
 from airflow.providers.databricks.hooks.databricks import DatabricksHook

 if TYPE_CHECKING:

airflow/providers/databricks/operators/databricks_sql.py

@@ -21,14 +21,20 @@ from __future__ import annotations

 import csv
 import json
+import os
 from collections.abc import Sequence
 from functools import cached_property
+from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Any, ClassVar
+from urllib.parse import urlparse

 from databricks.sql.utils import ParamEscaper

-from airflow.
-
+from airflow.providers.common.compat.sdk import (
+AirflowException,
+AirflowOptionalProviderFeatureException,
+BaseOperator,
+)
 from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
 from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

@@ -63,13 +69,27 @@ class DatabricksSqlOperator(SQLExecuteQueryOperator):
 :param catalog: An optional initial catalog to use. Requires DBR version 9.0+ (templated)
 :param schema: An optional initial schema to use. Requires DBR version 9.0+ (templated)
 :param output_path: optional string specifying the file to which write selected data. (templated)
-
-
+Supports local file paths and GCS URIs (e.g., ``gs://bucket/path/file.parquet``).
+When using GCS URIs, requires the ``apache-airflow-providers-google`` package.
+:param output_format: format of output data if ``output_path`` is specified.
+Possible values are ``csv``, ``json``, ``jsonl``, ``parquet``, ``avro``. Default is ``csv``.
 :param csv_params: parameters that will be passed to the ``csv.DictWriter`` class used to write CSV data.
+:param gcp_conn_id: The connection ID to use for connecting to Google Cloud when using GCS output path.
+Default is ``google_cloud_default``.
+:param gcs_impersonation_chain: Optional service account to impersonate using short-term
+credentials for GCS upload, or chained list of accounts required to get the access_token
+of the last account in the list, which will be impersonated in the request. (templated)
 """

 template_fields: Sequence[str] = tuple(
-{
+{
+"_output_path",
+"schema",
+"catalog",
+"http_headers",
+"databricks_conn_id",
+"_gcs_impersonation_chain",
+}
 | set(SQLExecuteQueryOperator.template_fields)
 )

@@ -91,6 +111,8 @@ class DatabricksSqlOperator(SQLExecuteQueryOperator):
 output_format: str = "csv",
 csv_params: dict[str, Any] | None = None,
 client_parameters: dict[str, Any] | None = None,
+gcp_conn_id: str = "google_cloud_default",
+gcs_impersonation_chain: str | Sequence[str] | None = None,
 **kwargs,
 ) -> None:
 super().__init__(conn_id=databricks_conn_id, **kwargs)

@@ -106,6 +128,8 @@ class DatabricksSqlOperator(SQLExecuteQueryOperator):
 self.http_headers = http_headers
 self.catalog = catalog
 self.schema = schema
+self._gcp_conn_id = gcp_conn_id
+self._gcs_impersonation_chain = gcs_impersonation_chain

 @cached_property
 def _hook(self) -> DatabricksSqlHook:

@@ -128,41 +152,151 @@ class DatabricksSqlOperator(SQLExecuteQueryOperator):
 def _should_run_output_processing(self) -> bool:
 return self.do_xcom_push or bool(self._output_path)

+@property
+def _is_gcs_output(self) -> bool:
+"""Check if the output path is a GCS URI."""
+return self._output_path.startswith("gs://") if self._output_path else False
+
+def _parse_gcs_path(self, path: str) -> tuple[str, str]:
+"""Parse a GCS URI into bucket and object name."""
+parsed = urlparse(path)
+bucket = parsed.netloc
+object_name = parsed.path.lstrip("/")
+return bucket, object_name
+
+def _upload_to_gcs(self, local_path: str, gcs_path: str) -> None:
+"""Upload a local file to GCS."""
+try:
+from airflow.providers.google.cloud.hooks.gcs import GCSHook
+except ImportError:
+raise AirflowOptionalProviderFeatureException(
+"The 'apache-airflow-providers-google' package is required for GCS output. "
+"Install it with: pip install apache-airflow-providers-google"
+)
+
+bucket, object_name = self._parse_gcs_path(gcs_path)
+hook = GCSHook(
+gcp_conn_id=self._gcp_conn_id,
+impersonation_chain=self._gcs_impersonation_chain,
+)
+hook.upload(
+bucket_name=bucket,
+object_name=object_name,
+filename=local_path,
+)
+self.log.info("Uploaded output to %s", gcs_path)
+
+def _write_parquet(self, file_path: str, field_names: list[str], rows: list[Any]) -> None:
+"""Write data to a Parquet file."""
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+data: dict[str, list] = {name: [] for name in field_names}
+for row in rows:
+row_dict = row._asdict()
+for name in field_names:
+data[name].append(row_dict[name])
+
+table = pa.Table.from_pydict(data)
+pq.write_table(table, file_path)
+
+def _write_avro(self, file_path: str, field_names: list[str], rows: list[Any]) -> None:
+"""Write data to an Avro file using fastavro."""
+try:
+from fastavro import writer
+except ImportError:
+raise AirflowOptionalProviderFeatureException(
+"The 'fastavro' package is required for Avro output. Install it with: pip install fastavro"
+)
+
+data: dict[str, list] = {name: [] for name in field_names}
+for row in rows:
+row_dict = row._asdict()
+for name in field_names:
+data[name].append(row_dict[name])
+
+schema_fields = []
+for name in field_names:
+sample_val = next(
+(data[name][i] for i in range(len(data[name])) if data[name][i] is not None), None
+)
+if sample_val is None:
+avro_type = ["null", "string"]
+elif isinstance(sample_val, bool):
+avro_type = ["null", "boolean"]
+elif isinstance(sample_val, int):
+avro_type = ["null", "long"]
+elif isinstance(sample_val, float):
+avro_type = ["null", "double"]
+else:
+avro_type = ["null", "string"]
+schema_fields.append({"name": name, "type": avro_type})
+
+avro_schema = {
+"type": "record",
+"name": "QueryResult",
+"fields": schema_fields,
+}
+
+records = [row._asdict() for row in rows]
+with open(file_path, "wb") as f:
+writer(f, avro_schema, records)
+
 def _process_output(self, results: list[Any], descriptions: list[Sequence[Sequence] | None]) -> list[Any]:
 if not self._output_path:
 return list(zip(descriptions, results))
 if not self._output_format:
 raise AirflowException("Output format should be specified!")
-
+
 last_description = descriptions[-1]
 last_results = results[-1]
 if last_description is None:
-raise AirflowException("There is missing description present for the output file.
+raise AirflowException("There is missing description present for the output file.")
 field_names = [field[0] for field in last_description]
-
-
-
-
-
-
-write_header = csv_params.get("header", True)
-if "header" in csv_params:
-del csv_params["header"]
-writer = csv.DictWriter(file, fieldnames=field_names, **csv_params)
-if write_header:
-writer.writeheader()
-for row in last_results:
-writer.writerow(row._asdict())
-elif self._output_format.lower() == "json":
-with open(self._output_path, "w") as file:
-file.write(json.dumps([row._asdict() for row in last_results]))
-elif self._output_format.lower() == "jsonl":
-with open(self._output_path, "w") as file:
-for row in last_results:
-file.write(json.dumps(row._asdict()))
-file.write("\n")
+
+if self._is_gcs_output:
+suffix = f".{self._output_format.lower()}"
+tmp_file = NamedTemporaryFile(mode="w", suffix=suffix, delete=False, newline="")
+local_path = tmp_file.name
+tmp_file.close()
 else:
-
+local_path = self._output_path
+
+try:
+output_format = self._output_format.lower()
+if output_format == "csv":
+with open(local_path, "w", newline="") as file:
+if self._csv_params:
+csv_params = self._csv_params.copy()
+else:
+csv_params = {}
+write_header = csv_params.pop("header", True)
+writer = csv.DictWriter(file, fieldnames=field_names, **csv_params)
+if write_header:
+writer.writeheader()
+for row in last_results:
+writer.writerow(row._asdict())
+elif output_format == "json":
+with open(local_path, "w") as file:
+file.write(json.dumps([row._asdict() for row in last_results]))
+elif output_format == "jsonl":
+with open(local_path, "w") as file:
+for row in last_results:
+file.write(json.dumps(row._asdict()))
+file.write("\n")
+elif output_format == "parquet":
+self._write_parquet(local_path, field_names, last_results)
+elif output_format == "avro":
+self._write_avro(local_path, field_names, last_results)
+else:
+raise ValueError(f"Unsupported output format: '{self._output_format}'")
+
+if self._is_gcs_output:
+self._upload_to_gcs(local_path, self._output_path)
+finally:
+if self._is_gcs_output and os.path.exists(local_path):
+os.unlink(local_path)
+
 return list(zip(descriptions, results))

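Putting the new output options of ``DatabricksSqlOperator`` together: results can now be written as ``parquet`` or ``avro`` (the latter needs the new ``avro`` extra), and a ``gs://`` output path is uploaded through ``GCSHook`` from the ``google`` extra. A hedged usage sketch; the bucket, query, and connection ids are placeholders:

```python
from airflow.providers.databricks.operators.databricks_sql import DatabricksSqlOperator

export_orders = DatabricksSqlOperator(
    task_id="export_orders",
    databricks_conn_id="databricks_default",  # placeholder connection id
    sql="SELECT * FROM samples.tpch.orders LIMIT 1000",  # placeholder query
    output_path="gs://example-bucket/exports/orders.parquet",  # placeholder bucket/object
    output_format="parquet",
    gcp_conn_id="google_cloud_default",
    # gcs_impersonation_chain="uploader@example-project.iam.gserviceaccount.com",  # optional
)
```
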
airflow/providers/databricks/operators/databricks_workflow.py

@@ -25,8 +25,7 @@ from typing import TYPE_CHECKING, Any

 from mergedeep import merge

-from airflow.
-from airflow.providers.common.compat.sdk import BaseOperator, TaskGroup
+from airflow.providers.common.compat.sdk import AirflowException, BaseOperator, TaskGroup
 from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunLifeCycleState
 from airflow.providers.databricks.plugins.databricks_workflow import (
 WorkflowJobRepairAllFailedLink,

airflow/providers/databricks/plugins/databricks_workflow.py

@@ -20,11 +20,17 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
 from urllib.parse import unquote

-from airflow.exceptions import
+from airflow.exceptions import TaskInstanceNotFound
 from airflow.models.dagrun import DagRun
 from airflow.models.taskinstance import TaskInstance, TaskInstanceKey, clear_task_instances
-from airflow.
-
+from airflow.providers.common.compat.sdk import (
+AirflowException,
+AirflowOptionalProviderFeatureException,
+AirflowPlugin,
+BaseOperatorLink,
+TaskGroup,
+XCom,
+)
 from airflow.providers.databricks.hooks.databricks import DatabricksHook
 from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS
 from airflow.utils.log.logging_mixin import LoggingMixin

@@ -68,6 +74,10 @@ if not AIRFLOW_V_3_0_PLUS:
 from flask_appbuilder import BaseView
 from flask_appbuilder.api import expose

+try:
+from sqlalchemy import select
+except ImportError:
+select = None # type: ignore[assignment,misc]
 from airflow.utils.session import NEW_SESSION, provide_session
 from airflow.www import auth

@@ -140,10 +150,17 @@ if not AIRFLOW_V_3_0_PLUS:
 :param session: The SQLAlchemy session to use for the query. If None, uses the default session.
 :return: The DagRun object associated with the specified DAG and run_id.
 """
+if select is None:
+raise AirflowOptionalProviderFeatureException(
+"sqlalchemy is required for workflow repair functionality. "
+"Install it with: pip install 'apache-airflow-providers-databricks[sqlalchemy]'"
+)
 if not session:
 raise AirflowException("Session not provided.")

-return session.
+return session.scalars(
+select(DagRun).where(DagRun.dag_id == dag.dag_id, DagRun.run_id == run_id)
+).one()

 @provide_session
 def _clear_task_instances(

@@ -157,20 +174,23 @@ if not AIRFLOW_V_3_0_PLUS:

 @provide_session
 def get_task_instance(operator: BaseOperator, dttm, session: Session = NEW_SESSION) -> TaskInstance:
+if select is None:
+raise AirflowOptionalProviderFeatureException(
+"sqlalchemy is required to get task instance. "
+"Install it with: pip install 'apache-airflow-providers-databricks[sqlalchemy]'"
+)
 dag_id = operator.dag.dag_id
 if hasattr(DagRun, "execution_date"): # Airflow 2.x.
 dag_run = DagRun.find(dag_id, execution_date=dttm)[0] # type: ignore[call-arg]
 else:
 dag_run = DagRun.find(dag_id, logical_date=dttm)[0]
-ti = (
-
-.filter(
+ti = session.scalars(
+select(TaskInstance).where(
 TaskInstance.dag_id == dag_id,
 TaskInstance.run_id == dag_run.run_id,
 TaskInstance.task_id == operator.task_id,
 )
-
-)
+).one_or_none()
 if not ti:
 raise TaskInstanceNotFound("Task instance not found")
 return ti

@@ -278,7 +298,7 @@ class WorkflowJobRunLink(BaseOperatorLink, LoggingMixin):
 """XCom key where the link is stored during task execution."""
 return "databricks_job_run_link"

-def get_link(
+def get_link( # type: ignore[override] # Signature intentionally kept this way for Airflow 2.x compatibility
 self,
 operator: BaseOperator,
 dttm=None,

@@ -354,7 +374,7 @@ class WorkflowJobRepairAllFailedLink(BaseOperatorLink, LoggingMixin):

 name = "Repair All Failed Tasks"

-def get_link(
+def get_link( # type: ignore[override] # Signature intentionally kept this way for Airflow 2.x compatibility
 self,
 operator,
 dttm=None,

@@ -451,7 +471,7 @@ class WorkflowJobRepairSingleTaskLink(BaseOperatorLink, LoggingMixin):

 name = "Repair a single task"

-def get_link(
+def get_link( # type: ignore[override] # Signature intentionally kept this way for Airflow 2.x compatibility
 self,
 operator,
 dttm=None,

airflow/providers/databricks/sensors/databricks.py

@@ -22,9 +22,7 @@ from collections.abc import Sequence
 from functools import cached_property
 from typing import TYPE_CHECKING, Any

-from airflow.
-from airflow.exceptions import AirflowException
-from airflow.providers.common.compat.sdk import BaseSensorOperator
+from airflow.providers.common.compat.sdk import AirflowException, BaseSensorOperator, conf
 from airflow.providers.databricks.hooks.databricks import DatabricksHook, SQLStatementState
 from airflow.providers.databricks.operators.databricks import DEFER_METHOD_NAME
 from airflow.providers.databricks.utils.mixins import DatabricksSQLStatementsMixin

airflow/providers/databricks/sensors/databricks_partition.py

@@ -27,8 +27,7 @@ from typing import TYPE_CHECKING, Any

 from databricks.sql.utils import ParamEscaper

-from airflow.
-from airflow.providers.common.compat.sdk import BaseSensorOperator
+from airflow.providers.common.compat.sdk import AirflowException, BaseSensorOperator
 from airflow.providers.common.sql.hooks.handlers import fetch_all_handler
 from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

airflow/providers/databricks/sensors/databricks_sql.py

@@ -24,8 +24,7 @@ from collections.abc import Callable, Iterable, Sequence
 from functools import cached_property
 from typing import TYPE_CHECKING, Any

-from airflow.
-from airflow.providers.common.compat.sdk import BaseSensorOperator
+from airflow.providers.common.compat.sdk import AirflowException, BaseSensorOperator
 from airflow.providers.common.sql.hooks.handlers import fetch_all_handler
 from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

airflow/providers/databricks/utils/databricks.py

@@ -17,7 +17,7 @@
 # under the License.
 from __future__ import annotations

-from airflow.
+from airflow.providers.common.compat.sdk import AirflowException
 from airflow.providers.databricks.hooks.databricks import DatabricksHook, RunState


airflow/providers/databricks/utils/mixins.py

@@ -20,18 +20,14 @@ from __future__ import annotations

 import time
 from logging import Logger
-from typing import
-TYPE_CHECKING,
-Any,
-Protocol,
-)
+from typing import TYPE_CHECKING, Any, Protocol

-from airflow.
+from airflow.providers.common.compat.sdk import AirflowException
 from airflow.providers.databricks.hooks.databricks import DatabricksHook, SQLStatementState
 from airflow.providers.databricks.triggers.databricks import DatabricksSQLStatementExecutionTrigger

 if TYPE_CHECKING:
-from airflow.
+from airflow.sdk import Context


 class GetHookHasFields(Protocol):

airflow/providers/databricks/utils/openlineage.py

@@ -24,7 +24,6 @@ from typing import TYPE_CHECKING, Any
 import requests

 from airflow.providers.common.compat.openlineage.check import require_openlineage_version
-from airflow.providers.databricks.version_compat import AIRFLOW_V_3_0_PLUS
 from airflow.utils import timezone

 if TYPE_CHECKING:

@@ -37,60 +36,6 @@ if TYPE_CHECKING:
 log = logging.getLogger(__name__)


-def _get_logical_date(task_instance):
-# todo: remove when min airflow version >= 3.0
-if AIRFLOW_V_3_0_PLUS:
-dagrun = task_instance.get_template_context()["dag_run"]
-return dagrun.logical_date or dagrun.run_after
-
-if hasattr(task_instance, "logical_date"):
-date = task_instance.logical_date
-else:
-date = task_instance.execution_date
-
-return date
-
-
-def _get_dag_run_clear_number(task_instance):
-# todo: remove when min airflow version >= 3.0
-if AIRFLOW_V_3_0_PLUS:
-dagrun = task_instance.get_template_context()["dag_run"]
-return dagrun.clear_number
-return task_instance.dag_run.clear_number
-
-
-# todo: move this run_id logic into OpenLineage's listener to avoid differences
-def _get_ol_run_id(task_instance) -> str:
-"""
-Get OpenLineage run_id from TaskInstance.
-
-It's crucial that the task_instance's run_id creation logic matches OpenLineage's listener implementation.
-Only then can we ensure that the generated run_id aligns with the Airflow task,
-enabling a proper connection between events.
-"""
-from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter
-
-# Generate same OL run id as is generated for current task instance
-return OpenLineageAdapter.build_task_instance_run_id(
-dag_id=task_instance.dag_id,
-task_id=task_instance.task_id,
-logical_date=_get_logical_date(task_instance),
-try_number=task_instance.try_number,
-map_index=task_instance.map_index,
-)
-
-
-# todo: move this run_id logic into OpenLineage's listener to avoid differences
-def _get_ol_dag_run_id(task_instance) -> str:
-from airflow.providers.openlineage.plugins.adapter import OpenLineageAdapter
-
-return OpenLineageAdapter.build_dag_run_id(
-dag_id=task_instance.dag_id,
-logical_date=_get_logical_date(task_instance),
-clear_number=_get_dag_run_clear_number(task_instance),
-)
-
-
 def _get_parent_run_facet(task_instance):
 """
 Retrieve the ParentRunFacet associated with a specific Airflow task instance.

@@ -101,22 +46,39 @@ def _get_parent_run_facet(task_instance):
 """
 from openlineage.client.facet_v2 import parent_run

-from airflow.providers.openlineage.
+from airflow.providers.openlineage.plugins.macros import (
+lineage_job_name,
+lineage_job_namespace,
+lineage_root_job_name,
+lineage_root_run_id,
+lineage_run_id,
+)
+
+parent_run_id = lineage_run_id(task_instance)
+parent_job_name = lineage_job_name(task_instance)
+parent_job_namespace = lineage_job_namespace()
+
+root_parent_run_id = lineage_root_run_id(task_instance)
+rot_parent_job_name = lineage_root_job_name(task_instance)
+
+try: # Added in OL provider 2.9.0, try to use it if possible
+from airflow.providers.openlineage.plugins.macros import lineage_root_job_namespace

-
-
+root_parent_job_namespace = lineage_root_job_namespace(task_instance)
+except ImportError:
+root_parent_job_namespace = lineage_job_namespace()

 return parent_run.ParentRunFacet(
 run=parent_run.Run(runId=parent_run_id),
 job=parent_run.Job(
-namespace=
-name=
+namespace=parent_job_namespace,
+name=parent_job_name,
 ),
 root=parent_run.Root(
 run=parent_run.RootRun(runId=root_parent_run_id),
 job=parent_run.RootJob(
-name=
-namespace=
+name=rot_parent_job_name,
+namespace=root_parent_job_namespace,
 ),
 ),
 )

@@ -209,7 +171,7 @@ def _create_ol_event_pair(
 return start, end


-@require_openlineage_version(provider_min_version="2.
+@require_openlineage_version(provider_min_version="2.5.0")
 def emit_openlineage_events_for_databricks_queries(
 task_instance,
 hook: DatabricksSqlHook | DatabricksHook | None = None,

{apache_airflow_providers_databricks-7.8.0rc1.dist-info → apache_airflow_providers_databricks-7.9.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: apache-airflow-providers-databricks
-Version: 7.
+Version: 7.9.0
 Summary: Provider package apache-airflow-providers-databricks for Apache Airflow
 Keywords: airflow-provider,databricks,airflow,integration
 Author-email: Apache Software Foundation <dev@airflow.apache.org>

@@ -22,34 +22,40 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: System :: Monitoring
 License-File: LICENSE
 License-File: NOTICE
-Requires-Dist: apache-airflow>=2.11.
-Requires-Dist: apache-airflow-providers-common-compat>=1.
-Requires-Dist: apache-airflow-providers-common-sql>=1.27.
+Requires-Dist: apache-airflow>=2.11.0
+Requires-Dist: apache-airflow-providers-common-compat>=1.13.0
+Requires-Dist: apache-airflow-providers-common-sql>=1.27.0
 Requires-Dist: requests>=2.32.0,<3
 Requires-Dist: databricks-sql-connector>=4.0.0
-Requires-Dist: databricks-sqlalchemy>=1.0.2
 Requires-Dist: aiohttp>=3.9.2, <4
 Requires-Dist: mergedeep>=1.3.4
 Requires-Dist: pandas>=2.1.2; python_version <"3.13"
 Requires-Dist: pandas>=2.2.3; python_version >="3.13"
 Requires-Dist: pyarrow>=16.1.0; python_version < '3.13'
 Requires-Dist: pyarrow>=18.0.0; python_version >= '3.13'
+Requires-Dist: fastavro>=1.9.0 ; extra == "avro"
+Requires-Dist: fastavro>=1.10.0 ; extra == "avro" and (python_version>="3.12")
 Requires-Dist: azure-identity>=1.3.1 ; extra == "azure-identity"
-Requires-Dist: apache-airflow-providers-fab>=2.2.
-Requires-Dist: apache-airflow-providers-
+Requires-Dist: apache-airflow-providers-fab>=2.2.0 ; extra == "fab" and ( python_version < '3.13')
+Requires-Dist: apache-airflow-providers-google>=10.24.0 ; extra == "google"
+Requires-Dist: apache-airflow-providers-openlineage>=2.3.0 ; extra == "openlineage"
 Requires-Dist: databricks-sdk==0.10.0 ; extra == "sdk"
+Requires-Dist: databricks-sqlalchemy>=1.0.2 ; extra == "sqlalchemy"
 Requires-Dist: apache-airflow-providers-standard ; extra == "standard"
 Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
-Project-URL: Changelog, https://airflow.
-Project-URL: Documentation, https://airflow.
+Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.9.0/changelog.html
+Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.9.0
 Project-URL: Mastodon, https://fosstodon.org/@airflow
 Project-URL: Slack Chat, https://s.apache.org/airflow-slack
 Project-URL: Source Code, https://github.com/apache/airflow
 Project-URL: YouTube, https://www.youtube.com/channel/UCSXwxpWZQ7XZ1WL3wqevChA/
+Provides-Extra: avro
 Provides-Extra: azure-identity
 Provides-Extra: fab
+Provides-Extra: google
 Provides-Extra: openlineage
 Provides-Extra: sdk
+Provides-Extra: sqlalchemy
 Provides-Extra: standard


@@ -77,7 +83,7 @@ Provides-Extra: standard

 Package ``apache-airflow-providers-databricks``

-Release: ``7.
+Release: ``7.9.0``


 `Databricks <https://databricks.com/>`__

@@ -90,7 +96,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
 are in ``airflow.providers.databricks`` python package.

 You can find package information and changelog for the provider
-in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.
+in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.9.0/>`_.

 Installation
 ------------

@@ -108,11 +114,10 @@ Requirements
 PIP package Version required
 ========================================== ======================================
 ``apache-airflow`` ``>=2.11.0``
-``apache-airflow-providers-common-compat`` ``>=1.
+``apache-airflow-providers-common-compat`` ``>=1.13.0``
 ``apache-airflow-providers-common-sql`` ``>=1.27.0``
 ``requests`` ``>=2.32.0,<3``
 ``databricks-sql-connector`` ``>=4.0.0``
-``databricks-sqlalchemy`` ``>=1.0.2``
 ``aiohttp`` ``>=3.9.2,<4``
 ``mergedeep`` ``>=1.3.4``
 ``pandas`` ``>=2.1.2; python_version < "3.13"``

@@ -139,6 +144,7 @@ Dependent package
 ================================================================================================================== =================
 `apache-airflow-providers-common-compat <https://airflow.apache.org/docs/apache-airflow-providers-common-compat>`_ ``common.compat``
 `apache-airflow-providers-common-sql <https://airflow.apache.org/docs/apache-airflow-providers-common-sql>`_ ``common.sql``
+`apache-airflow-providers-google <https://airflow.apache.org/docs/apache-airflow-providers-google>`_ ``google``
 `apache-airflow-providers-openlineage <https://airflow.apache.org/docs/apache-airflow-providers-openlineage>`_ ``openlineage``
 ================================================================================================================== =================


@@ -153,8 +159,11 @@ Extra Dependencies
 ``fab`` ``apache-airflow-providers-fab>=2.2.0; python_version < '3.13'``
 ``standard`` ``apache-airflow-providers-standard``
 ``openlineage`` ``apache-airflow-providers-openlineage>=2.3.0``
+``sqlalchemy`` ``databricks-sqlalchemy>=1.0.2``
+``google`` ``apache-airflow-providers-google>=10.24.0``
+``avro`` ``fastavro>=1.9.0``, ``fastavro>=1.10.0;python_version>="3.12"``
 ================== ================================================================

 The changelog for the provider package can be found in the
-`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.
+`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/7.9.0/changelog.html>`_.

apache_airflow_providers_databricks-7.9.0.dist-info/RECORD

@@ -0,0 +1,31 @@
+airflow/providers/databricks/__init__.py,sha256=nAdKvPEVae_IY8zBScAW_De79Ob4OC-dGDALAno1HA0,1499
+airflow/providers/databricks/exceptions.py,sha256=v7TD8auFp9LmyWqRtnXYG8mOit0WE3OuInUNFoC0zTo,1278
+airflow/providers/databricks/get_provider_info.py,sha256=LfK0AwIARVh4tX5146-J2VRZwfe6GP3xjLyltA7X7iU,5738
+airflow/providers/databricks/version_compat.py,sha256=RQbdCueLOaFZWekpQmF0BoAoJInW8EoyvJ3Ah-HbrPo,1577
+airflow/providers/databricks/hooks/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
+airflow/providers/databricks/hooks/databricks.py,sha256=eYvrc9H3-gpGZRXBGms_DyjeFjxg-JB1lYKmdr2bwcE,29789
+airflow/providers/databricks/hooks/databricks_base.py,sha256=ud9Mxzi86tAaGunlx0vypLR6ICapdn2qyFlT3WFjZjQ,36881
+airflow/providers/databricks/hooks/databricks_sql.py,sha256=4LSTSYxHPJolmB91eOP_LuShyAUcjWATx6-ywUx8ASc,18149
+airflow/providers/databricks/operators/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
+airflow/providers/databricks/operators/databricks.py,sha256=NqcMOAlC_OvkrBFUaRFQa37P36Shja-plECZzg04Gl8,79258
+airflow/providers/databricks/operators/databricks_repos.py,sha256=jOrYO_tFQJ5JBXeu7Rhrc3pcQJ4qtzSGSjGZ4GffmwU,13125
+airflow/providers/databricks/operators/databricks_sql.py,sha256=9hXLFSUtdVlg45lwBTIZgY33is5-Kkgp00Cz22sI-yg,27076
+airflow/providers/databricks/operators/databricks_workflow.py,sha256=xqk6kbFcqArHo4w9E0sVGbAkX2tuBqWdtvwiFyc9jzo,14989
+airflow/providers/databricks/plugins/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
+airflow/providers/databricks/plugins/databricks_workflow.py,sha256=Tg4fgrMQ31NqtcjPK6D61ehSqp-Jtf3_OS4db7BDSCo,21019
+airflow/providers/databricks/sensors/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
+airflow/providers/databricks/sensors/databricks.py,sha256=dtVcb-Ka9R3l8y_59hdz65be3JUIVEsAodTsviwx1Mg,6199
+airflow/providers/databricks/sensors/databricks_partition.py,sha256=AV7GoAIRnV7NEtbqUxp9WdSeN-LeIc49I3_NaI1cBiY,9910
+airflow/providers/databricks/sensors/databricks_sql.py,sha256=ON3ulhD0I4ukJhKzDYTqw-8ZkdUuED_8QyDZbzFgHko,5603
+airflow/providers/databricks/triggers/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
+airflow/providers/databricks/triggers/databricks.py,sha256=DQbXLw1W_e3Iw-hsDph7vPuHc2caj623V7WmA2_PftM,8672
+airflow/providers/databricks/utils/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
+airflow/providers/databricks/utils/databricks.py,sha256=bnZdjQ1etvAcfgdmb8BR4i1M4YjdcDXxxznVtmur1GM,5134
+airflow/providers/databricks/utils/mixins.py,sha256=XDA9v9BeCgMIznYPpa-X7XIqrD1mJbw4eSQUjvTsQXI,7397
+airflow/providers/databricks/utils/openlineage.py,sha256=naqLzbdBebwDUPvDhhIa5Ey_8SgKkYqdwhzJC_51gFU,13674
+apache_airflow_providers_databricks-7.9.0.dist-info/entry_points.txt,sha256=hjmZm3ab2cteTR4t9eE28oKixHwNIKtLCThd6sx3XRQ,227
+apache_airflow_providers_databricks-7.9.0.dist-info/licenses/LICENSE,sha256=gXPVwptPlW1TJ4HSuG5OMPg-a3h43OGMkZRR1rpwfJA,10850
+apache_airflow_providers_databricks-7.9.0.dist-info/licenses/NOTICE,sha256=_cWHznIoUSbLCY_KfmKqetlKlsoH0c2VBjmZjElAzuc,168
+apache_airflow_providers_databricks-7.9.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+apache_airflow_providers_databricks-7.9.0.dist-info/METADATA,sha256=bbY2URbAFkKKJ5xXgaSB6vIhmfS27RefxG4X0DesqOc,8325
+apache_airflow_providers_databricks-7.9.0.dist-info/RECORD,,

apache_airflow_providers_databricks-7.8.0rc1.dist-info/RECORD

@@ -1,31 +0,0 @@
-airflow/providers/databricks/__init__.py,sha256=jK9hWZ8jptf_y_7T6PywlGiLdB4zq1EsGECyxMGFi8A,1499
-airflow/providers/databricks/exceptions.py,sha256=85RklmLOI_PnTzfXNIUd5fAu2aMMUhelwumQAX0wANE,1261
-airflow/providers/databricks/get_provider_info.py,sha256=LfK0AwIARVh4tX5146-J2VRZwfe6GP3xjLyltA7X7iU,5738
-airflow/providers/databricks/version_compat.py,sha256=RQbdCueLOaFZWekpQmF0BoAoJInW8EoyvJ3Ah-HbrPo,1577
-airflow/providers/databricks/hooks/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
-airflow/providers/databricks/hooks/databricks.py,sha256=uOBPUUAEc9eHBdvMgNyJzWivIFCt_GQgR4UlRrRxqgM,29754
-airflow/providers/databricks/hooks/databricks_base.py,sha256=m-m2AKqD3-6mEfvuwgo7Era47zGzsjKbpLTRQNjiUS4,36864
-airflow/providers/databricks/hooks/databricks_sql.py,sha256=xougOWuFgQzhBzFcuYkbX-lo0FpKCQztXoBETJEzesg,17755
-airflow/providers/databricks/operators/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
-airflow/providers/databricks/operators/databricks.py,sha256=bVhFNTeGboHkmaJkYDYEyF0V1QUOB_RnsvwaCuEtIew,79316
-airflow/providers/databricks/operators/databricks_repos.py,sha256=VRZye45ZMlDxti6ZJjuouox5umiMoeQ-BKugPpE7jnM,13155
-airflow/providers/databricks/operators/databricks_sql.py,sha256=gwpkr660qpk4dUve98RB-hniaMzuXL6znQZZGilJxi0,21842
-airflow/providers/databricks/operators/databricks_workflow.py,sha256=QLsR0pGLWvvQbutsjj4RWwBE-z6tkWiYLHj6waMv8ZE,15019
-airflow/providers/databricks/plugins/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
-airflow/providers/databricks/plugins/databricks_workflow.py,sha256=5vyG2WNM25ptSv5IwAndUTqKAOmTneOWy_pAtqBKcgc,20020
-airflow/providers/databricks/sensors/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
-airflow/providers/databricks/sensors/databricks.py,sha256=RrjSzncvppdp5U8RYHd975MCIQIb_s1VQoxm9Aqbvac,6262
-airflow/providers/databricks/sensors/databricks_partition.py,sha256=qPDy8oxg-Lo-jnHy1EbxmA5GIjC6t0XnFJ1E3aAmUgg,9940
-airflow/providers/databricks/sensors/databricks_sql.py,sha256=shq7ng4LCiaD4Q7lorm4g1A7aijmq3nVUnCFlYtoI7c,5633
-airflow/providers/databricks/triggers/__init__.py,sha256=mlJxuZLkd5x-iq2SBwD3mvRQpt3YR7wjz_nceyF1IaI,787
-airflow/providers/databricks/triggers/databricks.py,sha256=DQbXLw1W_e3Iw-hsDph7vPuHc2caj623V7WmA2_PftM,8672
-airflow/providers/databricks/utils/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
-airflow/providers/databricks/utils/databricks.py,sha256=ecvzZbC4KdXds47VeSayot9EO-RQnTRJTEwKITH7waQ,5117
-airflow/providers/databricks/utils/mixins.py,sha256=WUmkt3AmXalmV6zOUIJZWbTldxYunAZOstddDhKCC94,7407
-airflow/providers/databricks/utils/openlineage.py,sha256=1jT5Woh9YifawdP-VFWsabfF-ecuCjPlzD5P_W4DAhI,15078
-apache_airflow_providers_databricks-7.8.0rc1.dist-info/entry_points.txt,sha256=hjmZm3ab2cteTR4t9eE28oKixHwNIKtLCThd6sx3XRQ,227
-apache_airflow_providers_databricks-7.8.0rc1.dist-info/licenses/LICENSE,sha256=gXPVwptPlW1TJ4HSuG5OMPg-a3h43OGMkZRR1rpwfJA,10850
-apache_airflow_providers_databricks-7.8.0rc1.dist-info/licenses/NOTICE,sha256=E3-_E02gwwSEFzeeWPKmnIjOoos3hW28CLISV6sYrbQ,168
-apache_airflow_providers_databricks-7.8.0rc1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
-apache_airflow_providers_databricks-7.8.0rc1.dist-info/METADATA,sha256=Dts1HmORUW1KUFBXvtAotQuZvHRDbj0iMadzjbWFyNg,7782
-apache_airflow_providers_databricks-7.8.0rc1.dist-info/RECORD,,