apache-airflow-providers-databricks 6.2.0.tar.gz → 6.3.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apache-airflow-providers-databricks might be problematic.
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/PKG-INFO +8 -7
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/README.rst +4 -4
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/__init__.py +1 -1
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/get_provider_info.py +2 -1
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/databricks.py +14 -5
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/databricks_base.py +1 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/databricks_sql.py +2 -4
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/operators/databricks.py +84 -57
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/operators/databricks_repos.py +1 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/operators/databricks_sql.py +11 -10
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/triggers/databricks.py +12 -8
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/pyproject.toml +4 -3
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/LICENSE +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/__init__.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/operators/__init__.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/sensors/__init__.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/sensors/databricks_partition.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/sensors/databricks_sql.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/triggers/__init__.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/utils/__init__.py +0 -0
- {apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/utils/databricks.py +0 -0
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apache-airflow-providers-databricks
-Version: 6.2.0
+Version: 6.3.0
 Summary: Provider package apache-airflow-providers-databricks for Apache Airflow
 Keywords: airflow-provider,databricks,airflow,integration
 Author-email: Apache Software Foundation <dev@airflow.apache.org>
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: System :: Monitoring
 Requires-Dist: aiohttp>=3.9.2, <4
 Requires-Dist: apache-airflow-providers-common-sql>=1.10.0
@@ -28,8 +29,8 @@ Requires-Dist: requests>=2.27.0,<3
 Requires-Dist: apache-airflow-providers-common-sql ; extra == "common.sql"
 Requires-Dist: databricks-sdk==0.10.0 ; extra == "sdk"
 Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
-Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0/changelog.html
-Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0
+Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html
+Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0
 Project-URL: Slack Chat, https://s.apache.org/airflow-slack
 Project-URL: Source Code, https://github.com/apache/airflow
 Project-URL: Twitter, https://twitter.com/ApacheAirflow
@@ -81,7 +82,7 @@ Provides-Extra: sdk
 
 Package ``apache-airflow-providers-databricks``
 
-Release: ``6.2.0``
+Release: ``6.3.0``
 
 
 `Databricks <https://databricks.com/>`__
@@ -94,7 +95,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
 are in ``airflow.providers.databricks`` python package.
 
 You can find package information and changelog for the provider
-in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0/>`_.
+in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/>`_.
 
 Installation
 ------------
@@ -103,7 +104,7 @@ You can install this package on top of an existing Airflow 2 installation (see `
 for the minimum Airflow version supported) via
 ``pip install apache-airflow-providers-databricks``
 
-The package supports the following python versions: 3.8,3.9,3.10,3.11
+The package supports the following python versions: 3.8,3.9,3.10,3.11,3.12
 
 Requirements
 ------------
@@ -138,4 +139,4 @@ Dependent package
 ============================================================================================================ ==============
 
 The changelog for the provider package can be found in the
-`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0/changelog.html>`_.
+`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html>`_.
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/README.rst
RENAMED
@@ -42,7 +42,7 @@
 
 Package ``apache-airflow-providers-databricks``
 
-Release: ``6.2.0``
+Release: ``6.3.0``
 
 
 `Databricks <https://databricks.com/>`__
@@ -55,7 +55,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
 are in ``airflow.providers.databricks`` python package.
 
 You can find package information and changelog for the provider
-in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0/>`_.
+in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/>`_.
 
 Installation
 ------------
@@ -64,7 +64,7 @@ You can install this package on top of an existing Airflow 2 installation (see `
 for the minimum Airflow version supported) via
 ``pip install apache-airflow-providers-databricks``
 
-The package supports the following python versions: 3.8,3.9,3.10,3.11
+The package supports the following python versions: 3.8,3.9,3.10,3.11,3.12
 
 Requirements
 ------------
@@ -99,4 +99,4 @@ Dependent package
 ============================================================================================================ ==============
 
 The changelog for the provider package can be found in the
-`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0/changelog.html>`_.
+`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html>`_.
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/get_provider_info.py
RENAMED
@@ -28,8 +28,9 @@ def get_provider_info():
         "name": "Databricks",
         "description": "`Databricks <https://databricks.com/>`__\n",
         "state": "ready",
-        "source-date-epoch":
+        "source-date-epoch": 1712665557,
         "versions": [
+            "6.3.0",
             "6.2.0",
             "6.1.0",
             "6.0.0",
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/databricks.py
RENAMED
@@ -25,6 +25,7 @@ operators talk to the
 or the ``api/2.1/jobs/runs/submit``
 `endpoint <https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit>`_.
 """
+
 from __future__ import annotations
 
 import json
@@ -196,8 +197,7 @@ class DatabricksHook(BaseDatabricksHook):
         super().__init__(databricks_conn_id, timeout_seconds, retry_limit, retry_delay, retry_args, caller)
 
     def create_job(self, json: dict) -> int:
-        """
-        Utility function to call the ``api/2.1/jobs/create`` endpoint.
+        """Call the ``api/2.1/jobs/create`` endpoint.
 
         :param json: The data used in the body of the request to the ``create`` endpoint.
         :return: the job_id as an int
@@ -206,8 +206,7 @@ class DatabricksHook(BaseDatabricksHook):
         return response["job_id"]
 
     def reset_job(self, job_id: str, json: dict) -> None:
-        """
-        Utility function to call the ``api/2.1/jobs/reset`` endpoint.
+        """Call the ``api/2.1/jobs/reset`` endpoint.
 
         :param json: The data used in the new_settings of the request to the ``reset`` endpoint.
         """
@@ -530,7 +529,7 @@ class DatabricksHook(BaseDatabricksHook):
 
     def get_latest_repair_id(self, run_id: int) -> int | None:
         """Get latest repair id if any exist for run_id else None."""
-        json = {"run_id": run_id, "include_history":
+        json = {"run_id": run_id, "include_history": "true"}
         response = self._do_api_call(GET_RUN_ENDPOINT, json)
         repair_history = response["repair_history"]
         if len(repair_history) == 1:
@@ -656,6 +655,16 @@ class DatabricksHook(BaseDatabricksHook):
 
         return None
 
+    def update_job_permission(self, job_id: int, json: dict[str, Any]) -> dict:
+        """
+        Update databricks job permission.
+
+        :param job_id: job id
+        :param json: payload
+        :return: json containing permission specification
+        """
+        return self._do_api_call(("PATCH", f"api/2.0/permissions/jobs/{job_id}"), json)
+
     def test_connection(self) -> tuple[bool, str]:
         """Test the Databricks connectivity from UI."""
         hook = DatabricksHook(databricks_conn_id=self.databricks_conn_id)
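The new ``update_job_permission`` method is a thin wrapper around the Databricks permissions API. A minimal sketch of calling it directly from the hook, assuming an existing Databricks connection; the connection id, job id and ACL entries below are illustrative, not taken from the diff:

from airflow.providers.databricks.hooks.databricks import DatabricksHook

hook = DatabricksHook(databricks_conn_id="databricks_default")  # illustrative connection id
acl = {
    "access_control_list": [
        # illustrative entry; any Databricks job permission level could be used here
        {"user_name": "analyst@example.com", "permission_level": "CAN_MANAGE"},
    ]
}
# Sends PATCH api/2.0/permissions/jobs/<job_id> with the payload and returns the response (new in 6.3.0).
permissions = hook.update_job_permission(job_id=1234, json=acl)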
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/databricks_base.py
RENAMED
@@ -22,6 +22,7 @@ This hook enable the submitting and running of jobs to the Databricks platform.
 operators talk to the ``api/2.0/jobs/runs/submit``
 `endpoint <https://docs.databricks.com/api/latest/jobs.html#runs-submit>`_.
 """
+
 from __future__ import annotations
 
 import copy
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/hooks/databricks_sql.py
RENAMED
@@ -183,8 +183,7 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):
         handler: None = ...,
         split_statements: bool = ...,
         return_last: bool = ...,
-    ) -> None:
-        ...
+    ) -> None: ...
 
     @overload
     def run(
@@ -195,8 +194,7 @@ class DatabricksSqlHook(BaseDatabricksHook, DbApiHook):
         handler: Callable[[Any], T] = ...,
         split_statements: bool = ...,
         return_last: bool = ...,
-    ) -> tuple | list[tuple] | list[list[tuple] | tuple] | None:
-        ...
+    ) -> tuple | list[tuple] | list[list[tuple] | tuple] | None: ...
 
     def run(
         self,
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/operators/databricks.py
RENAMED
@@ -16,6 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """This module contains Databricks operators."""
+
 from __future__ import annotations
 
 import time
@@ -51,6 +52,7 @@ def _handle_databricks_operator_execution(operator, hook, log, context) -> None:
     """
     if operator.do_xcom_push and context is not None:
         context["ti"].xcom_push(key=XCOM_RUN_ID_KEY, value=operator.run_id)
+
     log.info("Run submitted with run_id: %s", operator.run_id)
     run_page_url = hook.get_run_page_url(operator.run_id)
     if operator.do_xcom_push and context is not None:
@@ -65,52 +67,52 @@ def _handle_databricks_operator_execution(operator, hook, log, context) -> None:
                     log.info("%s completed successfully.", operator.task_id)
                     log.info("View run status, Spark UI, and logs at %s", run_page_url)
                     return
- … (11 removed lines not preserved in this view)
-                            else:
-                                notebook_error = run_state.state_message
+
+                if run_state.result_state == "FAILED":
+                    task_run_id = None
+                    if "tasks" in run_info:
+                        for task in run_info["tasks"]:
+                            if task.get("state", {}).get("result_state", "") == "FAILED":
+                                task_run_id = task["run_id"]
+                    if task_run_id is not None:
+                        run_output = hook.get_run_output(task_run_id)
+                        if "error" in run_output:
+                            notebook_error = run_output["error"]
                         else:
                             notebook_error = run_state.state_message
-                            error_message = (
-                                f"{operator.task_id} failed with terminal state: {run_state} "
-                                f"and with the error {notebook_error}"
-                            )
                     else:
- … (26 removed lines not preserved in this view)
+                        notebook_error = run_state.state_message
+                    error_message = (
+                        f"{operator.task_id} failed with terminal state: {run_state} "
+                        f"and with the error {notebook_error}"
+                    )
+                else:
+                    error_message = (
+                        f"{operator.task_id} failed with terminal state: {run_state} "
+                        f"and with the error {run_state.state_message}"
+                    )
+
+                if isinstance(operator, DatabricksRunNowOperator) and operator.repair_run:
+                    operator.repair_run = False
+                    log.warning(
+                        "%s but since repair run is set, repairing the run with all failed tasks",
+                        error_message,
+                    )
+
+                    latest_repair_id = hook.get_latest_repair_id(operator.run_id)
+                    repair_json = {"run_id": operator.run_id, "rerun_all_failed_tasks": True}
+                    if latest_repair_id is not None:
+                        repair_json["latest_repair_id"] = latest_repair_id
+                    operator.json["latest_repair_id"] = hook.repair_run(operator, repair_json)
+                    _handle_databricks_operator_execution(operator, hook, log, context)
+                raise AirflowException(error_message)
+
+            log.info("%s in run state: %s", operator.task_id, run_state)
+            log.info("View run status, Spark UI, and logs at %s", run_page_url)
+            log.info("Sleeping for %s seconds.", operator.polling_period_seconds)
+            time.sleep(operator.polling_period_seconds)
+
+    log.info("View run status, Spark UI, and logs at %s", run_page_url)
 
 
 def _handle_deferrable_databricks_operator_execution(operator, hook, log, context) -> None:
@@ -145,6 +147,7 @@ def _handle_deferrable_databricks_operator_execution(operator, hook, log, contex
                     retry_delay=operator.databricks_retry_delay,
                     retry_args=operator.databricks_retry_args,
                     run_page_url=run_page_url,
+                    repair_run=getattr(operator, "repair_run", False),
                 ),
                 method_name=DEFER_METHOD_NAME,
             )
@@ -162,9 +165,15 @@ _handle_deferrable_databricks_operator_completion(event: dict, log: Logger)
     if run_state.is_successful:
         log.info("Job run completed successfully.")
         return
- … (3 removed lines not preserved in this view)
+
+    error_message = f"Job run failed with terminal state: {run_state}"
+    if event["repair_run"]:
+        log.warning(
+            "%s but since repair run is set, repairing the run with all failed tasks",
+            error_message,
+        )
+        return
+    raise AirflowException(error_message)
 
 
 class DatabricksJobRunLink(BaseOperatorLink):
@@ -257,7 +266,7 @@ class DatabricksCreateJobsOperator(BaseOperator):
         databricks_retry_args: dict[Any, Any] | None = None,
         **kwargs,
     ) -> None:
-        """
+        """Create a new ``DatabricksCreateJobsOperator``."""
         super().__init__(**kwargs)
         self.json = json or {}
         self.databricks_conn_id = databricks_conn_id
@@ -287,8 +296,8 @@ class DatabricksCreateJobsOperator(BaseOperator):
             self.json["git_source"] = git_source
         if access_control_list is not None:
             self.json["access_control_list"] = access_control_list
- … (2 removed lines not preserved in this view)
+        if self.json:
+            self.json = normalise_json_content(self.json)
 
     @cached_property
     def _hook(self):
@@ -307,6 +316,10 @@ class DatabricksCreateJobsOperator(BaseOperator):
         if job_id is None:
             return self._hook.create_job(self.json)
         self._hook.reset_job(str(job_id), self.json)
+        if (access_control_list := self.json.get("access_control_list")) is not None:
+            acl_json = {"access_control_list": access_control_list}
+            self._hook.update_job_permission(job_id, normalise_json_content(acl_json))
+
         return job_id
 
 
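With this change, ``DatabricksCreateJobsOperator`` applies ``access_control_list`` not only at job creation but also when it resets an existing job, via the new ``update_job_permission`` hook call. A minimal usage sketch; the DAG id, job name, task and cluster specs, and ACL entry are illustrative, not taken from the diff:

from datetime import datetime

from airflow import DAG
from airflow.providers.databricks.operators.databricks import DatabricksCreateJobsOperator

with DAG(dag_id="example_databricks_jobs", start_date=datetime(2024, 1, 1), schedule=None):
    create_or_reset = DatabricksCreateJobsOperator(
        task_id="create_or_reset_job",
        name="nightly-etl",  # illustrative job name
        tasks=[
            {
                "task_key": "ingest",
                "notebook_task": {"notebook_path": "/Shared/ingest"},  # illustrative path
                "new_cluster": {"spark_version": "13.3.x-scala2.12", "num_workers": 1},
            }
        ],
        # As of 6.3.0 this list is also PATCHed onto an existing job after reset_job().
        access_control_list=[
            {"group_name": "data-team", "permission_level": "CAN_MANAGE_RUN"},  # illustrative entry
        ],
    )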
@@ -583,9 +596,6 @@ class DatabricksSubmitRunDeferrableOperator(DatabricksSubmitRunOperator):
         self.run_id = hook.submit_run(json_normalised)
         _handle_deferrable_databricks_operator_execution(self, hook, self.log, context)
 
-    def execute_complete(self, context: dict | None, event: dict):
-        _handle_deferrable_databricks_operator_completion(event, self.log)
-
 
 class DatabricksRunNowOperator(BaseOperator):
     """
@@ -645,6 +655,7 @@ class DatabricksRunNowOperator(BaseOperator):
         - ``spark_submit_params``
         - ``idempotency_token``
         - ``repair_run``
+        - ``cancel_previous_runs``
 
     :param job_id: the job_id of the existing Databricks job.
         This field will be templated.
@@ -733,7 +744,8 @@ class DatabricksRunNowOperator(BaseOperator):
     :param do_xcom_push: Whether we should push run_id and run_page_url to xcom.
     :param wait_for_termination: if we should wait for termination of the job run. ``True`` by default.
     :param deferrable: Run operator in the deferrable mode.
-    :param repair_run: Repair the databricks run in case of failure
+    :param repair_run: Repair the databricks run in case of failure.
+    :param cancel_previous_runs: Cancel all existing running jobs before submitting new one.
     """
 
     # Used in airflow.models.BaseOperator
@@ -765,6 +777,7 @@ class DatabricksRunNowOperator(BaseOperator):
         wait_for_termination: bool = True,
         deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
         repair_run: bool = False,
+        cancel_previous_runs: bool = False,
         **kwargs,
     ) -> None:
         """Create a new ``DatabricksRunNowOperator``."""
@@ -778,6 +791,7 @@ class DatabricksRunNowOperator(BaseOperator):
         self.wait_for_termination = wait_for_termination
         self.deferrable = deferrable
         self.repair_run = repair_run
+        self.cancel_previous_runs = cancel_previous_runs
 
         if job_id is not None:
             self.json["job_id"] = job_id
@@ -797,8 +811,8 @@ class DatabricksRunNowOperator(BaseOperator):
             self.json["spark_submit_params"] = spark_submit_params
         if idempotency_token is not None:
             self.json["idempotency_token"] = idempotency_token
- … (2 removed lines not preserved in this view)
+        if self.json:
+            self.json = normalise_json_content(self.json)
         # This variable will be used in case our task gets killed.
         self.run_id: int | None = None
         self.do_xcom_push = do_xcom_push
@@ -824,6 +838,10 @@ class DatabricksRunNowOperator(BaseOperator):
                 raise AirflowException(f"Job ID for job name {self.json['job_name']} can not be found")
             self.json["job_id"] = job_id
             del self.json["job_name"]
+
+        if self.cancel_previous_runs and self.json["job_id"] is not None:
+            hook.cancel_all_runs(self.json["job_id"])
+
         self.run_id = hook.run_now(self.json)
         if self.deferrable:
             _handle_deferrable_databricks_operator_execution(self, hook, self.log, context)
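``DatabricksRunNowOperator`` gains ``cancel_previous_runs`` (cancel all active runs of the target job via ``cancel_all_runs`` before calling ``run_now``) alongside the existing ``repair_run``. A minimal usage sketch; the DAG id, job id and notebook params are illustrative:

from datetime import datetime

from airflow import DAG
from airflow.providers.databricks.operators.databricks import DatabricksRunNowOperator

with DAG(dag_id="example_databricks_run_now", start_date=datetime(2024, 1, 1), schedule=None):
    run_job = DatabricksRunNowOperator(
        task_id="run_nightly_job",
        job_id=1234,  # illustrative job id
        notebook_params={"run_date": "{{ ds }}"},
        repair_run=True,            # on failure, repair the run with all failed tasks
        cancel_previous_runs=True,  # new in 6.3.0: cancel active runs of the job before run_now
    )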
@@ -833,8 +851,17 @@
     def execute_complete(self, context: Context, event: dict[str, Any] | None = None) -> None:
         if event:
             _handle_deferrable_databricks_operator_completion(event, self.log)
- … (2 removed lines not preserved in this view)
+            if event["repair_run"]:
+                self.repair_run = False
+                self.run_id = event["run_id"]
+                latest_repair_id = self._hook.get_latest_repair_id(self.run_id)
+                repair_json = {"run_id": self.run_id, "rerun_all_failed_tasks": True}
+                if latest_repair_id is not None:
+                    repair_json["latest_repair_id"] = latest_repair_id
+                self.json["latest_repair_id"] = self._hook.repair_run(repair_json)
+                _handle_deferrable_databricks_operator_execution(self, self._hook, self.log, context)
+
+    def on_kill(self) -> None:
         if self.run_id:
             self._hook.cancel_run(self.run_id)
             self.log.info(
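In deferrable mode the same repair logic now runs in ``execute_complete()`` when the trigger reports a failed terminal state with ``repair_run`` set. A minimal sketch of the deferrable variant; the DAG id and job id are illustrative:

from datetime import datetime

from airflow import DAG
from airflow.providers.databricks.operators.databricks import DatabricksRunNowOperator

with DAG(dag_id="example_databricks_run_now_deferrable", start_date=datetime(2024, 1, 1), schedule=None):
    run_job_deferred = DatabricksRunNowOperator(
        task_id="run_job_deferred",
        job_id=1234,      # illustrative job id
        deferrable=True,  # hand the polling off to the triggerer
        repair_run=True,  # execute_complete() repairs failed tasks when the event asks for it
    )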
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/operators/databricks_sql.py
RENAMED
@@ -16,6 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """This module contains Databricks operators."""
+
 from __future__ import annotations
 
 import csv
@@ -207,9 +208,9 @@ class DatabricksCopyIntoOperator(BaseOperator):
     """
 
     template_fields: Sequence[str] = (
-        "_file_location",
-        "_files",
-        "_table_name",
+        "file_location",
+        "files",
+        "table_name",
         "databricks_conn_id",
     )
 
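``file_location``, ``files`` and ``table_name`` are now public attributes and templated fields of ``DatabricksCopyIntoOperator``, so Jinja expressions can be passed for them. A minimal sketch; the warehouse name, table, location and connection id are illustrative:

from airflow.providers.databricks.operators.databricks_sql import DatabricksCopyIntoOperator

copy_into = DatabricksCopyIntoOperator(
    task_id="load_events",
    databricks_conn_id="databricks_default",  # illustrative connection id
    sql_endpoint_name="analytics-warehouse",  # illustrative SQL warehouse name
    table_name="analytics.events",            # templated as of 6.3.0
    file_location="s3://example-bucket/events/{{ ds }}/",  # templated as of 6.3.0
    file_format="JSON",
)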
@@ -249,17 +250,17 @@ class DatabricksCopyIntoOperator(BaseOperator):
             raise AirflowException("file_location shouldn't be empty")
         if file_format not in COPY_INTO_APPROVED_FORMATS:
             raise AirflowException(f"file_format '{file_format}' isn't supported")
-        self._files = files
+        self.files = files
         self._pattern = pattern
         self._file_format = file_format
         self.databricks_conn_id = databricks_conn_id
         self._http_path = http_path
         self._sql_endpoint_name = sql_endpoint_name
         self.session_config = session_configuration
-        self._table_name = table_name
+        self.table_name = table_name
         self._catalog = catalog
         self._schema = schema
-        self._file_location = file_location
+        self.file_location = file_location
         self._expression_list = expression_list
         self._credential = credential
         self._storage_credential = storage_credential
@@ -313,14 +314,14 @@ class DatabricksCopyIntoOperator(BaseOperator):
         if self._credential is not None:
             maybe_credential = self._generate_options("CREDENTIAL", escaper, self._credential, False)
             maybe_with = f" WITH ({maybe_credential} {maybe_encryption})"
-        location = escaper.escape_item(self._file_location) + maybe_with
+        location = escaper.escape_item(self.file_location) + maybe_with
         if self._expression_list is not None:
             location = f"(SELECT {self._expression_list} FROM {location})"
         files_or_pattern = ""
         if self._pattern is not None:
             files_or_pattern = f"PATTERN = {escaper.escape_item(self._pattern)}\n"
-        elif self._files is not None:
-            files_or_pattern = f"FILES = {escaper.escape_item(self._files)}\n"
+        elif self.files is not None:
+            files_or_pattern = f"FILES = {escaper.escape_item(self.files)}\n"
         format_options = self._generate_options("FORMAT_OPTIONS", escaper, self._format_options) + "\n"
         copy_options = self._generate_options("COPY_OPTIONS", escaper, self._copy_options) + "\n"
         storage_cred = ""
@@ -340,7 +341,7 @@ class DatabricksCopyIntoOperator(BaseOperator):
         else:
             raise AirflowException(f"Incorrect data type for validate parameter: {type(self._validate)}")
         # TODO: think on how to make sure that table_name and expression_list aren't used for SQL injection
-        sql = f"""COPY INTO {self._table_name}{storage_cred}
+        sql = f"""COPY INTO {self.table_name}{storage_cred}
 FROM {location}
 FILEFORMAT = {self._file_format}
 {validation}{files_or_pattern}{format_options}{copy_options}
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/airflow/providers/databricks/triggers/databricks.py
RENAMED
@@ -47,6 +47,7 @@ class DatabricksExecutionTrigger(BaseTrigger):
         retry_delay: int = 10,
         retry_args: dict[Any, Any] | None = None,
         run_page_url: str | None = None,
+        repair_run: bool = False,
     ) -> None:
         super().__init__()
         self.run_id = run_id
@@ -56,6 +57,7 @@ class DatabricksExecutionTrigger(BaseTrigger):
         self.retry_delay = retry_delay
         self.retry_args = retry_args
         self.run_page_url = run_page_url
+        self.repair_run = repair_run
         self.hook = DatabricksHook(
             databricks_conn_id,
             retry_limit=self.retry_limit,
@@ -74,6 +76,7 @@ class DatabricksExecutionTrigger(BaseTrigger):
                 "retry_delay": self.retry_delay,
                 "retry_args": self.retry_args,
                 "run_page_url": self.run_page_url,
+                "repair_run": self.repair_run,
             },
         )
 
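The trigger now carries a ``repair_run`` flag through ``serialize()`` and the ``TriggerEvent`` payload, so the operator can decide to repair the run when the deferred event comes back. A minimal sketch of the round trip; the run id, page URL and connection id are illustrative:

from airflow.providers.databricks.triggers.databricks import DatabricksExecutionTrigger

trigger = DatabricksExecutionTrigger(
    run_id=42,                                # illustrative run id
    databricks_conn_id="databricks_default",  # illustrative connection id
    run_page_url="https://example.cloud.databricks.com/#job/1/run/42",  # illustrative URL
    repair_run=True,
)
classpath, kwargs = trigger.serialize()
# repair_run is part of the serialized kwargs and of the event payload,
# so execute_complete() can read event["repair_run"] after the deferral.
assert kwargs["repair_run"] is True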
@@ -87,14 +90,15 @@ class DatabricksExecutionTrigger(BaseTrigger):
                             "run_id": self.run_id,
                             "run_page_url": self.run_page_url,
                             "run_state": run_state.to_json(),
+                            "repair_run": self.repair_run,
                         }
                     )
                     return
- … (8 removed lines not preserved in this view)
+
+                self.log.info(
+                    "run-id %s in run state %s. sleeping for %s seconds",
+                    self.run_id,
+                    run_state,
+                    self.polling_period_seconds,
+                )
+                await asyncio.sleep(self.polling_period_seconds)
{apache_airflow_providers_databricks-6.2.0 → apache_airflow_providers_databricks-6.3.0}/pyproject.toml
RENAMED
@@ -28,7 +28,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "apache-airflow-providers-databricks"
-version = "6.2.0"
+version = "6.3.0"
 description = "Provider package apache-airflow-providers-databricks for Apache Airflow"
 readme = "README.rst"
 authors = [
@@ -51,6 +51,7 @@ classifiers = [
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Topic :: System :: Monitoring",
 ]
 requires-python = "~=3.8"
@@ -63,8 +64,8 @@ dependencies = [
 ]
 
 [project.urls]
-"Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0"
-"Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.2.0/changelog.html"
+"Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0"
+"Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html"
 "Bug Tracker" = "https://github.com/apache/airflow/issues"
 "Source Code" = "https://github.com/apache/airflow"
 "Slack Chat" = "https://s.apache.org/airflow-slack"
The remaining files listed above with +0 -0 (LICENSE, the __init__.py files, the sensors and the utils modules) are unchanged between 6.2.0 and 6.3.0.