apache-airflow-providers-databricks 6.3.0rc3__tar.gz → 6.4.0__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (21)
  1. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/PKG-INFO +9 -9
  2. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/README.rst +4 -4
  3. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/__init__.py +3 -3
  4. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/get_provider_info.py +4 -2
  5. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/hooks/databricks.py +11 -0
  6. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/operators/databricks.py +188 -5
  7. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/triggers/databricks.py +30 -15
  8. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/utils/databricks.py +1 -1
  9. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/pyproject.toml +5 -5
  10. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/LICENSE +0 -0
  11. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/hooks/__init__.py +0 -0
  12. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/hooks/databricks_base.py +0 -0
  13. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/hooks/databricks_sql.py +0 -0
  14. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/operators/__init__.py +0 -0
  15. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/operators/databricks_repos.py +0 -0
  16. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/operators/databricks_sql.py +0 -0
  17. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/sensors/__init__.py +0 -0
  18. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/sensors/databricks_partition.py +0 -0
  19. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/sensors/databricks_sql.py +0 -0
  20. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/triggers/__init__.py +0 -0
  21. {apache_airflow_providers_databricks-6.3.0rc3 → apache_airflow_providers_databricks-6.4.0}/airflow/providers/databricks/utils/__init__.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: apache-airflow-providers-databricks
-Version: 6.3.0rc3
+Version: 6.4.0
 Summary: Provider package apache-airflow-providers-databricks for Apache Airflow
 Keywords: airflow-provider,databricks,airflow,integration
 Author-email: Apache Software Foundation <dev@airflow.apache.org>
@@ -22,15 +22,15 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: System :: Monitoring
 Requires-Dist: aiohttp>=3.9.2, <4
-Requires-Dist: apache-airflow-providers-common-sql>=1.10.0rc0
-Requires-Dist: apache-airflow>=2.6.0rc0
+Requires-Dist: apache-airflow-providers-common-sql>=1.10.0
+Requires-Dist: apache-airflow>=2.7.0
 Requires-Dist: databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0
 Requires-Dist: requests>=2.27.0,<3
 Requires-Dist: apache-airflow-providers-common-sql ; extra == "common.sql"
 Requires-Dist: databricks-sdk==0.10.0 ; extra == "sdk"
 Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
-Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html
-Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0
+Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0/changelog.html
+Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0
 Project-URL: Slack Chat, https://s.apache.org/airflow-slack
 Project-URL: Source Code, https://github.com/apache/airflow
 Project-URL: Twitter, https://twitter.com/ApacheAirflow
@@ -82,7 +82,7 @@ Provides-Extra: sdk
 
 Package ``apache-airflow-providers-databricks``
 
-Release: ``6.3.0.rc3``
+Release: ``6.4.0``
 
 
 `Databricks <https://databricks.com/>`__
@@ -95,7 +95,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
 are in ``airflow.providers.databricks`` python package.
 
 You can find package information and changelog for the provider
-in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/>`_.
+in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0/>`_.
 
 Installation
 ------------
@@ -112,7 +112,7 @@ Requirements
 ======================================= ==========================
 PIP package                              Version required
 ======================================= ==========================
-``apache-airflow``                       ``>=2.6.0``
+``apache-airflow``                       ``>=2.7.0``
 ``apache-airflow-providers-common-sql``  ``>=1.10.0``
 ``requests``                             ``>=2.27.0,<3``
 ``databricks-sql-connector``             ``>=2.0.0,!=2.9.0,<3.0.0``
@@ -139,4 +139,4 @@ Dependent package
 ============================================================================================================ ==============
 
 The changelog for the provider package can be found in the
-`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html>`_.
+`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0/changelog.html>`_.
README.rst
@@ -42,7 +42,7 @@
 
 Package ``apache-airflow-providers-databricks``
 
-Release: ``6.3.0.rc3``
+Release: ``6.4.0``
 
 
 `Databricks <https://databricks.com/>`__
@@ -55,7 +55,7 @@ This is a provider package for ``databricks`` provider. All classes for this pro
 are in ``airflow.providers.databricks`` python package.
 
 You can find package information and changelog for the provider
-in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/>`_.
+in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0/>`_.
 
 Installation
 ------------
@@ -72,7 +72,7 @@ Requirements
 ======================================= ==========================
 PIP package                              Version required
 ======================================= ==========================
-``apache-airflow``                       ``>=2.6.0``
+``apache-airflow``                       ``>=2.7.0``
 ``apache-airflow-providers-common-sql``  ``>=1.10.0``
 ``requests``                             ``>=2.27.0,<3``
 ``databricks-sql-connector``             ``>=2.0.0,!=2.9.0,<3.0.0``
@@ -99,4 +99,4 @@ Dependent package
 ============================================================================================================ ==============
 
 The changelog for the provider package can be found in the
-`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html>`_.
+`changelog <https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0/changelog.html>`_.
airflow/providers/databricks/__init__.py
@@ -27,7 +27,7 @@ import packaging.version
 
 __all__ = ["__version__"]
 
-__version__ = "6.3.0"
+__version__ = "6.4.0"
 
 try:
     from airflow import __version__ as airflow_version
@@ -35,8 +35,8 @@ except ImportError:
     from airflow.version import version as airflow_version
 
 if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
-    "2.6.0"
+    "2.7.0"
 ):
     raise RuntimeError(
-        f"The package `apache-airflow-providers-databricks:{__version__}` needs Apache Airflow 2.6.0+"
+        f"The package `apache-airflow-providers-databricks:{__version__}` needs Apache Airflow 2.7.0+"
     )
airflow/providers/databricks/get_provider_info.py
@@ -28,8 +28,9 @@ def get_provider_info():
         "name": "Databricks",
         "description": "`Databricks <https://databricks.com/>`__\n",
         "state": "ready",
-        "source-date-epoch": 1712665557,
+        "source-date-epoch": 1714476154,
         "versions": [
+            "6.4.0",
             "6.3.0",
             "6.2.0",
             "6.1.0",
@@ -67,7 +68,7 @@ def get_provider_info():
             "1.0.0",
         ],
         "dependencies": [
-            "apache-airflow>=2.6.0",
+            "apache-airflow>=2.7.0",
             "apache-airflow-providers-common-sql>=1.10.0",
             "requests>=2.27.0,<3",
             "databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0",
@@ -87,6 +88,7 @@ def get_provider_info():
                 "external-doc-url": "https://databricks.com/",
                 "how-to-guide": [
                     "/docs/apache-airflow-providers-databricks/operators/jobs_create.rst",
+                    "/docs/apache-airflow-providers-databricks/operators/notebook.rst",
                     "/docs/apache-airflow-providers-databricks/operators/submit_run.rst",
                     "/docs/apache-airflow-providers-databricks/operators/run_now.rst",
                 ],
airflow/providers/databricks/hooks/databricks.py
@@ -491,6 +491,17 @@ class DatabricksHook(BaseDatabricksHook):
         run_output = self._do_api_call(OUTPUT_RUNS_JOB_ENDPOINT, json)
         return run_output
 
+    async def a_get_run_output(self, run_id: int) -> dict:
+        """
+        Async version of `get_run_output()`.
+
+        :param run_id: id of the run
+        :return: output of the run
+        """
+        json = {"run_id": run_id}
+        run_output = await self._a_do_api_call(OUTPUT_RUNS_JOB_ENDPOINT, json)
+        return run_output
+
     def cancel_run(self, run_id: int) -> None:
         """
         Cancel the run.
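
For orientation, the new ``a_get_run_output()`` mirrors the synchronous ``get_run_output()`` and is meant to be awaited inside the hook's async context manager (the same ``async with hook:`` pattern the trigger below uses). A minimal sketch, assuming the default connection id and a placeholder run id:

    import asyncio

    from airflow.providers.databricks.hooks.databricks import DatabricksHook

    async def fetch_run_output(run_id: int) -> dict:
        hook = DatabricksHook(databricks_conn_id="databricks_default")  # assumed connection id
        async with hook:  # opens the aiohttp session used by the async API calls
            return await hook.a_get_run_output(run_id)

    # output = asyncio.run(fetch_run_output(12345))  # 12345 is a placeholder run id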
airflow/providers/databricks/operators/databricks.py
@@ -70,10 +70,9 @@ def _handle_databricks_operator_execution(operator, hook, log, context) -> None:
 
                 if run_state.result_state == "FAILED":
                     task_run_id = None
-                    if "tasks" in run_info:
-                        for task in run_info["tasks"]:
-                            if task.get("state", {}).get("result_state", "") == "FAILED":
-                                task_run_id = task["run_id"]
+                    for task in run_info.get("tasks", []):
+                        if task.get("state", {}).get("result_state", "") == "FAILED":
+                            task_run_id = task["run_id"]
                     if task_run_id is not None:
                         run_output = hook.get_run_output(task_run_id)
                         if "error" in run_output:
@@ -160,13 +159,15 @@ def _handle_deferrable_databricks_operator_completion(event: dict, log: Logger)
     validate_trigger_event(event)
     run_state = RunState.from_json(event["run_state"])
     run_page_url = event["run_page_url"]
+    errors = event["errors"]
     log.info("View run status, Spark UI, and logs at %s", run_page_url)
 
     if run_state.is_successful:
         log.info("Job run completed successfully.")
         return
 
-    error_message = f"Job run failed with terminal state: {run_state}"
+    error_message = f"Job run failed with terminal state: {run_state} and with the errors {errors}"
+
     if event["repair_run"]:
         log.warning(
             "%s but since repair run is set, repairing the run with all failed tasks",
@@ -207,6 +208,7 @@ class DatabricksCreateJobsOperator(BaseOperator):
     .. seealso::
         For more information about templating see :ref:`concepts:jinja-templating`.
     :param name: An optional name for the job.
+    :param description: An optional description for the job.
     :param tags: A map of tags associated with the job.
     :param tasks: A list of task specifications to be executed by this job.
         Array of objects (JobTaskSettings).
@@ -214,6 +216,7 @@ class DatabricksCreateJobsOperator(BaseOperator):
         tasks of this job. Array of objects (JobCluster).
     :param email_notifications: Object (JobEmailNotifications).
     :param webhook_notifications: Object (WebhookNotifications).
+    :param notification_settings: Optional notification settings.
     :param timeout_seconds: An optional timeout applied to each run of this job.
     :param schedule: Object (CronSchedule).
     :param max_concurrent_runs: An optional maximum allowed number of concurrent runs of the job.
@@ -249,11 +252,13 @@ class DatabricksCreateJobsOperator(BaseOperator):
         *,
         json: Any | None = None,
         name: str | None = None,
+        description: str | None = None,
         tags: dict[str, str] | None = None,
         tasks: list[dict] | None = None,
         job_clusters: list[dict] | None = None,
         email_notifications: dict | None = None,
         webhook_notifications: dict | None = None,
+        notification_settings: dict | None = None,
         timeout_seconds: int | None = None,
         schedule: dict | None = None,
         max_concurrent_runs: int | None = None,
@@ -276,6 +281,8 @@ class DatabricksCreateJobsOperator(BaseOperator):
         self.databricks_retry_args = databricks_retry_args
         if name is not None:
             self.json["name"] = name
+        if description is not None:
+            self.json["description"] = description
         if tags is not None:
             self.json["tags"] = tags
         if tasks is not None:
@@ -286,6 +293,8 @@ class DatabricksCreateJobsOperator(BaseOperator):
             self.json["email_notifications"] = email_notifications
         if webhook_notifications is not None:
             self.json["webhook_notifications"] = webhook_notifications
+        if notification_settings is not None:
+            self.json["notification_settings"] = notification_settings
         if timeout_seconds is not None:
             self.json["timeout_seconds"] = timeout_seconds
         if schedule is not None:
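
Both new constructor arguments are passed straight through into the job JSON, as the hunk above shows. A hypothetical usage sketch (the name, description, settings, and task spec are illustrative; ``notification_settings`` keys follow the Databricks Jobs API):

    from airflow.providers.databricks.operators.databricks import DatabricksCreateJobsOperator

    create_job = DatabricksCreateJobsOperator(
        task_id="create_job",
        name="nightly-etl",  # illustrative job name
        description="Nightly ETL job managed from Airflow",  # new in 6.4.0
        notification_settings={"no_alert_for_skipped_runs": True},  # new in 6.4.0
        tasks=[{"task_key": "ingest", "notebook_task": {"notebook_path": "/Shared/ingest"}}],  # illustrative
    )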
airflow/providers/databricks/operators/databricks.py
@@ -884,3 +893,177 @@ class DatabricksRunNowDeferrableOperator(DatabricksRunNowOperator):
 
     def __init__(self, *args, **kwargs):
         super().__init__(deferrable=True, *args, **kwargs)
+
+
+class DatabricksNotebookOperator(BaseOperator):
+    """
+    Runs a notebook on Databricks using an Airflow operator.
+
+    The DatabricksNotebookOperator allows users to launch and monitor notebook
+    job runs on Databricks as Airflow tasks.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:DatabricksNotebookOperator`
+
+    :param notebook_path: The path to the notebook in Databricks.
+    :param source: Optional location type of the notebook. When set to WORKSPACE, the notebook will be retrieved
+        from the local Databricks workspace. When set to GIT, the notebook will be retrieved from a Git repository
+        defined in git_source. If the value is empty, the task will use GIT if git_source is defined
+        and WORKSPACE otherwise. For more information please visit
+        https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsCreate
+    :param notebook_params: A dict of key-value pairs to be passed as optional params to the notebook task.
+    :param notebook_packages: A list of the Python libraries to be installed on the cluster running the
+        notebook.
+    :param new_cluster: Specs for a new cluster on which this task will be run.
+    :param existing_cluster_id: ID for existing cluster on which to run this task.
+    :param job_cluster_key: The key for the job cluster.
+    :param polling_period_seconds: Controls the rate which we poll for the result of this notebook job run.
+    :param databricks_retry_limit: Amount of times to retry if the Databricks backend is unreachable.
+    :param databricks_retry_delay: Number of seconds to wait between retries.
+    :param databricks_retry_args: An optional dictionary with arguments passed to ``tenacity.Retrying`` class.
+    :param wait_for_termination: if we should wait for termination of the job run. ``True`` by default.
+    :param databricks_conn_id: The name of the Airflow connection to use.
+    """
+
+    template_fields = ("notebook_params",)
+
+    def __init__(
+        self,
+        notebook_path: str,
+        source: str,
+        notebook_params: dict | None = None,
+        notebook_packages: list[dict[str, Any]] | None = None,
+        new_cluster: dict[str, Any] | None = None,
+        existing_cluster_id: str = "",
+        job_cluster_key: str = "",
+        polling_period_seconds: int = 5,
+        databricks_retry_limit: int = 3,
+        databricks_retry_delay: int = 1,
+        databricks_retry_args: dict[Any, Any] | None = None,
+        wait_for_termination: bool = True,
+        databricks_conn_id: str = "databricks_default",
+        **kwargs: Any,
+    ):
+        self.notebook_path = notebook_path
+        self.source = source
+        self.notebook_params = notebook_params or {}
+        self.notebook_packages = notebook_packages or []
+        self.new_cluster = new_cluster or {}
+        self.existing_cluster_id = existing_cluster_id
+        self.job_cluster_key = job_cluster_key
+        self.polling_period_seconds = polling_period_seconds
+        self.databricks_retry_limit = databricks_retry_limit
+        self.databricks_retry_delay = databricks_retry_delay
+        self.databricks_retry_args = databricks_retry_args
+        self.wait_for_termination = wait_for_termination
+        self.databricks_conn_id = databricks_conn_id
+        self.databricks_run_id: int | None = None
+        super().__init__(**kwargs)
+
+    @cached_property
+    def _hook(self) -> DatabricksHook:
+        return self._get_hook(caller="DatabricksNotebookOperator")
+
+    def _get_hook(self, caller: str) -> DatabricksHook:
+        return DatabricksHook(
+            self.databricks_conn_id,
+            retry_limit=self.databricks_retry_limit,
+            retry_delay=self.databricks_retry_delay,
+            retry_args=self.databricks_retry_args,
+            caller=caller,
+        )
+
+    def _get_task_timeout_seconds(self) -> int:
+        """
+        Get the timeout seconds value for the Databricks job based on the execution timeout value provided for the Airflow task.
+
+        By default, tasks in Airflow have an execution_timeout set to None. In Airflow, when
+        execution_timeout is not defined, the task continues to run indefinitely. Therefore,
+        to mirror this behavior in the Databricks Jobs API, we set the timeout to 0, indicating
+        that the job should run indefinitely. This aligns with the default behavior of Databricks jobs,
+        where a timeout seconds value of 0 signifies an indefinite run duration.
+        More details can be found in the Databricks documentation:
+        See https://docs.databricks.com/api/workspace/jobs/submit#timeout_seconds
+        """
+        if self.execution_timeout is None:
+            return 0
+        execution_timeout_seconds = int(self.execution_timeout.total_seconds())
+        if execution_timeout_seconds == 0:
+            raise ValueError(
+                "If you've set an `execution_timeout` for the task, ensure it's not `0`. Set it instead to "
+                "`None` if you desire the task to run indefinitely."
+            )
+        return execution_timeout_seconds
+
+    def _get_task_base_json(self) -> dict[str, Any]:
+        """Get task base json to be used for task submissions."""
+        return {
+            "timeout_seconds": self._get_task_timeout_seconds(),
+            "email_notifications": {},
+            "notebook_task": {
+                "notebook_path": self.notebook_path,
+                "source": self.source,
+                "base_parameters": self.notebook_params,
+            },
+            "libraries": self.notebook_packages,
+        }
+
+    def _get_databricks_task_id(self, task_id: str) -> str:
+        """Get the databricks task ID using dag_id and task_id. Removes illegal characters."""
+        return f"{self.dag_id}__{task_id.replace('.', '__')}"
+
+    def _get_run_json(self) -> dict[str, Any]:
+        """Get run json to be used for task submissions."""
+        run_json = {
+            "run_name": self._get_databricks_task_id(self.task_id),
+            **self._get_task_base_json(),
+        }
+        if self.new_cluster and self.existing_cluster_id:
+            raise ValueError("Both new_cluster and existing_cluster_id are set. Only one should be set.")
+        if self.new_cluster:
+            run_json["new_cluster"] = self.new_cluster
+        elif self.existing_cluster_id:
+            run_json["existing_cluster_id"] = self.existing_cluster_id
+        else:
+            raise ValueError("Must specify either existing_cluster_id or new_cluster.")
+        return run_json
+
+    def launch_notebook_job(self) -> int:
+        run_json = self._get_run_json()
+        self.databricks_run_id = self._hook.submit_run(run_json)
+        url = self._hook.get_run_page_url(self.databricks_run_id)
+        self.log.info("Check the job run in Databricks: %s", url)
+        return self.databricks_run_id
+
+    def monitor_databricks_job(self) -> None:
+        if self.databricks_run_id is None:
+            raise ValueError("Databricks job not yet launched. Please run launch_notebook_job first.")
+        run = self._hook.get_run(self.databricks_run_id)
+        run_state = RunState(**run["state"])
+        self.log.info("Current state of the job: %s", run_state.life_cycle_state)
+        while not run_state.is_terminal:
+            time.sleep(self.polling_period_seconds)
+            run = self._hook.get_run(self.databricks_run_id)
+            run_state = RunState(**run["state"])
+            self.log.info(
+                "task %s %s", self._get_databricks_task_id(self.task_id), run_state.life_cycle_state
+            )
+            self.log.info("Current state of the job: %s", run_state.life_cycle_state)
+        if run_state.life_cycle_state != "TERMINATED":
+            raise AirflowException(
+                f"Databricks job failed with state {run_state.life_cycle_state}. "
+                f"Message: {run_state.state_message}"
+            )
+        if not run_state.is_successful:
+            raise AirflowException(
+                "Task failed. Final state %s. Reason: %s",
+                run_state.result_state,
+                run_state.state_message,
+            )
+        self.log.info("Task succeeded. Final state %s.", run_state.result_state)
+
+    def execute(self, context: Context) -> None:
+        self.launch_notebook_job()
+        if self.wait_for_termination:
+            self.monitor_databricks_job()
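
To show how the new operator is wired into a DAG, here is a hypothetical usage sketch (notebook path, cluster id, and params are placeholders):

    from airflow.providers.databricks.operators.databricks import DatabricksNotebookOperator

    run_notebook = DatabricksNotebookOperator(
        task_id="run_notebook",
        notebook_path="/Shared/example_notebook",  # placeholder workspace path
        source="WORKSPACE",
        notebook_params={"env": "dev"},  # templated field
        existing_cluster_id="1234-567890-abcde123",  # placeholder cluster id; pass new_cluster instead if desired
        databricks_conn_id="databricks_default",
    )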
airflow/providers/databricks/triggers/databricks.py
@@ -84,21 +84,36 @@ class DatabricksExecutionTrigger(BaseTrigger):
         async with self.hook:
             while True:
                 run_state = await self.hook.a_get_run_state(self.run_id)
-                if run_state.is_terminal:
-                    yield TriggerEvent(
-                        {
-                            "run_id": self.run_id,
-                            "run_page_url": self.run_page_url,
-                            "run_state": run_state.to_json(),
-                            "repair_run": self.repair_run,
-                        }
+                if not run_state.is_terminal:
+                    self.log.info(
+                        "run-id %s in run state %s. sleeping for %s seconds",
+                        self.run_id,
+                        run_state,
+                        self.polling_period_seconds,
                     )
-                    return
+                    await asyncio.sleep(self.polling_period_seconds)
+                    continue
 
-                self.log.info(
-                    "run-id %s in run state %s. sleeping for %s seconds",
-                    self.run_id,
-                    run_state,
-                    self.polling_period_seconds,
+                failed_tasks = []
+                if run_state.result_state == "FAILED":
+                    run_info = await self.hook.a_get_run(self.run_id)
+                    for task in run_info.get("tasks", []):
+                        if task.get("state", {}).get("result_state", "") == "FAILED":
+                            task_run_id = task["run_id"]
+                            task_key = task["task_key"]
+                            run_output = await self.hook.a_get_run_output(task_run_id)
+                            if "error" in run_output:
+                                error = run_output["error"]
+                            else:
+                                error = run_state.state_message
+                            failed_tasks.append({"task_key": task_key, "run_id": task_run_id, "error": error})
+                yield TriggerEvent(
+                    {
+                        "run_id": self.run_id,
+                        "run_page_url": self.run_page_url,
+                        "run_state": run_state.to_json(),
+                        "repair_run": self.repair_run,
+                        "errors": failed_tasks,
+                    }
                 )
-                await asyncio.sleep(self.polling_period_seconds)
+                return
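
With this change, the trigger event for a failed run carries per-task error details, which ``_handle_deferrable_databricks_operator_completion()`` folds into its error message and which ``validate_trigger_event()`` (next hunk) now requires. A hypothetical event payload, with illustrative values:

    event = {
        "run_id": 12345,
        "run_page_url": "https://example.cloud.databricks.com/#job/1/run/12345",
        "run_state": '{"life_cycle_state": "TERMINATED", "result_state": "FAILED", "state_message": ""}',
        "repair_run": False,
        "errors": [{"task_key": "ingest", "run_id": 67890, "error": "Notebook exited with failure"}],
    }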
airflow/providers/databricks/utils/databricks.py
@@ -55,7 +55,7 @@ def validate_trigger_event(event: dict):
 
     See: :class:`~airflow.providers.databricks.triggers.databricks.DatabricksExecutionTrigger`.
     """
-    keys_to_check = ["run_id", "run_page_url", "run_state"]
+    keys_to_check = ["run_id", "run_page_url", "run_state", "errors"]
     for key in keys_to_check:
         if key not in event:
             raise AirflowException(f"Could not find `{key}` in the event: {event}")
@@ -28,7 +28,7 @@ build-backend = "flit_core.buildapi"
28
28
 
29
29
  [project]
30
30
  name = "apache-airflow-providers-databricks"
31
- version = "6.3.0.rc3"
31
+ version = "6.4.0"
32
32
  description = "Provider package apache-airflow-providers-databricks for Apache Airflow"
33
33
  readme = "README.rst"
34
34
  authors = [
@@ -57,15 +57,15 @@ classifiers = [
57
57
  requires-python = "~=3.8"
58
58
  dependencies = [
59
59
  "aiohttp>=3.9.2, <4",
60
- "apache-airflow-providers-common-sql>=1.10.0rc0",
61
- "apache-airflow>=2.6.0rc0",
60
+ "apache-airflow-providers-common-sql>=1.10.0",
61
+ "apache-airflow>=2.7.0",
62
62
  "databricks-sql-connector>=2.0.0, <3.0.0, !=2.9.0",
63
63
  "requests>=2.27.0,<3",
64
64
  ]
65
65
 
66
66
  [project.urls]
67
- "Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0"
68
- "Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.3.0/changelog.html"
67
+ "Documentation" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0"
68
+ "Changelog" = "https://airflow.apache.org/docs/apache-airflow-providers-databricks/6.4.0/changelog.html"
69
69
  "Bug Tracker" = "https://github.com/apache/airflow/issues"
70
70
  "Source Code" = "https://github.com/apache/airflow"
71
71
  "Slack Chat" = "https://s.apache.org/airflow-slack"