apache-airflow-providers-cncf-kubernetes 3.1.0__py3-none-any.whl → 10.10.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. airflow/providers/cncf/kubernetes/__init__.py +18 -23
  2. airflow/providers/cncf/kubernetes/backcompat/__init__.py +17 -0
  3. airflow/providers/cncf/kubernetes/backcompat/backwards_compat_converters.py +31 -49
  4. airflow/providers/cncf/kubernetes/callbacks.py +200 -0
  5. airflow/providers/cncf/kubernetes/cli/__init__.py +16 -0
  6. airflow/providers/cncf/kubernetes/cli/kubernetes_command.py +195 -0
  7. airflow/providers/cncf/kubernetes/decorators/kubernetes.py +163 -0
  8. airflow/providers/cncf/kubernetes/decorators/kubernetes_cmd.py +118 -0
  9. airflow/providers/cncf/kubernetes/exceptions.py +37 -0
  10. airflow/providers/cncf/kubernetes/executors/__init__.py +17 -0
  11. airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +831 -0
  12. airflow/providers/cncf/kubernetes/executors/kubernetes_executor_types.py +91 -0
  13. airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py +736 -0
  14. airflow/providers/cncf/kubernetes/executors/local_kubernetes_executor.py +306 -0
  15. airflow/providers/cncf/kubernetes/get_provider_info.py +249 -50
  16. airflow/providers/cncf/kubernetes/hooks/kubernetes.py +846 -112
  17. airflow/providers/cncf/kubernetes/k8s_model.py +62 -0
  18. airflow/providers/cncf/kubernetes/kube_client.py +156 -0
  19. airflow/providers/cncf/kubernetes/kube_config.py +125 -0
  20. airflow/providers/cncf/kubernetes/kubernetes_executor_templates/__init__.py +16 -0
  21. airflow/providers/cncf/kubernetes/kubernetes_executor_templates/basic_template.yaml +79 -0
  22. airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +165 -0
  23. airflow/providers/cncf/kubernetes/operators/custom_object_launcher.py +368 -0
  24. airflow/providers/cncf/kubernetes/operators/job.py +646 -0
  25. airflow/providers/cncf/kubernetes/operators/kueue.py +132 -0
  26. airflow/providers/cncf/kubernetes/operators/pod.py +1417 -0
  27. airflow/providers/cncf/kubernetes/operators/resource.py +191 -0
  28. airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +336 -35
  29. airflow/providers/cncf/kubernetes/pod_generator.py +592 -0
  30. airflow/providers/cncf/kubernetes/pod_template_file_examples/__init__.py +16 -0
  31. airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_image_template.yaml +68 -0
  32. airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml +74 -0
  33. airflow/providers/cncf/kubernetes/pod_template_file_examples/git_sync_template.yaml +95 -0
  34. airflow/providers/cncf/kubernetes/python_kubernetes_script.jinja2 +51 -0
  35. airflow/providers/cncf/kubernetes/python_kubernetes_script.py +92 -0
  36. airflow/providers/cncf/kubernetes/resource_convert/__init__.py +16 -0
  37. airflow/providers/cncf/kubernetes/resource_convert/configmap.py +52 -0
  38. airflow/providers/cncf/kubernetes/resource_convert/env_variable.py +39 -0
  39. airflow/providers/cncf/kubernetes/resource_convert/secret.py +40 -0
  40. airflow/providers/cncf/kubernetes/secret.py +128 -0
  41. airflow/providers/cncf/kubernetes/sensors/spark_kubernetes.py +30 -14
  42. airflow/providers/cncf/kubernetes/template_rendering.py +81 -0
  43. airflow/providers/cncf/kubernetes/triggers/__init__.py +16 -0
  44. airflow/providers/cncf/kubernetes/triggers/job.py +176 -0
  45. airflow/providers/cncf/kubernetes/triggers/pod.py +344 -0
  46. airflow/providers/cncf/kubernetes/utils/__init__.py +3 -0
  47. airflow/providers/cncf/kubernetes/utils/container.py +118 -0
  48. airflow/providers/cncf/kubernetes/utils/delete_from.py +154 -0
  49. airflow/providers/cncf/kubernetes/utils/k8s_resource_iterator.py +46 -0
  50. airflow/providers/cncf/kubernetes/utils/pod_manager.py +887 -152
  51. airflow/providers/cncf/kubernetes/utils/xcom_sidecar.py +25 -16
  52. airflow/providers/cncf/kubernetes/version_compat.py +38 -0
  53. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/METADATA +125 -0
  54. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/RECORD +62 -0
  55. {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info}/WHEEL +1 -2
  56. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/entry_points.txt +3 -0
  57. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses/NOTICE +5 -0
  58. airflow/providers/cncf/kubernetes/backcompat/pod.py +0 -119
  59. airflow/providers/cncf/kubernetes/backcompat/pod_runtime_info_env.py +0 -56
  60. airflow/providers/cncf/kubernetes/backcompat/volume.py +0 -62
  61. airflow/providers/cncf/kubernetes/backcompat/volume_mount.py +0 -58
  62. airflow/providers/cncf/kubernetes/example_dags/example_kubernetes.py +0 -163
  63. airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes.py +0 -66
  64. airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes_spark_pi.yaml +0 -57
  65. airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py +0 -622
  66. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/METADATA +0 -452
  67. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/NOTICE +0 -6
  68. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/RECORD +0 -29
  69. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/entry_points.txt +0 -3
  70. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/top_level.txt +0 -1
  71. /airflow/providers/cncf/kubernetes/{example_dags → decorators}/__init__.py +0 -0
  72. {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,831 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+ """
18
+ KubernetesExecutor.
19
+
20
+ .. seealso::
21
+ For more information on how the KubernetesExecutor works, take a look at the guide:
22
+ :doc:`/kubernetes_executor`
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import contextlib
28
+ import json
29
+ import logging
30
+ import multiprocessing
31
+ import time
32
+ from collections import Counter, defaultdict
33
+ from collections.abc import Sequence
34
+ from contextlib import suppress
35
+ from datetime import datetime
36
+ from queue import Empty, Queue
37
+ from typing import TYPE_CHECKING, Any
38
+
39
+ from deprecated import deprecated
40
+ from kubernetes.dynamic import DynamicClient
41
+ from sqlalchemy import select
42
+
43
+ from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator
44
+ from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_0_PLUS
45
+
46
+ try:
47
+ from airflow.cli.cli_config import ARG_LOGICAL_DATE
48
+ except ImportError: # 2.x compatibility.
49
+ from airflow.cli.cli_config import ( # type: ignore[attr-defined, no-redef]
50
+ ARG_EXECUTION_DATE as ARG_LOGICAL_DATE,
51
+ )
52
+ from airflow.cli.cli_config import (
53
+ ARG_DAG_ID,
54
+ ARG_OUTPUT_PATH,
55
+ ARG_VERBOSE,
56
+ ActionCommand,
57
+ Arg,
58
+ GroupCommand,
59
+ lazy_load_command,
60
+ positive_int,
61
+ )
62
+ from airflow.configuration import conf
63
+ from airflow.exceptions import AirflowProviderDeprecationWarning
64
+ from airflow.executors.base_executor import BaseExecutor
65
+ from airflow.providers.cncf.kubernetes.exceptions import PodMutationHookException, PodReconciliationError
66
+ from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_types import (
67
+ ADOPTED,
68
+ POD_EXECUTOR_DONE_KEY,
69
+ KubernetesJob,
70
+ KubernetesResults,
71
+ )
72
+ from airflow.providers.cncf.kubernetes.kube_config import KubeConfig
73
+ from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import annotations_to_key
74
+ from airflow.stats import Stats
75
+ from airflow.utils.log.logging_mixin import remove_escape_codes
76
+ from airflow.utils.session import NEW_SESSION, provide_session
77
+ from airflow.utils.state import TaskInstanceState
78
+
79
+ if TYPE_CHECKING:
80
+ import argparse
81
+ from collections.abc import Sequence
82
+
83
+ from kubernetes import client
84
+ from kubernetes.client import models as k8s
85
+ from sqlalchemy.orm import Session
86
+
87
+ from airflow.executors import workloads
88
+ from airflow.models.taskinstance import TaskInstance
89
+ from airflow.models.taskinstancekey import TaskInstanceKey
90
+ from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import (
91
+ AirflowKubernetesScheduler,
92
+ )
93
+
94
+
95
# Select the version-specific CLI argument that is passed to the ``generate-dag-yaml``
# command below: ARG_SUBDIR before Airflow 3.0, ARG_BUNDLE_NAME from 3.0 onwards.
if not AIRFLOW_V_3_0_PLUS:
    from airflow.cli.cli_config import ARG_SUBDIR  # type: ignore[attr-defined]

    ARG_COMPAT = ARG_SUBDIR
else:
    from airflow.cli.cli_config import ARG_BUNDLE_NAME

    ARG_COMPAT = ARG_BUNDLE_NAME
103
+
104
# CLI Args
# The default is read from the ``[kubernetes_executor]`` section, so the help text names
# that same section.
ARG_NAMESPACE = Arg(
    ("--namespace",),
    default=conf.get("kubernetes_executor", "namespace"),
    help="Kubernetes Namespace. Default value is `[kubernetes_executor] namespace` in configuration.",
)

# NOTE(review): ``positive_int(allow_zero=False)`` is the only validation visible here; the
# 5 minute floor mentioned in the help text is presumably enforced by the command
# implementation -- confirm.
ARG_MIN_PENDING_MINUTES = Arg(
    ("--min-pending-minutes",),
    default=30,
    type=positive_int(allow_zero=False),
    help=(
        "Pending pods created before the time interval are to be cleaned up, "
        "measured in minutes. Default value is 30(m). The minimum value is 5(m)."
    ),
)

# CLI Commands
# Both commands are lazily loaded from ``cli.kubernetes_command`` so that importing this
# module does not import the command implementations.
KUBERNETES_COMMANDS = (
    ActionCommand(
        name="cleanup-pods",
        help=(
            "Clean up Kubernetes pods "
            "(created by KubernetesExecutor/KubernetesPodOperator) "
            "in evicted/failed/succeeded/pending states"
        ),
        func=lazy_load_command("airflow.providers.cncf.kubernetes.cli.kubernetes_command.cleanup_pods"),
        args=(ARG_NAMESPACE, ARG_MIN_PENDING_MINUTES, ARG_VERBOSE),
    ),
    ActionCommand(
        name="generate-dag-yaml",
        help="Generate YAML files for all tasks in DAG. Useful for debugging tasks without "
        "launching into a cluster",
        func=lazy_load_command("airflow.providers.cncf.kubernetes.cli.kubernetes_command.generate_pod_yaml"),
        args=(ARG_DAG_ID, ARG_LOGICAL_DATE, ARG_COMPAT, ARG_OUTPUT_PATH, ARG_VERBOSE),
    ),
)
141
+
142
+
143
+ class KubernetesExecutor(BaseExecutor):
144
+ """Executor for Kubernetes."""
145
+
146
+ RUNNING_POD_LOG_LINES = 100
147
+ supports_ad_hoc_ti_run: bool = True
148
+
149
+ if TYPE_CHECKING and AIRFLOW_V_3_0_PLUS:
150
+ # In the v3 path, we store workloads, not commands as strings.
151
+ # TODO: TaskSDK: move this type change into BaseExecutor
152
+ queued_tasks: dict[TaskInstanceKey, workloads.All] # type: ignore[assignment]
153
+
154
def __init__(self):
    """
    Initialise executor bookkeeping.

    The Kubernetes client and scheduler are left as ``None`` here and are created by
    ``start``. The parent executor is initialised last, with the parallelism taken from
    the loaded ``KubeConfig``.
    """
    kube_config = KubeConfig()
    manager = multiprocessing.Manager()

    self.kube_config = kube_config
    self._manager = manager
    # Both queues are created through the multiprocessing manager.
    self.task_queue: Queue[KubernetesJob] = manager.Queue()
    self.result_queue: Queue[KubernetesResults] = manager.Queue()

    # Populated by ``start``.
    self.kube_scheduler: AirflowKubernetesScheduler | None = None
    self.kube_client: client.CoreV1Api | None = None
    self.scheduler_job_id: str | None = None

    # Timestamp (``time.time()``) at which each task key was last handed to the task queue.
    self.last_handled: dict[TaskInstanceKey, float] = {}
    self.kubernetes_queue: str | None = None

    # Per-task count of pod publish retries, bounded by ``task_publish_max_retries``.
    self.task_publish_retries: Counter[TaskInstanceKey] = Counter()
    self.task_publish_max_retries = conf.getint(
        "kubernetes_executor", "task_publish_max_retries", fallback=0
    )
    self.completed: set[KubernetesResults] = set()

    super().__init__(parallelism=kube_config.parallelism)
170
+
171
def _list_pods(self, query_kwargs):
    """
    List pods through the dynamic client, requesting metadata-only objects.

    :param query_kwargs: keyword arguments forwarded to the list call; a ``header_params``
        entry is written into this dict (the caller's dict is modified)
    :return: list of the items returned for every queried namespace
    """
    # Ask the API server for PartialObjectMetadataList instead of full pod objects.
    query_kwargs["header_params"] = {
        "Accept": "application/json;as=PartialObjectMetadataList;v=v1;g=meta.k8s.io"
    }
    dynamic_client = DynamicClient(self.kube_client.api_client)
    pod_resource = dynamic_client.resources.get(api_version="v1", kind="Pod")

    if not self.kube_config.multi_namespace_mode:
        namespaces = [self.kube_config.kube_namespace]
    else:
        # With no explicit namespace list, a single query with ``namespace=None`` is made.
        namespaces = self.kube_config.multi_namespace_mode_namespace_list or [None]

    return [
        pod
        for namespace in namespaces
        for pod in dynamic_client.get(resource=pod_resource, namespace=namespace, **query_kwargs).items
    ]
190
+
191
def _make_safe_label_value(self, input_value: str | datetime) -> str:
    """
    Turn ``input_value`` into a string that is valid as a label value.

    Datetimes are delegated to ``pod_generator.datetime_to_label_safe_datestring``; any other
    value is delegated to ``pod_generator.make_safe_label_value``.
    """
    # Imported here rather than at module level: airflow.providers.cncf.kubernetes is an
    # expensive import and deferring it speeds up loading of the kubernetes_executor module.
    from airflow.providers.cncf.kubernetes import pod_generator as _pod_generator

    if not isinstance(input_value, datetime):
        return _pod_generator.make_safe_label_value(input_value)
    return _pod_generator.datetime_to_label_safe_datestring(input_value)
204
+
205
def get_pod_combined_search_str_to_pod_map(self) -> dict[str, k8s.V1Pod]:
    """
    Index the pods labelled with this executor's job id by a combined search string.

    A pod is stored under the key
    ``dag_id={dag_id},task_id={task_id}[,map_index={map_index}],run_id={run_id}``
    built from its annotations; the ``map_index`` part is present only when the pod carries
    that annotation. Pods without a ``dag_id``, ``task_id`` or ``run_id`` annotation are
    left out of the map.
    """
    # One batched list call, selected by the airflow-worker label.
    list_kwargs = {"label_selector": f"airflow-worker={self._make_safe_label_value(str(self.job_id))}"}
    extra_request_args = self.kube_config.kube_client_request_args
    if extra_request_args:
        list_kwargs.update(extra_request_args)

    pods_by_search_str = {}
    for pod in self._list_pods(list_kwargs):
        annotations = pod.metadata.annotations
        dag_id = annotations.get("dag_id", None)
        task_id = annotations.get("task_id", None)
        map_index = annotations.get("map_index", None)
        run_id = annotations.get("run_id", None)
        if dag_id is None or task_id is None or run_id is None:
            continue

        key_parts = [f"dag_id={dag_id}", f"task_id={task_id}"]
        if map_index is not None:
            key_parts.append(f"map_index={map_index}")
        key_parts.append(f"run_id={run_id}")
        pods_by_search_str[",".join(key_parts)] = pod
    return pods_by_search_str
234
+
235
def start(self) -> None:
    """Record the scheduler job id, then create the kube client and the AirflowKubernetesScheduler."""
    self.log.info("Start Kubernetes executor")
    job_id = str(self.job_id)
    self.scheduler_job_id = job_id
    self.log.debug("Start with scheduler_job_id: %s", job_id)

    # Deferred imports: only needed once the executor is actually started.
    from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import (
        AirflowKubernetesScheduler,
    )
    from airflow.providers.cncf.kubernetes.kube_client import get_kube_client

    kube_client = get_kube_client()
    self.kube_client = kube_client
    # The scheduler reports back through the same result queue that ``sync`` drains.
    self.kube_scheduler = AirflowKubernetesScheduler(
        kube_config=self.kube_config,
        result_queue=self.result_queue,
        kube_client=kube_client,
        scheduler_job_id=job_id,
    )
252
+
253
def execute_async(
    self,
    key: TaskInstanceKey,
    command: Any,
    queue: str | None = None,
    executor_config: Any | None = None,
) -> None:
    """
    Put a task on the internal task queue for later pod creation.

    :param key: task instance key
    :param command: command (or workload list) passed through to the ``KubernetesJob``
    :param queue: accepted for interface compatibility; not used in this method
    :param executor_config: per-task executor config; converted with ``PodGenerator.from_obj``.
        If the conversion raises, the task is failed and nothing is queued.
    """
    if TYPE_CHECKING:
        assert self.task_queue

    # executor_config is only included in the log line at DEBUG level.
    if not self.log.isEnabledFor(logging.DEBUG):
        self.log.info("Add task %s with command %s", key, command)
    else:
        self.log.debug("Add task %s with command %s, executor_config %s", key, command, executor_config)

    try:
        kube_executor_config = PodGenerator.from_obj(executor_config)
    except Exception:
        self.log.error("Invalid executor_config for %s. Executor_config: %s", key, executor_config)
        self.fail(key=key, info="Invalid executor_config passed")
        return

    pod_template_file = executor_config.get("pod_template_file", None) if executor_config else None

    self.event_buffer[key] = (TaskInstanceState.QUEUED, self.scheduler_job_id)
    job = KubernetesJob(key, command, kube_executor_config, pod_template_file)
    self.task_queue.put(job)
    # Keep a temporary local record that this key has been handled, so that it is not
    # removed from the QUEUED state while it is still being processed.
    self.last_handled[key] = time.time()
285
+
286
def queue_workload(self, workload: workloads.All, session: Session | None) -> None:
    """
    Register an ``ExecuteTask`` workload in ``queued_tasks`` under its task instance key.

    :param workload: workload to queue
    :param session: not used in this method
    :raises RuntimeError: if ``workload`` is not an ``ExecuteTask``
    """
    from airflow.executors import workloads as _workloads

    if isinstance(workload, _workloads.ExecuteTask):
        self.queued_tasks[workload.ti.key] = workload
        return
    raise RuntimeError(f"{type(self)} cannot handle workloads of type {type(workload)}")
293
+
294
def _process_workloads(self, workloads: Sequence[workloads.All]) -> None:
    """
    Move each workload from ``queued_tasks`` to ``running`` via ``execute_async`` (Airflow 3 path).

    :param workloads: workloads to process
    :raises RuntimeError: as soon as a workload that is not an ``ExecuteTask`` is met
    """
    from airflow.executors.workloads import ExecuteTask

    for workload in workloads:
        if not isinstance(workload, ExecuteTask):
            raise RuntimeError(f"{type(self)} cannot handle workloads of type {type(workload)}")

        # TODO: AIP-72 handle populating tokens once https://github.com/apache/airflow/issues/45107 is handled.
        ti = workload.ti
        key = ti.key
        queue = ti.queue
        executor_config = ti.executor_config or {}

        del self.queued_tasks[key]
        # The workload itself, wrapped in a list, is what gets passed as the "command".
        self.execute_async(key=key, command=[workload], queue=queue, executor_config=executor_config)
        self.running.add(key)
311
+
312
def sync(self) -> None:
    """
    Synchronize task state.

    In order, this method:

    1. calls ``kube_scheduler.sync()``;
    2. drains ``result_queue``, passing each result to ``_change_state`` and putting the
       result back on the queue when that call raises;
    3. passes every entry of ``self.completed`` to ``_change_state``;
    4. writes the last resource version seen per namespace to ``ResourceVersion``;
    5. takes up to ``worker_pods_creation_batch_size`` jobs from ``task_queue`` and hands
       each to ``kube_scheduler.run_next``, re-queueing or failing the task on error.
    """
    if TYPE_CHECKING:
        assert self.scheduler_job_id
        assert self.kube_scheduler
        assert self.kube_config
        assert self.result_queue
        assert self.task_queue

    if self.running:
        self.log.debug("self.running: %s", self.running)
    if self.queued_tasks:
        self.log.debug("self.queued: %s", self.queued_tasks)
    self.kube_scheduler.sync()

    last_resource_version: dict[str, str] = defaultdict(lambda: "0")
    # ``get_nowait`` raises ``Empty`` once the queue is drained, which ends the loop.
    with contextlib.suppress(Empty):
        while True:
            results = self.result_queue.get_nowait()
            try:
                last_resource_version[results.namespace] = results.resource_version
                self.log.info("Changing state of %s to %s", results, results.state)
                try:
                    self._change_state(results)
                except Exception as e:
                    self.log.exception(
                        "Exception: %s when attempting to change state of %s to %s, re-queueing.",
                        e,
                        results,
                        results.state,
                    )
                    self.result_queue.put(results)
            finally:
                self.result_queue.task_done()

    for result in self.completed:
        self._change_state(result)

    from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import ResourceVersion

    resource_instance = ResourceVersion()
    for ns in resource_instance.resource_version:
        resource_instance.resource_version[ns] = (
            last_resource_version[ns] or resource_instance.resource_version[ns]
        )

    from kubernetes.client.rest import ApiException

    with contextlib.suppress(Empty):
        for _ in range(self.kube_config.worker_pods_creation_batch_size):
            task = self.task_queue.get_nowait()
            # Bound before the ``try`` so every handler below can rely on it.
            key = task.key

            try:
                self.kube_scheduler.run_next(task)
                self.task_publish_retries.pop(key, None)
            except PodReconciliationError as e:
                self.log.exception(
                    "Pod reconciliation failed, likely due to kubernetes library upgrade. "
                    "Try clearing the task to re-run.",
                )
                self.fail(key, e)
            except ApiException as e:
                try:
                    # If there is no body content, use the reason as the message.
                    body = json.loads(e.body) if e.body else {"message": e.reason}
                except (json.JSONDecodeError, ValueError, TypeError):
                    # If the body is a string (e.g., in a 429 error), it can't be parsed as JSON.
                    # Use the body directly as the message instead.
                    body = {"message": e.body}
                # Valid JSON is not necessarily an object (it may be a string, list, ...);
                # normalise so the lookup below cannot raise.
                if not isinstance(body, dict):
                    body = {"message": body}
                # Coerce to ``str`` so the substring checks below are safe for null or
                # non-string messages.
                message = str(body.get("message") or "")

                retries = self.task_publish_retries[key]
                # Requeue on exceeded quota (403), conflict (409) or any 500 error, for as
                # long as task_publish_max_retries allows (-1 means no limit).
                status = str(e.status)
                is_retryable = (
                    (status == "403" and "exceeded quota" in message)
                    or (status == "409" and "object has been modified" in message)
                    or status == "500"
                )
                may_retry = self.task_publish_max_retries == -1 or retries < self.task_publish_max_retries
                if is_retryable and may_retry:
                    self.log.warning(
                        "[Try %s of %s] Kube ApiException for Task: (%s). Reason: %r. Message: %s",
                        retries + 1,
                        self.task_publish_max_retries,
                        key,
                        e.reason,
                        message,
                    )
                    self.task_queue.put(task)
                    self.task_publish_retries[key] = retries + 1
                else:
                    self.log.error("Pod creation failed with reason %r. Failing task", e.reason)
                    self.fail(key, e)
                    self.task_publish_retries.pop(key, None)
            except PodMutationHookException as e:
                self.log.error(
                    "Pod Mutation Hook failed for the task %s. Failing task. Details: %s",
                    key,
                    e.__cause__,
                )
                self.fail(key, e)
            finally:
                self.task_queue.task_done()
419
+
420
@provide_session
def _change_state(
    self,
    results: KubernetesResults,
    session: Session = NEW_SESSION,
) -> None:
    """
    Change state of the task based on KubernetesResults.

    :param results: result reported for a pod (key, state, pod name, namespace, failure details)
    :param session: database session, used only to look up the task instance state when
        ``results.state`` is ``None``

    Behaviour by state:

    * ``FAILED``: a warning with the available failure details is logged, then processing
      continues as for any other finished state.
    * ``ADOPTED``: the key is removed from ``running`` and nothing else happens.
    * ``RUNNING``: the state is written to ``event_buffer`` and nothing else happens.
    * anything else: the pod is deleted or patched as done (depending on configuration),
      the key is removed from ``running`` and the state is written to ``event_buffer``.
    """
    if TYPE_CHECKING:
        assert self.kube_scheduler

    key = results.key
    state = results.state
    pod_name = results.pod_name
    namespace = results.namespace
    failure_details = results.failure_details

    if state == TaskInstanceState.FAILED:
        task_key_str = f"{key.dag_id}.{key.task_id}.{key.try_number}"
        # Use pre-collected failure details from the watcher to avoid additional API calls
        if failure_details:
            self.log.warning(
                "Task %s failed in pod %s/%s. Pod phase: %s, reason: %s, message: %s, "
                "container_type: %s, container_name: %s, container_state: %s, container_reason: %s, "
                "container_message: %s, exit_code: %s",
                task_key_str,
                namespace,
                pod_name,
                failure_details.get("pod_status"),
                failure_details.get("pod_reason"),
                failure_details.get("pod_message"),
                failure_details.get("container_type"),
                failure_details.get("container_name"),
                failure_details.get("container_state"),
                failure_details.get("container_reason"),
                failure_details.get("container_message"),
                failure_details.get("exit_code"),
            )
        else:
            self.log.warning(
                "Task %s failed in pod %s/%s (no details available)", task_key_str, namespace, pod_name
            )

    if state == ADOPTED:
        # When the task pod is adopted by another executor,
        # then remove the task from the current executor running queue.
        try:
            self.running.remove(key)
        except KeyError:
            self.log.debug("TI key not in running: %s", key)
        return

    if state == TaskInstanceState.RUNNING:
        self.event_buffer[key] = state, None
        return

    if self.kube_config.delete_worker_pods:
        # Failed pods are deleted only when delete_worker_pods_on_failure is also set.
        if state != TaskInstanceState.FAILED or self.kube_config.delete_worker_pods_on_failure:
            self.kube_scheduler.delete_pod(pod_name=pod_name, namespace=namespace)
            self.log.info(
                "Deleted pod associated with the TI %s. Pod name: %s. Namespace: %s",
                key,
                pod_name,
                namespace,
            )
    else:
        self.kube_scheduler.patch_pod_executor_done(pod_name=pod_name, namespace=namespace)
        # Log the pod that was patched (not the task instance key).
        self.log.info("Patched pod %s in namespace %s to mark it as done", pod_name, namespace)

    try:
        self.running.remove(key)
    except KeyError:
        self.log.debug("TI key not in running, not adding to event_buffer: %s", key)
        return

    # If we don't have a TI state, look it up from the db. event_buffer expects the TI state
    if state is None:
        from airflow.models.taskinstance import TaskInstance

        filter_for_tis = TaskInstance.filter_for_tis([key])
        if filter_for_tis is not None:
            state = session.scalar(select(TaskInstance.state).where(filter_for_tis))
        state = TaskInstanceState(state) if state else None

    self.event_buffer[key] = state, None
517
+
518
+ @staticmethod
519
+ def _get_pod_namespace(ti: TaskInstance):
520
+ pod_override = ti.executor_config.get("pod_override")
521
+ namespace = None
522
+ with suppress(Exception):
523
+ if pod_override is not None:
524
+ namespace = pod_override.metadata.namespace
525
+ return namespace or conf.get("kubernetes_executor", "namespace")
526
+
527
def get_task_log(self, ti: TaskInstance, try_number: int) -> tuple[list[str], list[str]]:
    """
    Fetch the tail of the task's pod log through the Kubernetes API.

    :param ti: task instance whose pod log is wanted
    :param try_number: try number used to build the pod label selector
    :return: ``(messages, logs)``: ``messages`` describes what was attempted (including any
        failure), ``logs`` is a one-element list holding the fetched lines joined by newlines

    Failures are never raised to the caller: every exception is reported in ``messages``.
    At most ``RUNNING_POD_LOG_LINES`` lines are requested from the ``base`` container.
    """
    messages: list[str] = []
    log: list[str] = []
    try:
        from airflow.providers.cncf.kubernetes.kube_client import get_kube_client
        from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator

        kube_client = get_kube_client()

        messages.append(f"Attempting to fetch logs from pod {ti.hostname} through kube API")
        selector = PodGenerator.build_selector_for_k8s_executor_pod(
            dag_id=ti.dag_id,
            task_id=ti.task_id,
            try_number=try_number,
            map_index=ti.map_index,
            run_id=ti.run_id,
            airflow_worker=ti.queued_by_job_id,
        )
        namespace = self._get_pod_namespace(ti)
        pod_list = kube_client.list_namespaced_pod(
            namespace=namespace,
            label_selector=selector,
        ).items
        # Exceptions do not interpolate logging-style arguments, so the messages are
        # formatted here; they end up in ``messages`` through the handler below.
        if not pod_list:
            raise RuntimeError(f"Cannot find pod for ti {ti}")
        if len(pod_list) > 1:
            raise RuntimeError(f"Found multiple pods for ti {ti}: {pod_list}")
        res = kube_client.read_namespaced_pod_log(
            name=pod_list[0].metadata.name,
            namespace=namespace,
            container="base",
            follow=False,
            tail_lines=self.RUNNING_POD_LOG_LINES,
            _preload_content=False,
        )
        # The response is iterated as raw bytes lines because of ``_preload_content=False``.
        for line in res:
            log.append(remove_escape_codes(line.decode()))
        if log:
            messages.append("Found logs through kube API")
    except Exception as e:
        messages.append(f"Reading from k8s pod logs failed: {e}")
    return messages, ["\n".join(log)]
569
+
570
def try_adopt_task_instances(self, tis: Sequence[TaskInstance]) -> Sequence[TaskInstance]:
    """
    Try to adopt the pods of the given task instances.

    :param tis: task instances that are candidates for adoption
    :return: the task instances that were not adopted and should be flushed; task instances
        without ``queued_by_job_id`` are always part of this list
    """
    with Stats.timer("kubernetes_executor.adopt_task_instances.duration"):
        # Always flush TIs without queued_by_job_id
        not_adopted = [ti for ti in tis if not ti.queued_by_job_id]
        previous_job_ids = {ti.queued_by_job_id for ti in tis}
        tis_to_flush_by_key = {ti.key: ti for ti in tis if ti.queued_by_job_id}
        kube_client: client.CoreV1Api = self.kube_client

        for previous_job_id in previous_job_ids:
            safe_job_label = self._make_safe_label_value(str(previous_job_id))
            # We will look for any pods owned by the no-longer-running scheduler,
            # but will exclude only successful pods, as those TIs will have a terminal state
            # and not be up for adoption!
            # Those workers that failed, however, are okay to adopt here as their TI will
            # still be in queued.
            label_selector = (
                f"kubernetes_executor=True,airflow-worker={safe_job_label},{POD_EXECUTOR_DONE_KEY}!=True"
            )
            query_kwargs = {
                "field_selector": "status.phase!=Succeeded",
                "label_selector": label_selector,
            }
            for pod in self._list_pods(query_kwargs):
                self.adopt_launched_task(kube_client, pod, tis_to_flush_by_key)
        self._adopt_completed_pods(kube_client)

        # This method can be retried within a short time frame (it is wrapped in a
        # run_with_db_retries of scheduler_job_runner and may be retried after an
        # OperationalError, for example). On a second attempt adopt_launched_task may not be
        # called at all because every pod was already adopted on the first attempt, which
        # leaves already-adopted TIs in tis_to_flush_by_key. Those are filtered out here.
        for key, ti in tis_to_flush_by_key.items():
            if key in self.running:
                self.log.info("%s is already adopted, no need to flush.", ti)
            else:
                not_adopted.append(ti)
        return not_adopted
612
+
613
@deprecated(
    reason="Replaced by function `revoke_task`. Upgrade airflow core to make this go away.",
    category=AirflowProviderDeprecationWarning,
)
def cleanup_stuck_queued_tasks(self, tis: list[TaskInstance]) -> list[str]:
    """
    Revoke and fail task instances that were found stuck in queued.

    Each task instance is passed to ``revoke_task`` and then failed through ``self.fail``.

    :param tis: List of Task Instances to clean up
    :return: List of readable task instances for a warning message
    """
    readable_tis: list[str] = []
    for stuck_ti in tis:
        # The repr is captured before the task instance is revoked and failed.
        readable_tis.append(repr(stuck_ti))
        self.revoke_task(ti=stuck_ti)
        self.fail(stuck_ti.key)
    return readable_tis
634
+
635
def revoke_task(self, *, ti: TaskInstance):
    """
    Revoke task that may be running.

    The task is dropped from ``running`` and ``queued_tasks``; if a pod matching the task
    instance is found among this executor's pods, it is patched as revoked and then deleted.

    :param ti: task instance to revoke
    """
    if TYPE_CHECKING:
        assert self.kube_client
        assert self.kube_scheduler

    self.running.discard(ti.key)
    self.queued_tasks.pop(ti.key, None)
    pods_by_search_str = self.get_pod_combined_search_str_to_pod_map()

    # Build the same combined search string that keys the pod map.
    selector_parts = [f"dag_id={ti.dag_id}", f"task_id={ti.task_id}"]
    if ti.map_index >= 0:
        # Old tasks _couldn't_ be mapped, so we don't have to worry about compat
        selector_parts.append(f"map_index={ti.map_index}")
    selector_parts.append(f"run_id={ti.run_id}")

    pod = pods_by_search_str.get(",".join(selector_parts), None)
    if not pod:
        self.log.warning("Cannot find pod for ti %s", ti)
        return

    pod_name = pod.metadata.name
    pod_namespace = pod.metadata.namespace
    self.kube_scheduler.patch_pod_revoked(pod_name=pod_name, namespace=pod_namespace)
    self.kube_scheduler.delete_pod(pod_name=pod_name, namespace=pod_namespace)
661
+
662
def adopt_launched_task(
    self,
    kube_client: client.CoreV1Api,
    pod: k8s.V1Pod,
    tis_to_flush_by_key: dict[TaskInstanceKey, k8s.V1Pod],
) -> None:
    """
    Relabel an existing pod with this scheduler's worker label and track its task as running.

    The task instance key is derived from the pod annotations. Pods whose key is not
    present in ``tis_to_flush_by_key`` are rejected with an error log. Otherwise the
    pod's ``airflow-worker`` label is patched to the current scheduler job id; on
    success the key is removed from ``tis_to_flush_by_key`` and added to ``running``.
    A failed patch is logged and leaves both collections untouched.

    :param kube_client: kubernetes client for speaking to kube API
    :param pod: V1Pod spec that we will patch with new label
    :param tis_to_flush_by_key: TIs that will be flushed if they aren't adopted
    """
    if TYPE_CHECKING:
        assert self.scheduler_job_id

    pod_meta = pod.metadata
    self.log.info("attempting to adopt pod %s", pod_meta.name)
    adopted_key = annotations_to_key(pod_meta.annotations)
    if adopted_key not in tis_to_flush_by_key:
        self.log.error("attempting to adopt taskinstance which was not specified by database: %s", adopted_key)
        return

    worker_label = self._make_safe_label_value(self.scheduler_job_id)
    from kubernetes.client.rest import ApiException

    label_patch = {"metadata": {"labels": {"airflow-worker": worker_label}}}
    try:
        kube_client.patch_namespaced_pod(
            name=pod_meta.name,
            namespace=pod_meta.namespace,
            body=label_patch,
        )
    except ApiException as err:
        self.log.info("Failed to adopt pod %s. Reason: %s", pod_meta.name, err)
    else:
        # Only a successfully relabelled pod counts as adopted.
        tis_to_flush_by_key.pop(adopted_key)
        self.running.add(adopted_key)
699
+
700
def _adopt_completed_pods(self, kube_client: client.CoreV1Api) -> None:
    """
    Relabel succeeded executor pods with this scheduler's worker label and record them as completed.

    Pods are listed with phase ``Succeeded``, the ``kubernetes_executor=True`` label,
    an ``airflow-worker`` label different from the current scheduler job id, and a
    ``POD_EXECUTOR_DONE_KEY`` label that is not ``True``. Each pod is patched with the
    current worker label; pods that fail to patch are logged and skipped. For every
    patched pod a ``KubernetesResults`` entry with state ``"completed"`` is added to
    ``self.completed``.

    :param kube_client: kubernetes client for speaking to kube API
    """
    if TYPE_CHECKING:
        assert self.scheduler_job_id

    worker_label = self._make_safe_label_value(self.scheduler_job_id)
    label_selector = (
        "kubernetes_executor=True,"
        f"airflow-worker!={worker_label},{POD_EXECUTOR_DONE_KEY}!=True"
    )
    completed_pods = self._list_pods(
        {
            "field_selector": "status.phase=Succeeded",
            "label_selector": label_selector,
        }
    )
    for completed_pod in completed_pods:
        pod_meta = completed_pod.metadata
        self.log.info("Attempting to adopt pod %s", pod_meta.name)
        from kubernetes.client.rest import ApiException

        try:
            kube_client.patch_namespaced_pod(
                name=pod_meta.name,
                namespace=pod_meta.namespace,
                body={"metadata": {"labels": {"airflow-worker": worker_label}}},
            )
        except ApiException as err:
            self.log.info("Failed to adopt pod %s. Reason: %s", pod_meta.name, err)
            continue

        # The pod now carries our label; record its result under the key from its annotations.
        self.completed.add(
            KubernetesResults(
                key=annotations_to_key(pod_meta.annotations),
                state="completed",
                pod_name=pod_meta.name,
                namespace=pod_meta.namespace,
                resource_version=pod_meta.resource_version,
                failure_details=None,
            )
        )
743
+
744
def _flush_task_queue(self) -> None:
    """
    Drain ``task_queue`` during shutdown without running the queued tasks.

    Every task taken off the queue is logged with a warning and marked done, so a
    later ``join`` on the queue does not block. Draining stops at the first ``Empty``.
    """
    if TYPE_CHECKING:
        assert self.task_queue

    self.log.debug("Executor shutting down, task_queue approximate size=%d", self.task_queue.qsize())
    try:
        while True:
            dropped_task = self.task_queue.get_nowait()
            # This is a new task to run thus ok to ignore.
            self.log.warning("Executor shutting down, will NOT run task=%s", dropped_task)
            self.task_queue.task_done()
    except Empty:
        # Queue is drained; nothing left to discard.
        pass
755
+
756
def _flush_result_queue(self) -> None:
    """
    Drain ``result_queue`` during shutdown, applying each pending result.

    Each result is logged and passed to ``_change_state``. An exception raised by
    ``_change_state`` is logged and ignored so the remaining results are still
    processed. Every result taken off the queue is marked done, and draining stops
    at the first ``Empty``.
    """
    if TYPE_CHECKING:
        assert self.result_queue

    self.log.debug("Executor shutting down, result_queue approximate size=%d", self.result_queue.qsize())
    try:
        while True:
            flushed = self.result_queue.get_nowait()
            self.log.warning("Executor shutting down, flushing results=%s", flushed)
            try:
                self.log.info(
                    "Changing state of %s to %s : resource_version=%s",
                    flushed,
                    flushed.state,
                    flushed.resource_version,
                )
                try:
                    self._change_state(flushed)
                except Exception as err:
                    self.log.exception(
                        "Ignoring exception: %s when attempting to change state of %s to %s.",
                        err,
                        flushed,
                        flushed.state,
                    )
            finally:
                # Always acknowledge the item so a later join() on the queue can return.
                self.result_queue.task_done()
    except Empty:
        # Queue is drained; nothing left to flush.
        pass
783
+
784
def end(self) -> None:
    """
    Shut down the executor.

    Flushes the task and result queues and waits for both to be fully processed,
    then terminates the kube scheduler and shuts the manager down. Exceptions
    raised while flushing or while terminating the scheduler are logged instead
    of propagated, so the manager shutdown is still reached.
    """
    if TYPE_CHECKING:
        assert self.task_queue
        assert self.result_queue
        assert self.kube_scheduler

    self.log.info("Shutting down Kubernetes executor")
    try:
        self.log.debug("Flushing task_queue...")
        self._flush_task_queue()
        self.log.debug("Flushing result_queue...")
        self._flush_result_queue()
        # Both queues should be empty...
        self.task_queue.join()
        self.result_queue.join()
    except ConnectionResetError:
        self.log.exception("Connection Reset error while flushing task_queue and result_queue.")
    except Exception:
        self.log.exception("Unknown error while flushing task queue and result queue.")
    if self.kube_scheduler:
        try:
            self.kube_scheduler.terminate()
        except Exception:
            # Report the step that actually failed; the flush phase has its own messages above.
            self.log.exception("Unknown error while terminating kube_scheduler.")
    self._manager.shutdown()
810
+
811
def terminate(self):
    """Do nothing; terminating this executor is a no-op."""
    return None
813
+
814
@staticmethod
def get_cli_commands() -> list[GroupCommand]:
    """
    Return the CLI command groups contributed by this executor.

    :return: a list holding the single ``kubernetes`` command group, whose
        subcommands are ``KUBERNETES_COMMANDS``
    """
    kubernetes_group = GroupCommand(
        name="kubernetes",
        help="Tools to help run the KubernetesExecutor",
        subcommands=KUBERNETES_COMMANDS,
    )
    return [kubernetes_group]
823
+
824
+
825
def _get_parser() -> argparse.ArgumentParser:
    """
    Generate documentation; used by Sphinx.

    Delegates to ``KubernetesExecutor._get_parser`` and returns its result unchanged.

    :meta private:
    """
    parser = KubernetesExecutor._get_parser()
    return parser