apache-airflow-providers-cncf-kubernetes 3.1.0__py3-none-any.whl → 10.10.0rc1__py3-none-any.whl
This diff shows the published contents of two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- airflow/providers/cncf/kubernetes/__init__.py +18 -23
- airflow/providers/cncf/kubernetes/backcompat/__init__.py +17 -0
- airflow/providers/cncf/kubernetes/backcompat/backwards_compat_converters.py +31 -49
- airflow/providers/cncf/kubernetes/callbacks.py +200 -0
- airflow/providers/cncf/kubernetes/cli/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/cli/kubernetes_command.py +195 -0
- airflow/providers/cncf/kubernetes/decorators/kubernetes.py +163 -0
- airflow/providers/cncf/kubernetes/decorators/kubernetes_cmd.py +118 -0
- airflow/providers/cncf/kubernetes/exceptions.py +37 -0
- airflow/providers/cncf/kubernetes/executors/__init__.py +17 -0
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +831 -0
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor_types.py +91 -0
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py +736 -0
- airflow/providers/cncf/kubernetes/executors/local_kubernetes_executor.py +306 -0
- airflow/providers/cncf/kubernetes/get_provider_info.py +249 -50
- airflow/providers/cncf/kubernetes/hooks/kubernetes.py +846 -112
- airflow/providers/cncf/kubernetes/k8s_model.py +62 -0
- airflow/providers/cncf/kubernetes/kube_client.py +156 -0
- airflow/providers/cncf/kubernetes/kube_config.py +125 -0
- airflow/providers/cncf/kubernetes/kubernetes_executor_templates/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/kubernetes_executor_templates/basic_template.yaml +79 -0
- airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +165 -0
- airflow/providers/cncf/kubernetes/operators/custom_object_launcher.py +368 -0
- airflow/providers/cncf/kubernetes/operators/job.py +646 -0
- airflow/providers/cncf/kubernetes/operators/kueue.py +132 -0
- airflow/providers/cncf/kubernetes/operators/pod.py +1417 -0
- airflow/providers/cncf/kubernetes/operators/resource.py +191 -0
- airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +336 -35
- airflow/providers/cncf/kubernetes/pod_generator.py +592 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_image_template.yaml +68 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml +74 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/git_sync_template.yaml +95 -0
- airflow/providers/cncf/kubernetes/python_kubernetes_script.jinja2 +51 -0
- airflow/providers/cncf/kubernetes/python_kubernetes_script.py +92 -0
- airflow/providers/cncf/kubernetes/resource_convert/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/resource_convert/configmap.py +52 -0
- airflow/providers/cncf/kubernetes/resource_convert/env_variable.py +39 -0
- airflow/providers/cncf/kubernetes/resource_convert/secret.py +40 -0
- airflow/providers/cncf/kubernetes/secret.py +128 -0
- airflow/providers/cncf/kubernetes/sensors/spark_kubernetes.py +30 -14
- airflow/providers/cncf/kubernetes/template_rendering.py +81 -0
- airflow/providers/cncf/kubernetes/triggers/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/triggers/job.py +176 -0
- airflow/providers/cncf/kubernetes/triggers/pod.py +344 -0
- airflow/providers/cncf/kubernetes/utils/__init__.py +3 -0
- airflow/providers/cncf/kubernetes/utils/container.py +118 -0
- airflow/providers/cncf/kubernetes/utils/delete_from.py +154 -0
- airflow/providers/cncf/kubernetes/utils/k8s_resource_iterator.py +46 -0
- airflow/providers/cncf/kubernetes/utils/pod_manager.py +887 -152
- airflow/providers/cncf/kubernetes/utils/xcom_sidecar.py +25 -16
- airflow/providers/cncf/kubernetes/version_compat.py +38 -0
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/METADATA +125 -0
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/RECORD +62 -0
- {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info}/WHEEL +1 -2
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/entry_points.txt +3 -0
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses/NOTICE +5 -0
- airflow/providers/cncf/kubernetes/backcompat/pod.py +0 -119
- airflow/providers/cncf/kubernetes/backcompat/pod_runtime_info_env.py +0 -56
- airflow/providers/cncf/kubernetes/backcompat/volume.py +0 -62
- airflow/providers/cncf/kubernetes/backcompat/volume_mount.py +0 -58
- airflow/providers/cncf/kubernetes/example_dags/example_kubernetes.py +0 -163
- airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes.py +0 -66
- airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes_spark_pi.yaml +0 -57
- airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py +0 -622
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/METADATA +0 -452
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/NOTICE +0 -6
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/RECORD +0 -29
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/entry_points.txt +0 -3
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/top_level.txt +0 -1
- /airflow/providers/cncf/kubernetes/{example_dags → decorators}/__init__.py +0 -0
- {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses}/LICENSE +0 -0
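The listing above implies an import-path migration for downstream code: `operators/kubernetes_pod.py` is removed in favor of `operators/pod.py`, and the executor modules now live under `executors/`. A minimal sketch of the corresponding import changes (module paths are taken from the listing; the `KubernetesPodOperator` class name itself is assumed unchanged, since class contents are not shown here):

```python
# Before (provider 3.1.0):
# from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator

# After (provider 10.10.0rc1): module moved, class name assumed unchanged.
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator

# The executor shown in the hunk below is importable from the new executors package:
from airflow.providers.cncf.kubernetes.executors.kubernetes_executor import KubernetesExecutor
```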
airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py (new file)
@@ -0,0 +1,831 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+KubernetesExecutor.
+
+.. seealso::
+    For more information on how the KubernetesExecutor works, take a look at the guide:
+    :doc:`/kubernetes_executor`
+"""
+
+from __future__ import annotations
+
+import contextlib
+import json
+import logging
+import multiprocessing
+import time
+from collections import Counter, defaultdict
+from collections.abc import Sequence
+from contextlib import suppress
+from datetime import datetime
+from queue import Empty, Queue
+from typing import TYPE_CHECKING, Any
+
+from deprecated import deprecated
+from kubernetes.dynamic import DynamicClient
+from sqlalchemy import select
+
+from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator
+from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_0_PLUS
+
+try:
+    from airflow.cli.cli_config import ARG_LOGICAL_DATE
+except ImportError:  # 2.x compatibility.
+    from airflow.cli.cli_config import (  # type: ignore[attr-defined, no-redef]
+        ARG_EXECUTION_DATE as ARG_LOGICAL_DATE,
+    )
+from airflow.cli.cli_config import (
+    ARG_DAG_ID,
+    ARG_OUTPUT_PATH,
+    ARG_VERBOSE,
+    ActionCommand,
+    Arg,
+    GroupCommand,
+    lazy_load_command,
+    positive_int,
+)
+from airflow.configuration import conf
+from airflow.exceptions import AirflowProviderDeprecationWarning
+from airflow.executors.base_executor import BaseExecutor
+from airflow.providers.cncf.kubernetes.exceptions import PodMutationHookException, PodReconciliationError
+from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_types import (
+    ADOPTED,
+    POD_EXECUTOR_DONE_KEY,
+    KubernetesJob,
+    KubernetesResults,
+)
+from airflow.providers.cncf.kubernetes.kube_config import KubeConfig
+from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import annotations_to_key
+from airflow.stats import Stats
+from airflow.utils.log.logging_mixin import remove_escape_codes
+from airflow.utils.session import NEW_SESSION, provide_session
+from airflow.utils.state import TaskInstanceState
+
+if TYPE_CHECKING:
+    import argparse
+    from collections.abc import Sequence
+
+    from kubernetes import client
+    from kubernetes.client import models as k8s
+    from sqlalchemy.orm import Session
+
+    from airflow.executors import workloads
+    from airflow.models.taskinstance import TaskInstance
+    from airflow.models.taskinstancekey import TaskInstanceKey
+    from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import (
+        AirflowKubernetesScheduler,
+    )
+
+
+if AIRFLOW_V_3_0_PLUS:
+    from airflow.cli.cli_config import ARG_BUNDLE_NAME
+
+    ARG_COMPAT = ARG_BUNDLE_NAME
+else:
+    from airflow.cli.cli_config import ARG_SUBDIR  # type: ignore[attr-defined]
+
+    ARG_COMPAT = ARG_SUBDIR
+
+# CLI Args
+ARG_NAMESPACE = Arg(
+    ("--namespace",),
+    default=conf.get("kubernetes_executor", "namespace"),
+    help="Kubernetes Namespace. Default value is `[kubernetes] namespace` in configuration.",
+)
+
+ARG_MIN_PENDING_MINUTES = Arg(
+    ("--min-pending-minutes",),
+    default=30,
+    type=positive_int(allow_zero=False),
+    help=(
+        "Pending pods created before the time interval are to be cleaned up, "
+        "measured in minutes. Default value is 30(m). The minimum value is 5(m)."
+    ),
+)
+
+# CLI Commands
+KUBERNETES_COMMANDS = (
+    ActionCommand(
+        name="cleanup-pods",
+        help=(
+            "Clean up Kubernetes pods "
+            "(created by KubernetesExecutor/KubernetesPodOperator) "
+            "in evicted/failed/succeeded/pending states"
+        ),
+        func=lazy_load_command("airflow.providers.cncf.kubernetes.cli.kubernetes_command.cleanup_pods"),
+        args=(ARG_NAMESPACE, ARG_MIN_PENDING_MINUTES, ARG_VERBOSE),
+    ),
+    ActionCommand(
+        name="generate-dag-yaml",
+        help="Generate YAML files for all tasks in DAG. Useful for debugging tasks without "
+        "launching into a cluster",
+        func=lazy_load_command("airflow.providers.cncf.kubernetes.cli.kubernetes_command.generate_pod_yaml"),
+        args=(ARG_DAG_ID, ARG_LOGICAL_DATE, ARG_COMPAT, ARG_OUTPUT_PATH, ARG_VERBOSE),
+    ),
+)
+
+
+class KubernetesExecutor(BaseExecutor):
+    """Executor for Kubernetes."""
+
+    RUNNING_POD_LOG_LINES = 100
+    supports_ad_hoc_ti_run: bool = True
+
+    if TYPE_CHECKING and AIRFLOW_V_3_0_PLUS:
+        # In the v3 path, we store workloads, not commands as strings.
+        # TODO: TaskSDK: move this type change into BaseExecutor
+        queued_tasks: dict[TaskInstanceKey, workloads.All]  # type: ignore[assignment]
+
+    def __init__(self):
+        self.kube_config = KubeConfig()
+        self._manager = multiprocessing.Manager()
+        self.task_queue: Queue[KubernetesJob] = self._manager.Queue()
+        self.result_queue: Queue[KubernetesResults] = self._manager.Queue()
+        self.kube_scheduler: AirflowKubernetesScheduler | None = None
+        self.kube_client: client.CoreV1Api | None = None
+        self.scheduler_job_id: str | None = None
+        self.last_handled: dict[TaskInstanceKey, float] = {}
+        self.kubernetes_queue: str | None = None
+        self.task_publish_retries: Counter[TaskInstanceKey] = Counter()
+        self.task_publish_max_retries = conf.getint(
+            "kubernetes_executor", "task_publish_max_retries", fallback=0
+        )
+        self.completed: set[KubernetesResults] = set()
+        super().__init__(parallelism=self.kube_config.parallelism)
+
+    def _list_pods(self, query_kwargs):
+        query_kwargs["header_params"] = {
+            "Accept": "application/json;as=PartialObjectMetadataList;v=v1;g=meta.k8s.io"
+        }
+        dynamic_client = DynamicClient(self.kube_client.api_client)
+        pod_resource = dynamic_client.resources.get(api_version="v1", kind="Pod")
+        if self.kube_config.multi_namespace_mode:
+            if self.kube_config.multi_namespace_mode_namespace_list:
+                namespaces = self.kube_config.multi_namespace_mode_namespace_list
+            else:
+                namespaces = [None]
+        else:
+            namespaces = [self.kube_config.kube_namespace]
+
+        pods = []
+        for namespace in namespaces:
+            pods.extend(dynamic_client.get(resource=pod_resource, namespace=namespace, **query_kwargs).items)
+
+        return pods
+
+    def _make_safe_label_value(self, input_value: str | datetime) -> str:
+        """
+        Normalize a provided label to be of valid length and characters.
+
+        See airflow.providers.cncf.kubernetes.pod_generator.make_safe_label_value for more details.
+        """
+        # airflow.providers.cncf.kubernetes is an expensive import, locally import it here to
+        # speed up load times of the kubernetes_executor module.
+        from airflow.providers.cncf.kubernetes import pod_generator
+
+        if isinstance(input_value, datetime):
+            return pod_generator.datetime_to_label_safe_datestring(input_value)
+        return pod_generator.make_safe_label_value(input_value)
+
+    def get_pod_combined_search_str_to_pod_map(self) -> dict[str, k8s.V1Pod]:
+        """
+        List the worker pods owned by this scheduler and create a map containing pod combined search str -> pod.
+
+        For every pod, it creates two below entries in the map
+        dag_id={dag_id},task_id={task_id},airflow-worker={airflow_worker},<map_index={map_index}>,run_id={run_id}
+        """
+        # airflow worker label selector batch call
+        kwargs = {"label_selector": f"airflow-worker={self._make_safe_label_value(str(self.job_id))}"}
+        if self.kube_config.kube_client_request_args:
+            kwargs.update(self.kube_config.kube_client_request_args)
+        pod_list = self._list_pods(kwargs)
+
+        # create a set against pod query label fields
+        pod_combined_search_str_to_pod_map = {}
+        for pod in pod_list:
+            dag_id = pod.metadata.annotations.get("dag_id", None)
+            task_id = pod.metadata.annotations.get("task_id", None)
+            map_index = pod.metadata.annotations.get("map_index", None)
+            run_id = pod.metadata.annotations.get("run_id", None)
+            if dag_id is None or task_id is None:
+                continue
+            search_base_str = f"dag_id={dag_id},task_id={task_id}"
+            if map_index is not None:
+                search_base_str += f",map_index={map_index}"
+            if run_id is not None:
+                search_str = f"{search_base_str},run_id={run_id}"
+                pod_combined_search_str_to_pod_map[search_str] = pod
+        return pod_combined_search_str_to_pod_map
+
+    def start(self) -> None:
+        """Start the executor."""
+        self.log.info("Start Kubernetes executor")
+        self.scheduler_job_id = str(self.job_id)
+        self.log.debug("Start with scheduler_job_id: %s", self.scheduler_job_id)
+        from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import (
+            AirflowKubernetesScheduler,
+        )
+        from airflow.providers.cncf.kubernetes.kube_client import get_kube_client
+
+        self.kube_client = get_kube_client()
+        self.kube_scheduler = AirflowKubernetesScheduler(
+            kube_config=self.kube_config,
+            result_queue=self.result_queue,
+            kube_client=self.kube_client,
+            scheduler_job_id=self.scheduler_job_id,
+        )
+
+    def execute_async(
+        self,
+        key: TaskInstanceKey,
+        command: Any,
+        queue: str | None = None,
+        executor_config: Any | None = None,
+    ) -> None:
+        """Execute task asynchronously."""
+        if TYPE_CHECKING:
+            assert self.task_queue
+
+        if self.log.isEnabledFor(logging.DEBUG):
+            self.log.debug("Add task %s with command %s, executor_config %s", key, command, executor_config)
+        else:
+            self.log.info("Add task %s with command %s", key, command)
+
+        try:
+            kube_executor_config = PodGenerator.from_obj(executor_config)
+        except Exception:
+            self.log.error("Invalid executor_config for %s. Executor_config: %s", key, executor_config)
+            self.fail(key=key, info="Invalid executor_config passed")
+            return
+
+        if executor_config:
+            pod_template_file = executor_config.get("pod_template_file", None)
+        else:
+            pod_template_file = None
+        self.event_buffer[key] = (TaskInstanceState.QUEUED, self.scheduler_job_id)
+        self.task_queue.put(KubernetesJob(key, command, kube_executor_config, pod_template_file))
+        # We keep a temporary local record that we've handled this so we don't
+        # try and remove it from the QUEUED state while we process it
+        self.last_handled[key] = time.time()
+
+    def queue_workload(self, workload: workloads.All, session: Session | None) -> None:
+        from airflow.executors import workloads
+
+        if not isinstance(workload, workloads.ExecuteTask):
+            raise RuntimeError(f"{type(self)} cannot handle workloads of type {type(workload)}")
+        ti = workload.ti
+        self.queued_tasks[ti.key] = workload
+
+    def _process_workloads(self, workloads: Sequence[workloads.All]) -> None:
+        from airflow.executors.workloads import ExecuteTask
+
+        # Airflow V3 version
+        for w in workloads:
+            if not isinstance(w, ExecuteTask):
+                raise RuntimeError(f"{type(self)} cannot handle workloads of type {type(w)}")
+
+            # TODO: AIP-72 handle populating tokens once https://github.com/apache/airflow/issues/45107 is handled.
+            command = [w]
+            key = w.ti.key
+            queue = w.ti.queue
+            executor_config = w.ti.executor_config or {}
+
+            del self.queued_tasks[key]
+            self.execute_async(key=key, command=command, queue=queue, executor_config=executor_config)
+            self.running.add(key)
+
+    def sync(self) -> None:
+        """Synchronize task state."""
+        if TYPE_CHECKING:
+            assert self.scheduler_job_id
+            assert self.kube_scheduler
+            assert self.kube_config
+            assert self.result_queue
+            assert self.task_queue
+
+        if self.running:
+            self.log.debug("self.running: %s", self.running)
+        if self.queued_tasks:
+            self.log.debug("self.queued: %s", self.queued_tasks)
+        self.kube_scheduler.sync()
+
+        last_resource_version: dict[str, str] = defaultdict(lambda: "0")
+        with contextlib.suppress(Empty):
+            while True:
+                results = self.result_queue.get_nowait()
+                try:
+                    last_resource_version[results.namespace] = results.resource_version
+                    self.log.info("Changing state of %s to %s", results, results.state)
+                    try:
+                        self._change_state(results)
+                    except Exception as e:
+                        self.log.exception(
+                            "Exception: %s when attempting to change state of %s to %s, re-queueing.",
+                            e,
+                            results,
+                            results.state,
+                        )
+                        self.result_queue.put(results)
+                finally:
+                    self.result_queue.task_done()
+
+        for result in self.completed:
+            self._change_state(result)
+
+        from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import ResourceVersion
+
+        resource_instance = ResourceVersion()
+        for ns in resource_instance.resource_version:
+            resource_instance.resource_version[ns] = (
+                last_resource_version[ns] or resource_instance.resource_version[ns]
+            )
+
+        from kubernetes.client.rest import ApiException
+
+        with contextlib.suppress(Empty):
+            for _ in range(self.kube_config.worker_pods_creation_batch_size):
+                task = self.task_queue.get_nowait()
+
+                try:
+                    key = task.key
+                    self.kube_scheduler.run_next(task)
+                    self.task_publish_retries.pop(key, None)
+                except PodReconciliationError as e:
+                    self.log.exception(
+                        "Pod reconciliation failed, likely due to kubernetes library upgrade. "
+                        "Try clearing the task to re-run.",
+                    )
+                    self.fail(task[0], e)
+                except ApiException as e:
+                    try:
+                        if e.body:
+                            body = json.loads(e.body)
+                        else:
+                            # If no body content, use reason as the message
+                            body = {"message": e.reason}
+                    except (json.JSONDecodeError, ValueError, TypeError):
+                        # If the body is a string (e.g., in a 429 error), it can't be parsed as JSON.
+                        # Use the body directly as the message instead.
+                        body = {"message": e.body}
+
+                    retries = self.task_publish_retries[key]
+                    # In case of exceeded quota or conflict errors, requeue the task as per the task_publish_max_retries
+                    message = body.get("message", "")
+                    if (
+                        (str(e.status) == "403" and "exceeded quota" in message)
+                        or (str(e.status) == "409" and "object has been modified" in message)
+                        or str(e.status) == "500"
+                    ) and (self.task_publish_max_retries == -1 or retries < self.task_publish_max_retries):
+                        self.log.warning(
+                            "[Try %s of %s] Kube ApiException for Task: (%s). Reason: %r. Message: %s",
+                            self.task_publish_retries[key] + 1,
+                            self.task_publish_max_retries,
+                            key,
+                            e.reason,
+                            message,
+                        )
+                        self.task_queue.put(task)
+                        self.task_publish_retries[key] = retries + 1
+                    else:
+                        self.log.error("Pod creation failed with reason %r. Failing task", e.reason)
+                        key = task.key
+                        self.fail(key, e)
+                        self.task_publish_retries.pop(key, None)
+                except PodMutationHookException as e:
+                    key = task.key
+                    self.log.error(
+                        "Pod Mutation Hook failed for the task %s. Failing task. Details: %s",
+                        key,
+                        e.__cause__,
+                    )
+                    self.fail(key, e)
+                finally:
+                    self.task_queue.task_done()
+
+    @provide_session
+    def _change_state(
+        self,
+        results: KubernetesResults,
+        session: Session = NEW_SESSION,
+    ) -> None:
+        """Change state of the task based on KubernetesResults."""
+        if TYPE_CHECKING:
+            assert self.kube_scheduler
+
+        key = results.key
+        state = results.state
+        pod_name = results.pod_name
+        namespace = results.namespace
+        failure_details = results.failure_details
+
+        if state == TaskInstanceState.FAILED:
+            # Use pre-collected failure details from the watcher to avoid additional API calls
+            if failure_details:
+                pod_status = failure_details.get("pod_status")
+                pod_reason = failure_details.get("pod_reason")
+                pod_message = failure_details.get("pod_message")
+                container_state = failure_details.get("container_state")
+                container_reason = failure_details.get("container_reason")
+                container_message = failure_details.get("container_message")
+                exit_code = failure_details.get("exit_code")
+                container_type = failure_details.get("container_type")
+                container_name = failure_details.get("container_name")
+
+                task_key_str = f"{key.dag_id}.{key.task_id}.{key.try_number}"
+                self.log.warning(
+                    "Task %s failed in pod %s/%s. Pod phase: %s, reason: %s, message: %s, "
+                    "container_type: %s, container_name: %s, container_state: %s, container_reason: %s, "
+                    "container_message: %s, exit_code: %s",
+                    task_key_str,
+                    namespace,
+                    pod_name,
+                    pod_status,
+                    pod_reason,
+                    pod_message,
+                    container_type,
+                    container_name,
+                    container_state,
+                    container_reason,
+                    container_message,
+                    exit_code,
+                )
+            else:
+                task_key_str = f"{key.dag_id}.{key.task_id}.{key.try_number}"
+                self.log.warning(
+                    "Task %s failed in pod %s/%s (no details available)", task_key_str, namespace, pod_name
+                )
+
+        if state == ADOPTED:
+            # When the task pod is adopted by another executor,
+            # then remove the task from the current executor running queue.
+            try:
+                self.running.remove(key)
+            except KeyError:
+                self.log.debug("TI key not in running: %s", key)
+            return
+
+        if state == TaskInstanceState.RUNNING:
+            self.event_buffer[key] = state, None
+            return
+
+        if self.kube_config.delete_worker_pods:
+            if state != TaskInstanceState.FAILED or self.kube_config.delete_worker_pods_on_failure:
+                self.kube_scheduler.delete_pod(pod_name=pod_name, namespace=namespace)
+                self.log.info(
+                    "Deleted pod associated with the TI %s. Pod name: %s. Namespace: %s",
+                    key,
+                    pod_name,
+                    namespace,
+                )
+        else:
+            self.kube_scheduler.patch_pod_executor_done(pod_name=pod_name, namespace=namespace)
+            self.log.info("Patched pod %s in namespace %s to mark it as done", key, namespace)
+
+        try:
+            self.running.remove(key)
+        except KeyError:
+            self.log.debug("TI key not in running, not adding to event_buffer: %s", key)
+            return
+
+        # If we don't have a TI state, look it up from the db. event_buffer expects the TI state
+        if state is None:
+            from airflow.models.taskinstance import TaskInstance
+
+            filter_for_tis = TaskInstance.filter_for_tis([key])
+            if filter_for_tis is not None:
+                state = session.scalar(select(TaskInstance.state).where(filter_for_tis))
+            else:
+                state = None
+            state = TaskInstanceState(state) if state else None
+
+        self.event_buffer[key] = state, None
+
+    @staticmethod
+    def _get_pod_namespace(ti: TaskInstance):
+        pod_override = ti.executor_config.get("pod_override")
+        namespace = None
+        with suppress(Exception):
+            if pod_override is not None:
+                namespace = pod_override.metadata.namespace
+        return namespace or conf.get("kubernetes_executor", "namespace")
+
+    def get_task_log(self, ti: TaskInstance, try_number: int) -> tuple[list[str], list[str]]:
+        messages = []
+        log = []
+        try:
+            from airflow.providers.cncf.kubernetes.kube_client import get_kube_client
+            from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator
+
+            client = get_kube_client()
+
+            messages.append(f"Attempting to fetch logs from pod {ti.hostname} through kube API")
+            selector = PodGenerator.build_selector_for_k8s_executor_pod(
+                dag_id=ti.dag_id,
+                task_id=ti.task_id,
+                try_number=try_number,
+                map_index=ti.map_index,
+                run_id=ti.run_id,
+                airflow_worker=ti.queued_by_job_id,
+            )
+            namespace = self._get_pod_namespace(ti)
+            pod_list = client.list_namespaced_pod(
+                namespace=namespace,
+                label_selector=selector,
+            ).items
+            if not pod_list:
+                raise RuntimeError("Cannot find pod for ti %s", ti)
+            if len(pod_list) > 1:
+                raise RuntimeError("Found multiple pods for ti %s: %s", ti, pod_list)
+            res = client.read_namespaced_pod_log(
+                name=pod_list[0].metadata.name,
+                namespace=namespace,
+                container="base",
+                follow=False,
+                tail_lines=self.RUNNING_POD_LOG_LINES,
+                _preload_content=False,
+            )
+            for line in res:
+                log.append(remove_escape_codes(line.decode()))
+            if log:
+                messages.append("Found logs through kube API")
+        except Exception as e:
+            messages.append(f"Reading from k8s pod logs failed: {e}")
+        return messages, ["\n".join(log)]
+
+    def try_adopt_task_instances(self, tis: Sequence[TaskInstance]) -> Sequence[TaskInstance]:
+        with Stats.timer("kubernetes_executor.adopt_task_instances.duration"):
+            # Always flush TIs without queued_by_job_id
+            tis_to_flush = [ti for ti in tis if not ti.queued_by_job_id]
+            scheduler_job_ids = {ti.queued_by_job_id for ti in tis}
+            tis_to_flush_by_key = {ti.key: ti for ti in tis if ti.queued_by_job_id}
+            kube_client: client.CoreV1Api = self.kube_client
+            for scheduler_job_id in scheduler_job_ids:
+                scheduler_job_id_safe_label = self._make_safe_label_value(str(scheduler_job_id))
+                # We will look for any pods owned by the no-longer-running scheduler,
+                # but will exclude only successful pods, as those TIs will have a terminal state
+                # and not be up for adoption!
+                # Those workers that failed, however, are okay to adopt here as their TI will
+                # still be in queued.
+                query_kwargs = {
+                    "field_selector": "status.phase!=Succeeded",
+                    "label_selector": (
+                        "kubernetes_executor=True,"
+                        f"airflow-worker={scheduler_job_id_safe_label},{POD_EXECUTOR_DONE_KEY}!=True"
+                    ),
+                }
+                pod_list = self._list_pods(query_kwargs)
+                for pod in pod_list:
+                    self.adopt_launched_task(kube_client, pod, tis_to_flush_by_key)
+            self._adopt_completed_pods(kube_client)
+
+        # as this method can be retried within a short time frame
+        # (wrapped in a run_with_db_retries of scheduler_job_runner,
+        # and get retried due to an OperationalError, for example),
+        # there is a chance that in second attempt, adopt_launched_task will not be called even once
+        # as all pods are already adopted in the first attempt.
+        # and tis_to_flush_by_key will contain TIs that are already adopted.
+        # therefore, we need to check if the TIs are already adopted by the first attempt and remove them.
+        def _iter_tis_to_flush():
+            for key, ti in tis_to_flush_by_key.items():
+                if key in self.running:
+                    self.log.info("%s is already adopted, no need to flush.", ti)
+                else:
+                    yield ti
+
+        tis_to_flush.extend(_iter_tis_to_flush())
+        return tis_to_flush
+
+    @deprecated(
+        reason="Replaced by function `revoke_task`. Upgrade airflow core to make this go away.",
+        category=AirflowProviderDeprecationWarning,
+    )
+    def cleanup_stuck_queued_tasks(self, tis: list[TaskInstance]) -> list[str]:
+        """
+        Handle remnants of tasks that were failed because they were stuck in queued.
+
+        Tasks can get stuck in queued. If such a task is detected, it will be marked
+        as `UP_FOR_RETRY` if the task instance has remaining retries or marked as `FAILED`
+        if it doesn't.
+
+        :param tis: List of Task Instances to clean up
+        :return: List of readable task instances for a warning message
+        """
+        reprs = []
+        for ti in tis:
+            reprs.append(repr(ti))
+            self.revoke_task(ti=ti)
+            self.fail(ti.key)
+        return reprs
+
+    def revoke_task(self, *, ti: TaskInstance):
+        """
+        Revoke task that may be running.
+
+        :param ti: task instance to revoke
+        """
+        if TYPE_CHECKING:
+            assert self.kube_client
+            assert self.kube_scheduler
+        self.running.discard(ti.key)
+        self.queued_tasks.pop(ti.key, None)
+        pod_combined_search_str_to_pod_map = self.get_pod_combined_search_str_to_pod_map()
+        # Build the pod selector
+        base_label_selector = f"dag_id={ti.dag_id},task_id={ti.task_id}"
+        if ti.map_index >= 0:
+            # Old tasks _couldn't_ be mapped, so we don't have to worry about compat
+            base_label_selector += f",map_index={ti.map_index}"
+
+        search_str = f"{base_label_selector},run_id={ti.run_id}"
+        pod = pod_combined_search_str_to_pod_map.get(search_str, None)
+        if not pod:
+            self.log.warning("Cannot find pod for ti %s", ti)
+            return
+
+        self.kube_scheduler.patch_pod_revoked(pod_name=pod.metadata.name, namespace=pod.metadata.namespace)
+        self.kube_scheduler.delete_pod(pod_name=pod.metadata.name, namespace=pod.metadata.namespace)
+
+    def adopt_launched_task(
+        self,
+        kube_client: client.CoreV1Api,
+        pod: k8s.V1Pod,
+        tis_to_flush_by_key: dict[TaskInstanceKey, k8s.V1Pod],
+    ) -> None:
+        """
+        Patch existing pod so that the current KubernetesJobWatcher can monitor it via label selectors.
+
+        :param kube_client: kubernetes client for speaking to kube API
+        :param pod: V1Pod spec that we will patch with new label
+        :param tis_to_flush_by_key: TIs that will be flushed if they aren't adopted
+        """
+        if TYPE_CHECKING:
+            assert self.scheduler_job_id
+
+        self.log.info("attempting to adopt pod %s", pod.metadata.name)
+        ti_key = annotations_to_key(pod.metadata.annotations)
+        if ti_key not in tis_to_flush_by_key:
+            self.log.error("attempting to adopt taskinstance which was not specified by database: %s", ti_key)
+            return
+
+        new_worker_id_label = self._make_safe_label_value(self.scheduler_job_id)
+        from kubernetes.client.rest import ApiException
+
+        try:
+            kube_client.patch_namespaced_pod(
+                name=pod.metadata.name,
+                namespace=pod.metadata.namespace,
+                body={"metadata": {"labels": {"airflow-worker": new_worker_id_label}}},
+            )
+        except ApiException as e:
+            self.log.info("Failed to adopt pod %s. Reason: %s", pod.metadata.name, e)
+            return
+
+        del tis_to_flush_by_key[ti_key]
+        self.running.add(ti_key)
+
+    def _adopt_completed_pods(self, kube_client: client.CoreV1Api) -> None:
+        """
+        Patch completed pods so that the KubernetesJobWatcher can delete them.
+
+        :param kube_client: kubernetes client for speaking to kube API
+        """
+        if TYPE_CHECKING:
+            assert self.scheduler_job_id
+
+        new_worker_id_label = self._make_safe_label_value(self.scheduler_job_id)
+        query_kwargs = {
+            "field_selector": "status.phase=Succeeded",
+            "label_selector": (
+                "kubernetes_executor=True,"
+                f"airflow-worker!={new_worker_id_label},{POD_EXECUTOR_DONE_KEY}!=True"
+            ),
+        }
+        pod_list = self._list_pods(query_kwargs)
+        for pod in pod_list:
+            self.log.info("Attempting to adopt pod %s", pod.metadata.name)
+            from kubernetes.client.rest import ApiException
+
+            try:
+                kube_client.patch_namespaced_pod(
+                    name=pod.metadata.name,
+                    namespace=pod.metadata.namespace,
+                    body={"metadata": {"labels": {"airflow-worker": new_worker_id_label}}},
+                )
+            except ApiException as e:
+                self.log.info("Failed to adopt pod %s. Reason: %s", pod.metadata.name, e)
+                continue
+
+            ti_id = annotations_to_key(pod.metadata.annotations)
+            self.completed.add(
+                KubernetesResults(
+                    key=ti_id,
+                    state="completed",
+                    pod_name=pod.metadata.name,
+                    namespace=pod.metadata.namespace,
+                    resource_version=pod.metadata.resource_version,
+                    failure_details=None,
+                )
+            )
+
+    def _flush_task_queue(self) -> None:
+        if TYPE_CHECKING:
+            assert self.task_queue
+
+        self.log.debug("Executor shutting down, task_queue approximate size=%d", self.task_queue.qsize())
+        with contextlib.suppress(Empty):
+            while True:
+                task = self.task_queue.get_nowait()
+                # This is a new task to run thus ok to ignore.
+                self.log.warning("Executor shutting down, will NOT run task=%s", task)
+                self.task_queue.task_done()
+
+    def _flush_result_queue(self) -> None:
+        if TYPE_CHECKING:
+            assert self.result_queue
+
+        self.log.debug("Executor shutting down, result_queue approximate size=%d", self.result_queue.qsize())
+        with contextlib.suppress(Empty):
+            while True:
+                results = self.result_queue.get_nowait()
+                self.log.warning("Executor shutting down, flushing results=%s", results)
+                try:
+                    self.log.info(
+                        "Changing state of %s to %s : resource_version=%s",
+                        results,
+                        results.state,
+                        results.resource_version,
+                    )
+                    try:
+                        self._change_state(results)
+                    except Exception as e:
+                        self.log.exception(
+                            "Ignoring exception: %s when attempting to change state of %s to %s.",
+                            e,
+                            results,
+                            results.state,
+                        )
+                finally:
+                    self.result_queue.task_done()
+
+    def end(self) -> None:
+        """Shut down the executor."""
+        if TYPE_CHECKING:
+            assert self.task_queue
+            assert self.result_queue
+            assert self.kube_scheduler
+
+        self.log.info("Shutting down Kubernetes executor")
+        try:
+            self.log.debug("Flushing task_queue...")
+            self._flush_task_queue()
+            self.log.debug("Flushing result_queue...")
+            self._flush_result_queue()
+            # Both queues should be empty...
+            self.task_queue.join()
+            self.result_queue.join()
+        except ConnectionResetError:
+            self.log.exception("Connection Reset error while flushing task_queue and result_queue.")
+        except Exception:
+            self.log.exception("Unknown error while flushing task queue and result queue.")
+        if self.kube_scheduler:
+            try:
+                self.kube_scheduler.terminate()
+            except Exception:
+                self.log.exception("Unknown error while flushing task queue and result queue.")
+        self._manager.shutdown()
+
+    def terminate(self):
+        """Terminate the executor is not doing anything."""
+
+    @staticmethod
+    def get_cli_commands() -> list[GroupCommand]:
+        return [
+            GroupCommand(
+                name="kubernetes",
+                help="Tools to help run the KubernetesExecutor",
+                subcommands=KUBERNETES_COMMANDS,
+            )
+        ]
+
+
+def _get_parser() -> argparse.ArgumentParser:
+    """
+    Generate documentation; used by Sphinx.
+
+    :meta private:
+    """
+    return KubernetesExecutor._get_parser()