apache-airflow-providers-cncf-kubernetes 3.1.0__py3-none-any.whl → 10.10.0rc1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
- airflow/providers/cncf/kubernetes/__init__.py +18 -23
- airflow/providers/cncf/kubernetes/backcompat/__init__.py +17 -0
- airflow/providers/cncf/kubernetes/backcompat/backwards_compat_converters.py +31 -49
- airflow/providers/cncf/kubernetes/callbacks.py +200 -0
- airflow/providers/cncf/kubernetes/cli/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/cli/kubernetes_command.py +195 -0
- airflow/providers/cncf/kubernetes/decorators/kubernetes.py +163 -0
- airflow/providers/cncf/kubernetes/decorators/kubernetes_cmd.py +118 -0
- airflow/providers/cncf/kubernetes/exceptions.py +37 -0
- airflow/providers/cncf/kubernetes/executors/__init__.py +17 -0
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +831 -0
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor_types.py +91 -0
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py +736 -0
- airflow/providers/cncf/kubernetes/executors/local_kubernetes_executor.py +306 -0
- airflow/providers/cncf/kubernetes/get_provider_info.py +249 -50
- airflow/providers/cncf/kubernetes/hooks/kubernetes.py +846 -112
- airflow/providers/cncf/kubernetes/k8s_model.py +62 -0
- airflow/providers/cncf/kubernetes/kube_client.py +156 -0
- airflow/providers/cncf/kubernetes/kube_config.py +125 -0
- airflow/providers/cncf/kubernetes/kubernetes_executor_templates/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/kubernetes_executor_templates/basic_template.yaml +79 -0
- airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +165 -0
- airflow/providers/cncf/kubernetes/operators/custom_object_launcher.py +368 -0
- airflow/providers/cncf/kubernetes/operators/job.py +646 -0
- airflow/providers/cncf/kubernetes/operators/kueue.py +132 -0
- airflow/providers/cncf/kubernetes/operators/pod.py +1417 -0
- airflow/providers/cncf/kubernetes/operators/resource.py +191 -0
- airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +336 -35
- airflow/providers/cncf/kubernetes/pod_generator.py +592 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_image_template.yaml +68 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml +74 -0
- airflow/providers/cncf/kubernetes/pod_template_file_examples/git_sync_template.yaml +95 -0
- airflow/providers/cncf/kubernetes/python_kubernetes_script.jinja2 +51 -0
- airflow/providers/cncf/kubernetes/python_kubernetes_script.py +92 -0
- airflow/providers/cncf/kubernetes/resource_convert/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/resource_convert/configmap.py +52 -0
- airflow/providers/cncf/kubernetes/resource_convert/env_variable.py +39 -0
- airflow/providers/cncf/kubernetes/resource_convert/secret.py +40 -0
- airflow/providers/cncf/kubernetes/secret.py +128 -0
- airflow/providers/cncf/kubernetes/sensors/spark_kubernetes.py +30 -14
- airflow/providers/cncf/kubernetes/template_rendering.py +81 -0
- airflow/providers/cncf/kubernetes/triggers/__init__.py +16 -0
- airflow/providers/cncf/kubernetes/triggers/job.py +176 -0
- airflow/providers/cncf/kubernetes/triggers/pod.py +344 -0
- airflow/providers/cncf/kubernetes/utils/__init__.py +3 -0
- airflow/providers/cncf/kubernetes/utils/container.py +118 -0
- airflow/providers/cncf/kubernetes/utils/delete_from.py +154 -0
- airflow/providers/cncf/kubernetes/utils/k8s_resource_iterator.py +46 -0
- airflow/providers/cncf/kubernetes/utils/pod_manager.py +887 -152
- airflow/providers/cncf/kubernetes/utils/xcom_sidecar.py +25 -16
- airflow/providers/cncf/kubernetes/version_compat.py +38 -0
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/METADATA +125 -0
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/RECORD +62 -0
- {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info}/WHEEL +1 -2
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/entry_points.txt +3 -0
- apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses/NOTICE +5 -0
- airflow/providers/cncf/kubernetes/backcompat/pod.py +0 -119
- airflow/providers/cncf/kubernetes/backcompat/pod_runtime_info_env.py +0 -56
- airflow/providers/cncf/kubernetes/backcompat/volume.py +0 -62
- airflow/providers/cncf/kubernetes/backcompat/volume_mount.py +0 -58
- airflow/providers/cncf/kubernetes/example_dags/example_kubernetes.py +0 -163
- airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes.py +0 -66
- airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes_spark_pi.yaml +0 -57
- airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py +0 -622
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/METADATA +0 -452
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/NOTICE +0 -6
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/RECORD +0 -29
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/entry_points.txt +0 -3
- apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/top_level.txt +0 -1
- /airflow/providers/cncf/kubernetes/{example_dags → decorators}/__init__.py +0 -0
- {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses}/LICENSE +0 -0
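The most consequential move in the list above is the operator module rename: `operators/kubernetes_pod.py` is removed and `operators/pod.py` is added, so imports written against 3.1.0 need updating. A minimal sketch of the migration, assuming the class keeps its `KubernetesPodOperator` name (the task parameters below are illustrative, not taken from this diff):

```python
# Old import path under the 3.1.0 layout (operators/kubernetes_pod.py, removed above):
# from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator

# New import path under the 10.x layout (operators/pod.py, added above):
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator

# Illustrative usage only; these parameter values are assumptions, not part of the diff.
hello = KubernetesPodOperator(
    task_id="hello",
    name="hello-pod",
    namespace="default",
    image="python:3.12-slim",
    cmds=["python", "-c", "print('hello from the pod')"],
)
```

The largest single-file rewrite, `airflow/providers/cncf/kubernetes/utils/pod_manager.py` (+887 −152), is shown in full below. Removed lines whose text the diff viewer did not capture appear as bare `-` markers or truncated fragments.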
```diff
--- a/airflow/providers/cncf/kubernetes/utils/pod_manager.py
+++ b/airflow/providers/cncf/kubernetes/utils/pod_manager.py
@@ -14,35 +14,61 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Launches PODs"""
+"""Launches PODs."""
+
+from __future__ import annotations
+
+import asyncio
+import enum
 import json
 import math
 import time
+from collections.abc import Callable, Generator, Iterable
 from contextlib import closing
-from
-from
+from dataclasses import dataclass
+from datetime import timedelta
+from typing import TYPE_CHECKING, Literal, cast

 import pendulum
 import tenacity
 from kubernetes import client, watch
-from kubernetes.client.models.v1_pod import V1Pod
 from kubernetes.client.rest import ApiException
 from kubernetes.stream import stream as kubernetes_stream
 from pendulum import DateTime
 from pendulum.parsing.exceptions import ParserError
-from urllib3.exceptions import HTTPError
+from urllib3.exceptions import HTTPError, TimeoutError

 from airflow.exceptions import AirflowException
-from airflow.kubernetes.
-from airflow.kubernetes.
+from airflow.providers.cncf.kubernetes.callbacks import ExecutionMode, KubernetesPodOperatorCallback
+from airflow.providers.cncf.kubernetes.utils.container import (
+    container_is_completed,
+    container_is_running,
+    container_is_terminated,
+    container_is_wait,
+    get_container_status,
+)
+from airflow.providers.cncf.kubernetes.utils.xcom_sidecar import PodDefaults
 from airflow.utils.log.logging_mixin import LoggingMixin
+from airflow.utils.timezone import utcnow

 if TYPE_CHECKING:
-
-
-
-
-
+    from kubernetes.client.models.core_v1_event_list import CoreV1EventList
+    from kubernetes.client.models.v1_container_state import V1ContainerState
+    from kubernetes.client.models.v1_container_state_waiting import V1ContainerStateWaiting
+    from kubernetes.client.models.v1_object_reference import V1ObjectReference
+    from kubernetes.client.models.v1_pod import V1Pod
+    from kubernetes.client.models.v1_pod_condition import V1PodCondition
+    from urllib3.response import HTTPResponse
+
+    from airflow.providers.cncf.kubernetes.hooks.kubernetes import AsyncKubernetesHook
+
+
+EMPTY_XCOM_RESULT = "__airflow_xcom_result_empty__"
+"""
+Sentinel for no xcom result.
+
+:meta private:
+"""


 class PodLaunchFailedException(AirflowException):
@@ -50,90 +76,283 @@ class PodLaunchFailedException(AirflowException):


 def should_retry_start_pod(exception: BaseException) -> bool:
-    """Check if an Exception indicates a transient error and warrants retrying"""
+    """Check if an Exception indicates a transient error and warrants retrying."""
     if isinstance(exception, ApiException):
-        return exception.status == 409
+        return str(exception.status) == "409"
     return False


 class PodPhase:
     """
-    Possible pod phases
+    Possible pod phases.
+
     See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase.
     """

-    PENDING =
-    RUNNING =
-    FAILED =
-    SUCCEEDED =
+    PENDING = "Pending"
+    RUNNING = "Running"
+    FAILED = "Failed"
+    SUCCEEDED = "Succeeded"

     terminal_states = {FAILED, SUCCEEDED}


-def
+def check_exception_is_kubernetes_api_unauthorized(exc: BaseException):
+    return isinstance(exc, ApiException) and exc.status and str(exc.status) == "401"
+
+
+async def watch_pod_events(
+    pod_manager: PodManager | AsyncPodManager,
+    pod: V1Pod,
+    check_interval: float = 1,
+) -> None:
     """
-
-
+    Read pod events and write them to the log.
+
+    This function supports both asynchronous and synchronous pod managers.
+
+    :param pod_manager: The pod manager instance (PodManager or AsyncPodManager).
+    :param pod: The pod object to monitor.
+    :param check_interval: Interval (in seconds) between checks.
     """
-
-
-
-
-
-
-
+    num_events = 0
+    is_async = isinstance(pod_manager, AsyncPodManager)
+    while not pod_manager.stop_watching_events:
+        if is_async:
+            events = await pod_manager.read_pod_events(pod)
+        else:
+            events = pod_manager.read_pod_events(pod)
+        for new_event in events.items[num_events:]:
+            involved_object: V1ObjectReference = new_event.involved_object
+            pod_manager.log.info(
+                "The Pod has an Event: %s from %s", new_event.message, involved_object.field_path
+            )
+        num_events = len(events.items)
+        await asyncio.sleep(check_interval)


-
+async def await_pod_start(
+    pod_manager: PodManager | AsyncPodManager,
+    pod: V1Pod,
+    schedule_timeout: int = 120,
+    startup_timeout: int = 120,
+    check_interval: float = 1,
+):
     """
-
-
+    Monitor the startup phase of a Kubernetes pod, waiting for it to leave the ``Pending`` state.
+
+    This function is shared by both PodManager and AsyncPodManager to provide consistent pod startup tracking.
+
+    :param pod_manager: The pod manager instance (PodManager or AsyncPodManager).
+    :param pod: The pod object to monitor.
+    :param schedule_timeout: Maximum time (in seconds) to wait for the pod to be scheduled.
+    :param startup_timeout: Maximum time (in seconds) to wait for the pod to start running after being scheduled.
+    :param check_interval: Interval (in seconds) between status checks.
+    :param is_async: Set to True if called in an async context; otherwise, False.
+    """
+    pod_manager.log.info("::group::Waiting until %ss to get the POD scheduled...", schedule_timeout)
+    pod_was_scheduled = False
+    start_check_time = time.time()
+    is_async = isinstance(pod_manager, AsyncPodManager)
+    while True:
+        if is_async:
+            remote_pod = await pod_manager.read_pod(pod)
+        else:
+            remote_pod = pod_manager.read_pod(pod)
+        pod_status = remote_pod.status
+        if pod_status.phase != PodPhase.PENDING:
+            pod_manager.stop_watching_events = True
+            pod_manager.log.info("::endgroup::")
+            break
+
+        # Check for timeout
+        pod_conditions: list[V1PodCondition] = pod_status.conditions
+        if pod_conditions and any(
+            (condition.type == "PodScheduled" and condition.status == "True") for condition in pod_conditions
+        ):
+            if not pod_was_scheduled:
+                # POD was initially scheduled update timeout for getting POD launched
+                pod_was_scheduled = True
+                start_check_time = time.time()
+                pod_manager.log.info("Waiting %ss to get the POD running...", startup_timeout)
+
+            if time.time() - start_check_time >= startup_timeout:
+                pod_manager.log.info("::endgroup::")
+                raise PodLaunchTimeoutException(
+                    f"Pod took too long to start. More than {startup_timeout}s. Check the pod events in kubernetes."
+                )
+        else:
+            if time.time() - start_check_time >= schedule_timeout:
+                pod_manager.log.info("::endgroup::")
+                raise PodLaunchTimeoutException(
+                    f"Pod took too long to be scheduled on the cluster, giving up. More than {schedule_timeout}s. Check the pod events in kubernetes."
+                )
+
+        # Check for general problems to terminate early - ErrImagePull
+        if pod_status.container_statuses:
+            for container_status in pod_status.container_statuses:
+                container_state: V1ContainerState = container_status.state
+                container_waiting: V1ContainerStateWaiting | None = container_state.waiting
+                if container_waiting:
+                    if container_waiting.reason in ["ErrImagePull", "InvalidImageName"]:
+                        pod_manager.log.info("::endgroup::")
+                        raise PodLaunchFailedException(
+                            f"Pod docker image cannot be pulled, unable to start: {container_waiting.reason}"
+                            f"\n{container_waiting.message}"
+                        )
+
+        await asyncio.sleep(check_interval)
+
+
+class PodLaunchTimeoutException(AirflowException):
+    """When pod does not leave the ``Pending`` phase within specified timeout."""
+
+
+class PodNotFoundException(AirflowException):
+    """Expected pod does not exist in kube-api."""
+
+
+class PodLogsConsumer:
+    """
+    Responsible for pulling pod logs from a stream with checking a container status before reading data.
+
+    This class is a workaround for the issue https://github.com/apache/airflow/issues/23497.
+
+    :param response: HTTP response with logs
+    :param pod: Pod instance from Kubernetes client
+    :param pod_manager: Pod manager instance
+    :param container_name: Name of the container that we're reading logs from
+    :param post_termination_timeout: (Optional) The period of time in seconds representing for how long time
+        logs are available after the container termination.
+    :param read_pod_cache_timeout: (Optional) The container's status cache lifetime.
+        The container status is cached to reduce API calls.
+
+    :meta private:
     """

     def __init__(
         self,
-
-
-
+        response: HTTPResponse,
+        pod: V1Pod,
+        pod_manager: PodManager,
+        container_name: str,
+        post_termination_timeout: int = 120,
+        read_pod_cache_timeout: int = 120,
+    ):
+        self.response = response
+        self.pod = pod
+        self.pod_manager = pod_manager
+        self.container_name = container_name
+        self.post_termination_timeout = post_termination_timeout
+        self.last_read_pod_at = None
+        self.read_pod_cache = None
+        self.read_pod_cache_timeout = read_pod_cache_timeout
+
+    def __iter__(self) -> Generator[bytes, None, None]:
+        r"""Yield log items divided by the '\n' symbol."""
+        incomplete_log_item: list[bytes] = []
+        if self.logs_available():
+            for data_chunk in self.response.stream(amt=None, decode_content=True):
+                if b"\n" in data_chunk:
+                    log_items = data_chunk.split(b"\n")
+                    yield from self._extract_log_items(incomplete_log_item, log_items)
+                    incomplete_log_item = self._save_incomplete_log_item(log_items[-1])
+                else:
+                    incomplete_log_item.append(data_chunk)
+                if not self.logs_available():
+                    break
+        if incomplete_log_item:
+            yield b"".join(incomplete_log_item)
+
+    @staticmethod
+    def _extract_log_items(incomplete_log_item: list[bytes], log_items: list[bytes]):
+        yield b"".join(incomplete_log_item) + log_items[0] + b"\n"
+        for x in log_items[1:-1]:
+            yield x + b"\n"
+
+    @staticmethod
+    def _save_incomplete_log_item(sub_chunk: bytes):
+        return [sub_chunk] if [sub_chunk] else []
+
+    def logs_available(self):
+        remote_pod = self.read_pod()
+        if container_is_running(pod=remote_pod, container_name=self.container_name):
+            return True
+        container_status = get_container_status(pod=remote_pod, container_name=self.container_name)
+        state = container_status.state if container_status else None
+        terminated = state.terminated if state else None
+        if terminated:
+            termination_time = terminated.finished_at
+            if termination_time:
+                return termination_time + timedelta(seconds=self.post_termination_timeout) > utcnow()
+        return False
+
+    def read_pod(self):
+        _now = utcnow()
+        if (
+            self.read_pod_cache is None
+            or self.last_read_pod_at + timedelta(seconds=self.read_pod_cache_timeout) < _now
+        ):
+            self.read_pod_cache = self.pod_manager.read_pod(self.pod)
+            self.last_read_pod_at = _now
+        return self.read_pod_cache
+
+
+@dataclass
+class PodLoggingStatus:
+    """Return the status of the pod and last log time when exiting from `fetch_container_logs`."""
+
+    running: bool
+    last_log_time: DateTime | None
+
+
+class PodManager(LoggingMixin):
+    """Create, monitor, and otherwise interact with Kubernetes pods for use with the KubernetesPodOperator."""
+
+    def __init__(
+        self,
+        kube_client: client.CoreV1Api,
+        callbacks: list[type[KubernetesPodOperatorCallback]] | None = None,
     ):
         """
-
+        Create the launcher.

         :param kube_client: kubernetes client
-        :param
-        :param cluster_context: context of the cluster
+        :param callbacks:
         """
         super().__init__()
-        self._client = kube_client
+        self._client = kube_client
         self._watch = watch.Watch()
+        self._callbacks = callbacks or []
+        self.stop_watching_events = False

     def run_pod_async(self, pod: V1Pod, **kwargs) -> V1Pod:
-        """
+        """Run POD asynchronously."""
         sanitized_pod = self._client.api_client.sanitize_for_serialization(pod)
         json_pod = json.dumps(sanitized_pod, indent=2)

-        self.log.debug(
+        self.log.debug("Pod Creation Request: \n%s", json_pod)
         try:
             resp = self._client.create_namespaced_pod(
                 body=sanitized_pod, namespace=pod.metadata.namespace, **kwargs
             )
-            self.log.debug(
+            self.log.debug("Pod Creation Response: %s", resp)
         except Exception as e:
             self.log.exception(
-
+                "Exception when attempting to create Namespaced Pod: %s", str(json_pod).replace("\n", " ")
             )
             raise e
         return resp

     def delete_pod(self, pod: V1Pod) -> None:
-        """
+        """Delete POD."""
         try:
             self._client.delete_namespaced_pod(
                 pod.metadata.name, pod.metadata.namespace, body=client.V1DeleteOptions()
             )
         except ApiException as e:
             # If the pod is already deleted
-            if e.status != 404:
+            if str(e.status) != "404":
                 raise

     @tenacity.retry(
@@ -143,209 +362,725 @@ class PodManager(LoggingMixin):
         retry=tenacity.retry_if_exception(should_retry_start_pod),
     )
     def create_pod(self, pod: V1Pod) -> V1Pod:
-        """
+        """Launch the pod asynchronously."""
         return self.run_pod_async(pod)

-    def
+    async def watch_pod_events(self, pod: V1Pod, check_interval: int = 1) -> None:
+        """Read pod events and writes into log."""
+        await watch_pod_events(pod_manager=self, pod=pod, check_interval=check_interval)
+
+    async def await_pod_start(
+        self, pod: V1Pod, schedule_timeout: int = 120, startup_timeout: int = 120, check_interval: int = 1
+    ) -> None:
         """
-
+        Wait for the pod to reach phase other than ``Pending``.

         :param pod:
+        :param schedule_timeout: Timeout (in seconds) for pod stay in schedule state
+            (if pod is taking to long in schedule state, fails task)
         :param startup_timeout: Timeout (in seconds) for startup of the pod
-            (if pod is pending for too long, fails task)
+            (if pod is pending for too long after being scheduled, fails task)
+        :param check_interval: Interval (in seconds) between checks
         :return:
         """
-
-
-
-
-
-
-
-
-
-
-
+        await await_pod_start(
+            pod_manager=self,
+            pod=pod,
+            schedule_timeout=schedule_timeout,
+            startup_timeout=startup_timeout,
+            check_interval=check_interval,
+        )
+
+    def _log_message(
+        self,
+        message: str,
+        container_name: str,
+        container_name_log_prefix_enabled: bool,
+        log_formatter: Callable[[str, str], str] | None,
+    ) -> None:
+        """Log a message with appropriate formatting."""
+        if is_log_group_marker(message):
+            print(message)
+        else:
+            if log_formatter:
+                formatted_message = log_formatter(container_name, message)
+                self.log.info("%s", formatted_message)
+            else:
+                log_message = (
+                    f"[{container_name}] {message}" if container_name_log_prefix_enabled else message
                 )
-
-            time.sleep(1)
+                self.log.info("%s", log_message)

-    def
+    def fetch_container_logs(
+        self,
+        pod: V1Pod,
+        container_name: str,
+        *,
+        follow=False,
+        since_time: DateTime | None = None,
+        post_termination_timeout: int = 120,
+        container_name_log_prefix_enabled: bool = True,
+        log_formatter: Callable[[str, str], str] | None = None,
+    ) -> PodLoggingStatus:
         """
-
+        Follow the logs of container and stream to airflow logging.
+
         Returns when container exits.

-
-
-
+        Between when the pod starts and logs being available, there might be a delay due to CSR not approved
+        and signed yet. In such situation, ApiException is thrown. This is why we are retrying on this
+        specific exception.
+
+        :meta private:
         """

-        def
+        def consume_logs(*, since_time: DateTime | None = None) -> tuple[DateTime | None, Exception | None]:
             """
-
+            Try to follow container logs until container completes.
+
             For a long-running container, sometimes the log read may be interrupted
             Such errors of this kind are suppressed.

             Returns the last timestamp observed in logs.
             """
-
+            exception = None
+            last_captured_timestamp = None
+            # We timeout connections after 30 minutes because otherwise they can get
+            # stuck forever. The 30 is somewhat arbitrary.
+            # As a consequence, a TimeoutError will be raised no more than 30 minutes
+            # after starting read.
+            connection_timeout = 60 * 30
+            # We set a shorter read timeout because that helps reduce *connection* timeouts
+            # (since the connection will be restarted periodically). And with read timeout,
+            # we don't need to worry about either duplicate messages or losing messages; we
+            # can safely resume from a few seconds later
+            read_timeout = 60 * 5
             try:
+                since_seconds = None
+                if since_time:
+                    try:
+                        since_seconds = math.ceil((pendulum.now() - since_time).total_seconds())
+                    except TypeError:
+                        self.log.warning(
+                            "Error calculating since_seconds with since_time %s. Using None instead.",
+                            since_time,
+                        )
                 logs = self.read_pod_logs(
                     pod=pod,
                     container_name=container_name,
                     timestamps=True,
-                    since_seconds=
-
-
-
-                for line in logs:
-                    timestamp, message = self.parse_log_line(line.decode('utf-8'))
-                    self.log.info(message)
-            except BaseHTTPError:  # Catches errors like ProtocolError(TimeoutError).
-                self.log.warning(
-                    'Failed to read logs for pod %s',
-                    pod.metadata.name,
-                    exc_info=True,
+                    since_seconds=since_seconds,
+                    follow=follow,
+                    post_termination_timeout=post_termination_timeout,
+                    _request_timeout=(connection_timeout, read_timeout),
                 )
-
+                message_to_log = None
+                message_timestamp = None
+                progress_callback_lines = []
+                try:
+                    for raw_line in logs:
+                        line = raw_line.decode("utf-8", errors="backslashreplace")
+                        line_timestamp, message = parse_log_line(line)
+                        if line_timestamp:  # detect new log line
+                            if message_to_log is None:  # first line in the log
+                                message_to_log = message
+                                message_timestamp = line_timestamp
+                                progress_callback_lines.append(line)
+                            else:  # previous log line is complete
+                                for line in progress_callback_lines:
+                                    for callback in self._callbacks:
+                                        callback.progress_callback(
+                                            line=line, client=self._client, mode=ExecutionMode.SYNC
+                                        )
+                                if message_to_log is not None:
+                                    self._log_message(
+                                        message_to_log,
+                                        container_name,
+                                        container_name_log_prefix_enabled,
+                                        log_formatter,
+                                    )
+                                last_captured_timestamp = message_timestamp
+                                message_to_log = message
+                                message_timestamp = line_timestamp
+                                progress_callback_lines = [line]
+                        else:  # continuation of the previous log line
+                            message_to_log = f"{message_to_log}\n{message}"
+                            progress_callback_lines.append(line)
+                finally:
+                    # log the last line and update the last_captured_timestamp
+                    for line in progress_callback_lines:
+                        for callback in self._callbacks:
+                            callback.progress_callback(
+                                line=line, client=self._client, mode=ExecutionMode.SYNC
+                            )
+                    if message_to_log is not None:
+                        self._log_message(
+                            message_to_log, container_name, container_name_log_prefix_enabled, log_formatter
+                        )
+                    last_captured_timestamp = message_timestamp
+            except TimeoutError as e:
+                # in case of timeout, increment return time by 2 seconds to avoid
+                # duplicate log entries
+                if val := (last_captured_timestamp or since_time):
+                    return val.add(seconds=2), e
+            except HTTPError as e:
+                exception = e
+                self._http_error_timestamps = getattr(self, "_http_error_timestamps", [])
+                self._http_error_timestamps = [
+                    t for t in self._http_error_timestamps if t > utcnow() - timedelta(seconds=60)
+                ]
+                self._http_error_timestamps.append(utcnow())
+                # Log only if more than 2 errors occurred in the last 60 seconds
+                if len(self._http_error_timestamps) > 2:
+                    self.log.exception(
+                        "Reading of logs interrupted for container %r; will retry.",
+                        container_name,
+                    )
+            return last_captured_timestamp or since_time, exception

-
+        # note: `read_pod_logs` follows the logs, so we shouldn't necessarily *need* to
+        # loop as we do here. But in a long-running process we might temporarily lose connectivity.
+        # So the looping logic is there to let us resume following the logs.
+        last_log_time = since_time
         while True:
-            last_log_time =
+            last_log_time, exc = consume_logs(since_time=last_log_time)
             if not self.container_is_running(pod, container_name=container_name):
-                return
-
+                return PodLoggingStatus(running=False, last_log_time=last_log_time)
+            if not follow:
+                return PodLoggingStatus(running=True, last_log_time=last_log_time)
+            # a timeout is a normal thing and we ignore it and resume following logs
+            if not isinstance(exc, TimeoutError):
                 self.log.warning(
-
+                    "Pod %s log read interrupted but container %s still running. Logs generated in the last one second might get duplicated.",
                     pod.metadata.name,
                     container_name,
                 )
-            time.sleep(1)
-
-    def await_container_completion(self, pod: V1Pod, container_name: str) -> None:
-        while not self.container_is_running(pod=pod, container_name=container_name):
             time.sleep(1)

-    def
+    def _reconcile_requested_log_containers(
+        self, requested: Iterable[str] | str | bool | None, actual: list[str], pod_name
+    ) -> list[str]:
+        """Return actual containers based on requested."""
+        containers_to_log = []
+        if actual:
+            if isinstance(requested, str):
+                # fetch logs only for requested container if only one container is provided
+                if requested in actual:
+                    containers_to_log.append(requested)
+                else:
+                    self.log.error(
+                        "container %s whose logs were requested not found in the pod %s",
+                        requested,
+                        pod_name,
+                    )
+            elif isinstance(requested, bool):
+                # if True is provided, get logs for all the containers
+                if requested is True:
+                    containers_to_log.extend(actual)
+                else:
+                    self.log.error(
+                        "False is not a valid value for container_logs",
+                    )
+            else:
+                # if a sequence of containers are provided, iterate for every container in the pod
+                if isinstance(requested, Iterable):
+                    for container in requested:
+                        if container in actual:
+                            containers_to_log.append(container)
+                        else:
+                            self.log.error(
+                                "Container %s whose logs were requests not found in the pod %s",
+                                container,
+                                pod_name,
+                            )
+                else:
+                    self.log.error(
+                        "Invalid type %s specified for container names input parameter", type(requested)
+                    )
+        else:
+            self.log.error("Could not retrieve containers for the pod: %s", pod_name)
+        return containers_to_log
+
+    def fetch_requested_init_container_logs(
+        self,
+        pod: V1Pod,
+        init_containers: Iterable[str] | str | Literal[True] | None,
+        follow_logs=False,
+        container_name_log_prefix_enabled: bool = True,
+        log_formatter: Callable[[str, str], str] | None = None,
+    ) -> list[PodLoggingStatus]:
         """
-
+        Follow the logs of containers in the specified pod and publish it to airflow logging.
+
+        Returns when all the containers exit.
+
+        :meta private:
+        """
+        pod_logging_statuses = []
+        all_containers = self.get_init_container_names(pod)
+        containers_to_log = self._reconcile_requested_log_containers(
+            requested=init_containers,
+            actual=all_containers,
+            pod_name=pod.metadata.name,
+        )
+        # sort by spec.initContainers because containers runs sequentially
+        containers_to_log = sorted(containers_to_log, key=lambda cn: all_containers.index(cn))
+        for c in containers_to_log:
+            self._await_init_container_start(pod=pod, container_name=c)
+            status = self.fetch_container_logs(
+                pod=pod,
+                container_name=c,
+                follow=follow_logs,
+                container_name_log_prefix_enabled=container_name_log_prefix_enabled,
+                log_formatter=log_formatter,
+            )
+            pod_logging_statuses.append(status)
+        return pod_logging_statuses
+
+    def fetch_requested_container_logs(
+        self,
+        pod: V1Pod,
+        containers: Iterable[str] | str | Literal[True],
+        follow_logs=False,
+        container_name_log_prefix_enabled: bool = True,
+        log_formatter: Callable[[str, str], str] | None = None,
+    ) -> list[PodLoggingStatus]:
+        """
+        Follow the logs of containers in the specified pod and publish it to airflow logging.
+
+        Returns when all the containers exit.
+
+        :meta private:
+        """
+        pod_logging_statuses = []
+        all_containers = self.get_container_names(pod)
+        containers_to_log = self._reconcile_requested_log_containers(
+            requested=containers,
+            actual=all_containers,
+            pod_name=pod.metadata.name,
+        )
+        for c in containers_to_log:
+            status = self.fetch_container_logs(
+                pod=pod,
+                container_name=c,
+                follow=follow_logs,
+                container_name_log_prefix_enabled=container_name_log_prefix_enabled,
+                log_formatter=log_formatter,
+            )
+            pod_logging_statuses.append(status)
+        return pod_logging_statuses
+
+    def await_container_completion(self, pod: V1Pod, container_name: str, polling_time: float = 1) -> None:
+        """
+        Wait for the given container in the given pod to be completed.

         :param pod: pod spec that will be monitored
-        :
+        :param container_name: name of the container within the pod to monitor
+        :param polling_time: polling time between two container status checks.
+            Defaults to 1s.
         """
         while True:
             remote_pod = self.read_pod(pod)
-
+            terminated = container_is_completed(remote_pod, container_name)
+            if terminated:
                 break
-            self.log.info(
-            time.sleep(
-        return remote_pod
+            self.log.info("Waiting for container '%s' state to be completed", container_name)
+            time.sleep(polling_time)

-    def
+    def await_pod_completion(
+        self, pod: V1Pod, istio_enabled: bool = False, container_name: str = "base"
+    ) -> V1Pod:
         """
-
+        Monitor a pod and return the final state.

-        :param
-        :
-        :
+        :param istio_enabled: whether istio is enabled in the namespace
+        :param pod: pod spec that will be monitored
+        :param container_name: name of the container within the pod
+        :return: tuple[State, str | None]
         """
-
-
-
-
-
-
-
-
-
-            return None, line
-        return last_log_time, message
+        while True:
+            remote_pod = self.read_pod(pod)
+            if remote_pod.status.phase in PodPhase.terminal_states:
+                break
+            if istio_enabled and container_is_completed(remote_pod, container_name):
+                break
+            self.log.info("Pod %s has phase %s", pod.metadata.name, remote_pod.status.phase)
+            time.sleep(2)
+        return remote_pod

     def container_is_running(self, pod: V1Pod, container_name: str) -> bool:
-        """
+        """Read pod and checks if container is running."""
         remote_pod = self.read_pod(pod)
         return container_is_running(pod=remote_pod, container_name=container_name)

-
+    def container_is_terminated(self, pod: V1Pod, container_name: str) -> bool:
+        """Read pod and checks if container is terminated."""
+        remote_pod = self.read_pod(pod)
+        return container_is_terminated(pod=remote_pod, container_name=container_name)
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(6), wait=tenacity.wait_exponential(max=15), reraise=True)
     def read_pod_logs(
         self,
         pod: V1Pod,
         container_name: str,
-        tail_lines:
+        tail_lines: int | None = None,
         timestamps: bool = False,
-        since_seconds:
-
-
+        since_seconds: int | None = None,
+        follow=True,
+        post_termination_timeout: int = 120,
+        **kwargs,
+    ) -> PodLogsConsumer:
+        """Read log from the POD."""
         additional_kwargs = {}
         if since_seconds:
-            additional_kwargs[
+            additional_kwargs["since_seconds"] = since_seconds

         if tail_lines:
-            additional_kwargs[
+            additional_kwargs["tail_lines"] = tail_lines
+        additional_kwargs.update(**kwargs)

         try:
-
+            logs = self._client.read_namespaced_pod_log(
                 name=pod.metadata.name,
                 namespace=pod.metadata.namespace,
                 container=container_name,
-                follow=
+                follow=follow,
                 timestamps=timestamps,
                 _preload_content=False,
                 **additional_kwargs,
             )
-        except
-            self.log.exception(
+        except HTTPError:
+            self.log.exception("There was an error reading the kubernetes API.")
             raise

+        return PodLogsConsumer(
+            response=logs,
+            pod=pod,
+            pod_manager=self,
+            container_name=container_name,
+            post_termination_timeout=post_termination_timeout,
+        )
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
+    def get_init_container_names(self, pod: V1Pod) -> list[str]:
+        """
+        Return container names from the POD except for the airflow-xcom-sidecar container.
+
+        :meta private:
+        """
+        return [container_spec.name for container_spec in pod.spec.init_containers]
+
     @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
-    def
-        """
+    def get_container_names(self, pod: V1Pod) -> list[str]:
+        """
+        Return container names from the POD except for the airflow-xcom-sidecar container.
+
+        :meta private:
+        """
+        pod_info = self.read_pod(pod)
+        return [
+            container_spec.name
+            for container_spec in pod_info.spec.containers
+            if container_spec.name != PodDefaults.SIDECAR_CONTAINER_NAME
+        ]
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
+    def read_pod_events(self, pod: V1Pod) -> CoreV1EventList:
+        """Read events from the POD."""
         try:
             return self._client.list_namespaced_event(
                 namespace=pod.metadata.namespace, field_selector=f"involvedObject.name={pod.metadata.name}"
             )
-        except
-            raise AirflowException(f
+        except HTTPError as e:
+            raise AirflowException(f"There was an error reading the kubernetes API: {e}")

     @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
     def read_pod(self, pod: V1Pod) -> V1Pod:
-        """Read POD information"""
+        """Read POD information."""
         try:
             return self._client.read_namespaced_pod(pod.metadata.name, pod.metadata.namespace)
-        except
-            raise AirflowException(f
+        except HTTPError as e:
+            raise AirflowException(f"There was an error reading the kubernetes API: {e}")
+
+    def await_xcom_sidecar_container_start(
+        self, pod: V1Pod, timeout: int = 900, log_interval: int = 30
+    ) -> None:
+        """Check if the sidecar container has reached the 'Running' state before performing do_xcom_push."""
+        self.log.info("Checking if xcom sidecar container is started.")
+        start_time = time.time()
+        last_log_time = start_time
+
+        while True:
+            elapsed_time = time.time() - start_time
+            if self.container_is_running(pod, PodDefaults.SIDECAR_CONTAINER_NAME):
+                self.log.info("The xcom sidecar container has started.")
+                break
+            if self.container_is_terminated(pod, PodDefaults.SIDECAR_CONTAINER_NAME):
+                raise AirflowException(
+                    "Xcom sidecar container is already terminated! Not possible to read xcom output of task."
+                )
+            if (time.time() - last_log_time) >= log_interval:
+                self.log.warning(
+                    "Still waiting for the xcom sidecar container to start. Elapsed time: %d seconds.",
+                    int(elapsed_time),
+                )
+                last_log_time = time.time()
+            if elapsed_time > timeout:
+                raise AirflowException(
+                    f"Xcom sidecar container did not start within {timeout // 60} minutes."
+                )
+            time.sleep(1)

     def extract_xcom(self, pod: V1Pod) -> str:
-        """
+        """Retrieve XCom value and kill xcom sidecar container."""
+        try:
+            result = self.extract_xcom_json(pod)
+            return result
+        finally:
+            self.extract_xcom_kill(pod)
+
+    @tenacity.retry(
+        stop=tenacity.stop_after_attempt(5),
+        wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
+        reraise=True,
+    )
+    def extract_xcom_json(self, pod: V1Pod) -> str:
+        """Retrieve XCom value and also check if xcom json is valid."""
+        command = (
+            f"if [ -s {PodDefaults.XCOM_MOUNT_PATH}/return.json ]; "
+            f"then cat {PodDefaults.XCOM_MOUNT_PATH}/return.json; "
+            f"else echo {EMPTY_XCOM_RESULT}; fi"
+        )
         with closing(
             kubernetes_stream(
                 self._client.connect_get_namespaced_pod_exec,
                 pod.metadata.name,
                 pod.metadata.namespace,
                 container=PodDefaults.SIDECAR_CONTAINER_NAME,
-                command=[
-
+                command=[
+                    "/bin/sh",
+                    "-c",
+                    command,
+                ],
+                stdin=False,
                 stdout=True,
                 stderr=True,
                 tty=False,
                 _preload_content=False,
             )
-        ) as
-
-
+        ) as client:
+            self.log.info("Running command... %s", command)
+            client.run_forever()
+            if client.peek_stderr():
+                stderr = client.read_stderr()
+                self.log.error("stderr from command: %s", stderr)
+            result = client.read_all()
+            if result and result.rstrip() != EMPTY_XCOM_RESULT:
+                # Note: result string is parsed to check if its valid json.
+                # This function still returns a string which is converted into json in the calling method.
+                json.loads(result)
+
         if result is None:
-            raise AirflowException(f
+            raise AirflowException(f"Failed to extract xcom from pod: {pod.metadata.name}")
         return result

-
-
-
-
-
-
-
-
-
-
-
+    @tenacity.retry(
+        stop=tenacity.stop_after_attempt(5),
+        wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
+        reraise=True,
+    )
+    def extract_xcom_kill(self, pod: V1Pod):
+        """Kill xcom sidecar container."""
+        with closing(
+            kubernetes_stream(
+                self._client.connect_get_namespaced_pod_exec,
+                pod.metadata.name,
+                pod.metadata.namespace,
+                container=PodDefaults.SIDECAR_CONTAINER_NAME,
+                command=["/bin/sh"],
+                stdin=True,
+                stdout=True,
+                stderr=True,
+                tty=False,
+                _preload_content=False,
+            )
+        ) as resp:
+            self._exec_pod_command(resp, "kill -2 $(pgrep -u $(id -u) -f 'sh')")
+
+    def _exec_pod_command(self, resp, command: str) -> str | None:
+        res = ""
+        if not resp.is_open():
+            return None
+        self.log.info("Running command... %s", command)
+        resp.write_stdin(f"{command}\n")
+        while resp.is_open():
+            resp.update(timeout=1)
+            while resp.peek_stdout():
+                res += resp.read_stdout()
+            error_res = ""
+            while resp.peek_stderr():
+                error_res += resp.read_stderr()
+            if error_res:
+                self.log.info("stderr from command: %s", error_res)
+                break
+            if res:
+                return res
         return None
+
+    def _await_init_container_start(self, pod: V1Pod, container_name: str):
+        while True:
+            remote_pod = self.read_pod(pod)
+
+            if (
+                remote_pod.status is not None
+                and remote_pod.status.phase != PodPhase.PENDING
+                and get_container_status(remote_pod, container_name) is not None
+                and not container_is_wait(remote_pod, container_name)
+            ):
+                return
+
+            time.sleep(1)
+
+
+class OnFinishAction(str, enum.Enum):
+    """Action to take when the pod finishes."""
+
+    KEEP_POD = "keep_pod"
+    DELETE_POD = "delete_pod"
+    DELETE_SUCCEEDED_POD = "delete_succeeded_pod"
+
+
+def is_log_group_marker(line: str) -> bool:
+    """Check if the line is a log group marker like `::group::` or `::endgroup::`."""
+    return line.startswith("::group::") or line.startswith("::endgroup::")
+
+
+def parse_log_line(line: str) -> tuple[DateTime | None, str]:
+    """
+    Parse K8s log line and returns the final state.
+
+    :param line: k8s log line
+    :return: timestamp and log message
+    """
+    timestamp, sep, message = line.strip().partition(" ")
+    if not sep:
+        return None, line
+    try:
+        last_log_time = cast("DateTime", pendulum.parse(timestamp))
+    except ParserError:
+        return None, line
+    return last_log_time, message
+
+
+class AsyncPodManager(LoggingMixin):
+    """Create, monitor, and otherwise interact with Kubernetes pods for use with the KubernetesPodTriggerer."""
+
+    def __init__(
+        self,
+        async_hook: AsyncKubernetesHook,
+        callbacks: list[type[KubernetesPodOperatorCallback]] | None = None,
+    ):
+        """
+        Create the launcher.
+
+        :param kube_client: kubernetes client
+        :param callbacks:
+        """
+        super().__init__()
+        self._hook = async_hook
+        self._watch = watch.Watch()
+        self._callbacks = callbacks or []
+        self.stop_watching_events = False
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(), reraise=True)
+    async def read_pod(self, pod: V1Pod) -> V1Pod:
+        """Read POD information."""
+        return await self._hook.get_pod(
+            pod.metadata.name,
+            pod.metadata.namespace,
+        )
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(), reraise=True)
+    async def read_pod_events(self, pod: V1Pod) -> CoreV1EventList:
+        """Get pod's events."""
+        return await self._hook.get_pod_events(
+            pod.metadata.name,
+            pod.metadata.namespace,
+        )
+
+    async def watch_pod_events(self, pod: V1Pod, check_interval: float = 1) -> None:
+        """Read pod events and writes into log."""
+        await watch_pod_events(pod_manager=self, pod=pod, check_interval=check_interval)
+
+    async def await_pod_start(
+        self, pod: V1Pod, schedule_timeout: int = 120, startup_timeout: int = 120, check_interval: float = 1
+    ) -> None:
+        """
+        Wait for the pod to reach phase other than ``Pending``.
+
+        :param pod:
+        :param schedule_timeout: Timeout (in seconds) for pod stay in schedule state
+            (if pod is taking to long in schedule state, fails task)
+        :param startup_timeout: Timeout (in seconds) for startup of the pod
+            (if pod is pending for too long after being scheduled, fails task)
+        :param check_interval: Interval (in seconds) between checks
+        :return:
+        """
+        await await_pod_start(
+            pod_manager=self,
+            pod=pod,
+            schedule_timeout=schedule_timeout,
+            startup_timeout=startup_timeout,
+            check_interval=check_interval,
+        )
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(), reraise=True)
+    async def fetch_container_logs_before_current_sec(
+        self, pod: V1Pod, container_name: str, since_time: DateTime | None = None
+    ) -> DateTime | None:
+        """
+        Asynchronously read the log file of the specified pod.
+
+        This method streams logs from the base container, skipping log lines from the current second to prevent duplicate entries on subsequent reads. It is designed to handle long-running containers and gracefully suppresses transient interruptions.
+
+        :param pod: The pod specification to monitor.
+        :param container_name: The name of the container within the pod.
+        :param since_time: The timestamp from which to start reading logs.
+        :return: The timestamp to use for the next log read, representing the start of the current second. Returns None if an exception occurred.
+        """
+        now = pendulum.now()
+        logs = await self._hook.read_logs(
+            name=pod.metadata.name,
+            namespace=pod.metadata.namespace,
+            container_name=container_name,
+            since_seconds=(math.ceil((now - since_time).total_seconds()) if since_time else None),
+        )
+        message_to_log = None
+        try:
+            now_seconds = now.replace(microsecond=0)
+            for line in logs:
+                line_timestamp, message = parse_log_line(line)
+                # Skip log lines from the current second to prevent duplicate entries on the next read.
+                # The API only allows specifying 'since_seconds', not an exact timestamp.
+                if line_timestamp and line_timestamp.replace(microsecond=0) == now_seconds:
+                    break
+                if line_timestamp:  # detect new log line
+                    if message_to_log is None:  # first line in the log
+                        message_to_log = message
+                    else:  # previous log line is complete
+                        if message_to_log is not None:
+                            if is_log_group_marker(message_to_log):
+                                print(message_to_log)
+                            else:
+                                self.log.info("[%s] %s", container_name, message_to_log)
+                        message_to_log = message
+                elif message_to_log:  # continuation of the previous log line
+                    message_to_log = f"{message_to_log}\n{message}"
+        finally:
+            # log the last line and update the last_captured_timestamp
+            if message_to_log is not None:
+                if is_log_group_marker(message_to_log):
+                    print(message_to_log)
+                else:
+                    self.log.info("[%s] %s", container_name, message_to_log)
+        return now  # Return the current time as the last log time to ensure logs from the current second are read in the next fetch.
```