apache-airflow-providers-cncf-kubernetes 3.1.0__py3-none-any.whl → 10.10.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. airflow/providers/cncf/kubernetes/__init__.py +18 -23
  2. airflow/providers/cncf/kubernetes/backcompat/__init__.py +17 -0
  3. airflow/providers/cncf/kubernetes/backcompat/backwards_compat_converters.py +31 -49
  4. airflow/providers/cncf/kubernetes/callbacks.py +200 -0
  5. airflow/providers/cncf/kubernetes/cli/__init__.py +16 -0
  6. airflow/providers/cncf/kubernetes/cli/kubernetes_command.py +195 -0
  7. airflow/providers/cncf/kubernetes/decorators/kubernetes.py +163 -0
  8. airflow/providers/cncf/kubernetes/decorators/kubernetes_cmd.py +118 -0
  9. airflow/providers/cncf/kubernetes/exceptions.py +37 -0
  10. airflow/providers/cncf/kubernetes/executors/__init__.py +17 -0
  11. airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +831 -0
  12. airflow/providers/cncf/kubernetes/executors/kubernetes_executor_types.py +91 -0
  13. airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py +736 -0
  14. airflow/providers/cncf/kubernetes/executors/local_kubernetes_executor.py +306 -0
  15. airflow/providers/cncf/kubernetes/get_provider_info.py +249 -50
  16. airflow/providers/cncf/kubernetes/hooks/kubernetes.py +846 -112
  17. airflow/providers/cncf/kubernetes/k8s_model.py +62 -0
  18. airflow/providers/cncf/kubernetes/kube_client.py +156 -0
  19. airflow/providers/cncf/kubernetes/kube_config.py +125 -0
  20. airflow/providers/cncf/kubernetes/kubernetes_executor_templates/__init__.py +16 -0
  21. airflow/providers/cncf/kubernetes/kubernetes_executor_templates/basic_template.yaml +79 -0
  22. airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +165 -0
  23. airflow/providers/cncf/kubernetes/operators/custom_object_launcher.py +368 -0
  24. airflow/providers/cncf/kubernetes/operators/job.py +646 -0
  25. airflow/providers/cncf/kubernetes/operators/kueue.py +132 -0
  26. airflow/providers/cncf/kubernetes/operators/pod.py +1417 -0
  27. airflow/providers/cncf/kubernetes/operators/resource.py +191 -0
  28. airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +336 -35
  29. airflow/providers/cncf/kubernetes/pod_generator.py +592 -0
  30. airflow/providers/cncf/kubernetes/pod_template_file_examples/__init__.py +16 -0
  31. airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_image_template.yaml +68 -0
  32. airflow/providers/cncf/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml +74 -0
  33. airflow/providers/cncf/kubernetes/pod_template_file_examples/git_sync_template.yaml +95 -0
  34. airflow/providers/cncf/kubernetes/python_kubernetes_script.jinja2 +51 -0
  35. airflow/providers/cncf/kubernetes/python_kubernetes_script.py +92 -0
  36. airflow/providers/cncf/kubernetes/resource_convert/__init__.py +16 -0
  37. airflow/providers/cncf/kubernetes/resource_convert/configmap.py +52 -0
  38. airflow/providers/cncf/kubernetes/resource_convert/env_variable.py +39 -0
  39. airflow/providers/cncf/kubernetes/resource_convert/secret.py +40 -0
  40. airflow/providers/cncf/kubernetes/secret.py +128 -0
  41. airflow/providers/cncf/kubernetes/sensors/spark_kubernetes.py +30 -14
  42. airflow/providers/cncf/kubernetes/template_rendering.py +81 -0
  43. airflow/providers/cncf/kubernetes/triggers/__init__.py +16 -0
  44. airflow/providers/cncf/kubernetes/triggers/job.py +176 -0
  45. airflow/providers/cncf/kubernetes/triggers/pod.py +344 -0
  46. airflow/providers/cncf/kubernetes/utils/__init__.py +3 -0
  47. airflow/providers/cncf/kubernetes/utils/container.py +118 -0
  48. airflow/providers/cncf/kubernetes/utils/delete_from.py +154 -0
  49. airflow/providers/cncf/kubernetes/utils/k8s_resource_iterator.py +46 -0
  50. airflow/providers/cncf/kubernetes/utils/pod_manager.py +887 -152
  51. airflow/providers/cncf/kubernetes/utils/xcom_sidecar.py +25 -16
  52. airflow/providers/cncf/kubernetes/version_compat.py +38 -0
  53. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/METADATA +125 -0
  54. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/RECORD +62 -0
  55. {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info}/WHEEL +1 -2
  56. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/entry_points.txt +3 -0
  57. apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses/NOTICE +5 -0
  58. airflow/providers/cncf/kubernetes/backcompat/pod.py +0 -119
  59. airflow/providers/cncf/kubernetes/backcompat/pod_runtime_info_env.py +0 -56
  60. airflow/providers/cncf/kubernetes/backcompat/volume.py +0 -62
  61. airflow/providers/cncf/kubernetes/backcompat/volume_mount.py +0 -58
  62. airflow/providers/cncf/kubernetes/example_dags/example_kubernetes.py +0 -163
  63. airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes.py +0 -66
  64. airflow/providers/cncf/kubernetes/example_dags/example_spark_kubernetes_spark_pi.yaml +0 -57
  65. airflow/providers/cncf/kubernetes/operators/kubernetes_pod.py +0 -622
  66. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/METADATA +0 -452
  67. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/NOTICE +0 -6
  68. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/RECORD +0 -29
  69. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/entry_points.txt +0 -3
  70. apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info/top_level.txt +0 -1
  71. /airflow/providers/cncf/kubernetes/{example_dags → decorators}/__init__.py +0 -0
  72. {apache_airflow_providers_cncf_kubernetes-3.1.0.dist-info → apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info/licenses}/LICENSE +0 -0
@@ -14,35 +14,61 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
- """Launches PODs"""
17
+ """Launches PODs."""
18
+
19
+ from __future__ import annotations
20
+
21
+ import asyncio
22
+ import enum
18
23
  import json
19
24
  import math
20
25
  import time
26
+ from collections.abc import Callable, Generator, Iterable
21
27
  from contextlib import closing
22
- from datetime import datetime
23
- from typing import TYPE_CHECKING, Iterable, Optional, Tuple, cast
28
+ from dataclasses import dataclass
29
+ from datetime import timedelta
30
+ from typing import TYPE_CHECKING, Literal, cast
24
31
 
25
32
  import pendulum
26
33
  import tenacity
27
34
  from kubernetes import client, watch
28
- from kubernetes.client.models.v1_pod import V1Pod
29
35
  from kubernetes.client.rest import ApiException
30
36
  from kubernetes.stream import stream as kubernetes_stream
31
37
  from pendulum import DateTime
32
38
  from pendulum.parsing.exceptions import ParserError
33
- from urllib3.exceptions import HTTPError as BaseHTTPError
39
+ from urllib3.exceptions import HTTPError, TimeoutError
34
40
 
35
41
  from airflow.exceptions import AirflowException
36
- from airflow.kubernetes.kube_client import get_kube_client
37
- from airflow.kubernetes.pod_generator import PodDefaults
42
+ from airflow.providers.cncf.kubernetes.callbacks import ExecutionMode, KubernetesPodOperatorCallback
43
+ from airflow.providers.cncf.kubernetes.utils.container import (
44
+ container_is_completed,
45
+ container_is_running,
46
+ container_is_terminated,
47
+ container_is_wait,
48
+ get_container_status,
49
+ )
50
+ from airflow.providers.cncf.kubernetes.utils.xcom_sidecar import PodDefaults
38
51
  from airflow.utils.log.logging_mixin import LoggingMixin
52
+ from airflow.utils.timezone import utcnow
39
53
 
40
54
  if TYPE_CHECKING:
41
- try:
42
- # Kube >= 19
43
- from kubernetes.client.models.core_v1_event_list import CoreV1EventList as V1EventList
44
- except ImportError:
45
- from kubernetes.client.models.v1_event_list import V1EventList
55
+ from kubernetes.client.models.core_v1_event_list import CoreV1EventList
56
+ from kubernetes.client.models.v1_container_state import V1ContainerState
57
+ from kubernetes.client.models.v1_container_state_waiting import V1ContainerStateWaiting
58
+ from kubernetes.client.models.v1_object_reference import V1ObjectReference
59
+ from kubernetes.client.models.v1_pod import V1Pod
60
+ from kubernetes.client.models.v1_pod_condition import V1PodCondition
61
+ from urllib3.response import HTTPResponse
62
+
63
+ from airflow.providers.cncf.kubernetes.hooks.kubernetes import AsyncKubernetesHook
64
+
65
+
66
+ EMPTY_XCOM_RESULT = "__airflow_xcom_result_empty__"
67
+ """
68
+ Sentinel for no xcom result.
69
+
70
+ :meta private:
71
+ """
46
72
 
47
73
 
48
74
  class PodLaunchFailedException(AirflowException):
@@ -50,90 +76,283 @@ class PodLaunchFailedException(AirflowException):
50
76
 
51
77
 
def should_retry_start_pod(exception: BaseException) -> bool:
    """
    Check if an Exception indicates a transient error and warrants retrying.

    :param exception: the exception to inspect
    :return: True only for an ``ApiException`` whose status is 409; False otherwise
    """
    # The status is compared as a string so that both ``409`` and ``"409"`` match.
    return isinstance(exception, ApiException) and str(exception.status) == "409"
57
83
 
58
84
 
class PodPhase:
    """
    Possible pod phases.

    See https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase.
    """

    PENDING = "Pending"
    RUNNING = "Running"
    FAILED = "Failed"
    SUCCEEDED = "Succeeded"

    # Phases collected here as the terminal ones: Failed and Succeeded.
    terminal_states = {FAILED, SUCCEEDED}
71
98
 
72
99
 
73
- def container_is_running(pod: V1Pod, container_name: str) -> bool:
100
def check_exception_is_kubernetes_api_unauthorized(exc: BaseException) -> bool:
    """
    Tell whether ``exc`` is a Kubernetes ``ApiException`` with status 401.

    :param exc: the exception to inspect
    :return: True when ``exc`` is an ``ApiException`` whose status, compared as a string, is ``"401"``
    """
    # ``str(None)`` and ``str(0)`` never equal "401", so a missing status needs no separate
    # check, and the result is always a real bool rather than a falsy status value.
    return isinstance(exc, ApiException) and str(exc.status) == "401"
102
+
103
+
104
async def watch_pod_events(
    pod_manager: PodManager | AsyncPodManager,
    pod: V1Pod,
    check_interval: float = 1,
) -> None:
    """
    Read pod events and write them to the log.

    This function supports both asynchronous and synchronous pod managers. The loop runs
    until ``pod_manager.stop_watching_events`` becomes true.

    :param pod_manager: The pod manager instance (PodManager or AsyncPodManager).
    :param pod: The pod object to monitor.
    :param check_interval: Interval (in seconds) between checks.
    """
    # Number of events already written to the log; only events past this index are new.
    num_events = 0
    is_async = isinstance(pod_manager, AsyncPodManager)
    while not pod_manager.stop_watching_events:
        # Only the async manager returns an awaitable; the sync manager is called directly.
        if is_async:
            events = await pod_manager.read_pod_events(pod)
        else:
            events = pod_manager.read_pod_events(pod)
        for new_event in events.items[num_events:]:
            involved_object: V1ObjectReference = new_event.involved_object
            pod_manager.log.info(
                "The Pod has an Event: %s from %s", new_event.message, involved_object.field_path
            )
        num_events = len(events.items)
        await asyncio.sleep(check_interval)
85
132
 
86
133
 
87
- class PodManager(LoggingMixin):
134
async def await_pod_start(
    pod_manager: PodManager | AsyncPodManager,
    pod: V1Pod,
    schedule_timeout: int = 120,
    startup_timeout: int = 120,
    check_interval: float = 1,
) -> None:
    """
    Monitor the startup phase of a Kubernetes pod, waiting for it to leave the ``Pending`` state.

    This function is shared by both PodManager and AsyncPodManager to provide consistent pod startup tracking.
    When the pod leaves ``Pending``, ``pod_manager.stop_watching_events`` is set to True.

    :param pod_manager: The pod manager instance (PodManager or AsyncPodManager).
    :param pod: The pod object to monitor.
    :param schedule_timeout: Maximum time (in seconds) to wait for the pod to be scheduled.
    :param startup_timeout: Maximum time (in seconds) to wait for the pod to start running after being scheduled.
    :param check_interval: Interval (in seconds) between status checks.
    :raises PodLaunchTimeoutException: if the pod is not scheduled within ``schedule_timeout`` or,
        once scheduled, does not leave ``Pending`` within ``startup_timeout``.
    :raises PodLaunchFailedException: if a container is waiting with reason ``ErrImagePull``
        or ``InvalidImageName``.
    """
    pod_manager.log.info("::group::Waiting until %ss to get the POD scheduled...", schedule_timeout)
    pod_was_scheduled = False
    # A monotonic clock keeps the timeouts immune to wall-clock adjustments.
    start_check_time = time.monotonic()
    is_async = isinstance(pod_manager, AsyncPodManager)
    while True:
        # Only the async manager returns an awaitable; the sync manager is called directly.
        if is_async:
            remote_pod = await pod_manager.read_pod(pod)
        else:
            remote_pod = pod_manager.read_pod(pod)
        pod_status = remote_pod.status
        if pod_status.phase != PodPhase.PENDING:
            pod_manager.stop_watching_events = True
            pod_manager.log.info("::endgroup::")
            break

        # Check for timeout
        pod_conditions: list[V1PodCondition] = pod_status.conditions
        is_scheduled = bool(pod_conditions) and any(
            condition.type == "PodScheduled" and condition.status == "True" for condition in pod_conditions
        )
        if is_scheduled:
            if not pod_was_scheduled:
                # The pod has just been scheduled: restart the clock so that
                # startup_timeout is measured from the moment of scheduling.
                pod_was_scheduled = True
                start_check_time = time.monotonic()
                pod_manager.log.info("Waiting %ss to get the POD running...", startup_timeout)

            if time.monotonic() - start_check_time >= startup_timeout:
                pod_manager.log.info("::endgroup::")
                raise PodLaunchTimeoutException(
                    f"Pod took too long to start. More than {startup_timeout}s. Check the pod events in kubernetes."
                )
        elif time.monotonic() - start_check_time >= schedule_timeout:
            pod_manager.log.info("::endgroup::")
            raise PodLaunchTimeoutException(
                f"Pod took too long to be scheduled on the cluster, giving up. More than {schedule_timeout}s. Check the pod events in kubernetes."
            )

        # Check for general problems to terminate early - ErrImagePull
        for container_status in pod_status.container_statuses or ():
            container_state: V1ContainerState = container_status.state
            container_waiting: V1ContainerStateWaiting | None = container_state.waiting
            if container_waiting and container_waiting.reason in ["ErrImagePull", "InvalidImageName"]:
                pod_manager.log.info("::endgroup::")
                raise PodLaunchFailedException(
                    f"Pod docker image cannot be pulled, unable to start: {container_waiting.reason}"
                    f"\n{container_waiting.message}"
                )

        await asyncio.sleep(check_interval)
205
+
206
+
207
class PodLaunchTimeoutException(AirflowException):
    """When pod does not leave the ``Pending`` phase within specified timeout."""
209
+
210
+
211
class PodNotFoundException(AirflowException):
    """Expected pod does not exist in kube-api."""
213
+
214
+
215
class PodLogsConsumer:
    """
    Responsible for pulling pod logs from a stream with checking a container status before reading data.

    This class is a workaround for the issue https://github.com/apache/airflow/issues/23497.

    :param response: HTTP response with logs
    :param pod: Pod instance from Kubernetes client
    :param pod_manager: Pod manager instance
    :param container_name: Name of the container that we're reading logs from
    :param post_termination_timeout: (Optional) The period of time in seconds representing for how long time
        logs are available after the container termination.
    :param read_pod_cache_timeout: (Optional) The container's status cache lifetime.
        The container status is cached to reduce API calls.

    :meta private:
    """

    def __init__(
        self,
        response: HTTPResponse,
        pod: V1Pod,
        pod_manager: PodManager,
        container_name: str,
        post_termination_timeout: int = 120,
        read_pod_cache_timeout: int = 120,
    ):
        self.response = response
        self.pod = pod
        self.pod_manager = pod_manager
        self.container_name = container_name
        self.post_termination_timeout = post_termination_timeout
        # Time of the last ``pod_manager.read_pod`` call and the pod it returned.
        self.last_read_pod_at = None
        self.read_pod_cache = None
        self.read_pod_cache_timeout = read_pod_cache_timeout

    def __iter__(self) -> Generator[bytes, None, None]:
        r"""Yield log items divided by the '\n' symbol."""
        # Pieces of a log item whose terminating newline has not been received yet.
        incomplete_log_item: list[bytes] = []
        if self.logs_available():
            for data_chunk in self.response.stream(amt=None, decode_content=True):
                if b"\n" in data_chunk:
                    log_items = data_chunk.split(b"\n")
                    yield from self._extract_log_items(incomplete_log_item, log_items)
                    # Whatever follows the last newline belongs to the next item.
                    incomplete_log_item = self._save_incomplete_log_item(log_items[-1])
                else:
                    incomplete_log_item.append(data_chunk)
                if not self.logs_available():
                    break
        if incomplete_log_item:
            yield b"".join(incomplete_log_item)

    @staticmethod
    def _extract_log_items(incomplete_log_item: list[bytes], log_items: list[bytes]):
        """Yield every newline-terminated item of a split chunk, prefixing the first with pending data."""
        yield b"".join(incomplete_log_item) + log_items[0] + b"\n"
        for x in log_items[1:-1]:
            yield x + b"\n"

    @staticmethod
    def _save_incomplete_log_item(sub_chunk: bytes):
        """Return the pending-item buffer for the data that followed the last newline of a chunk."""
        # Test the bytes themselves: an empty tail (chunk ended with a newline) must leave
        # the buffer empty, otherwise iteration ends with a spurious empty item.
        return [sub_chunk] if sub_chunk else []

    def logs_available(self):
        """
        Tell whether logs can still be read for the container.

        True while the container is running, or for ``post_termination_timeout`` seconds
        after the ``finished_at`` time of its terminated state; False otherwise.
        """
        remote_pod = self.read_pod()
        if container_is_running(pod=remote_pod, container_name=self.container_name):
            return True
        container_status = get_container_status(pod=remote_pod, container_name=self.container_name)
        state = container_status.state if container_status else None
        terminated = state.terminated if state else None
        if terminated:
            termination_time = terminated.finished_at
            if termination_time:
                return termination_time + timedelta(seconds=self.post_termination_timeout) > utcnow()
        return False

    def read_pod(self):
        """Return the pod from ``pod_manager.read_pod``, reusing the cached value until it expires."""
        _now = utcnow()
        if (
            self.read_pod_cache is None
            or self.last_read_pod_at + timedelta(seconds=self.read_pod_cache_timeout) < _now
        ):
            self.read_pod_cache = self.pod_manager.read_pod(self.pod)
            self.last_read_pod_at = _now
        return self.read_pod_cache
299
+
300
+
301
@dataclass
class PodLoggingStatus:
    """Return the status of the pod and last log time when exiting from `fetch_container_logs`."""

    # Whether the container was still running when log fetching returned.
    running: bool
    # Timestamp of the last log line captured, if any.
    last_log_time: DateTime | None
307
+
308
+
309
+ class PodManager(LoggingMixin):
310
+ """Create, monitor, and otherwise interact with Kubernetes pods for use with the KubernetesPodOperator."""
311
+
312
def __init__(
    self,
    kube_client: client.CoreV1Api,
    callbacks: list[type[KubernetesPodOperatorCallback]] | None = None,
):
    """
    Create the launcher.

    :param kube_client: kubernetes client
    :param callbacks: callback classes stored on the instance; ``None`` is stored as an empty list
    """
    super().__init__()
    self._client = kube_client
    self._watch = watch.Watch()
    self._callbacks = callbacks or []
    # Flag polled by the pod-event watching loop; set to True to make it stop.
    self.stop_watching_events = False
328
 
def run_pod_async(self, pod: V1Pod, **kwargs) -> V1Pod:
    """
    Run POD asynchronously.

    :param pod: the pod to create; its ``metadata.namespace`` selects the namespace
    :param kwargs: extra keyword arguments forwarded to ``create_namespaced_pod``
    :return: the response of ``create_namespaced_pod``
    """
    sanitized_pod = self._client.api_client.sanitize_for_serialization(pod)
    json_pod = json.dumps(sanitized_pod, indent=2)

    self.log.debug("Pod Creation Request: \n%s", json_pod)
    try:
        resp = self._client.create_namespaced_pod(
            body=sanitized_pod, namespace=pod.metadata.namespace, **kwargs
        )
        self.log.debug("Pod Creation Response: %s", resp)
    except Exception:
        # Log the request on a single line, then re-raise the original exception unchanged.
        self.log.exception(
            "Exception when attempting to create Namespaced Pod: %s", json_pod.replace("\n", " ")
        )
        raise
    return resp
127
346
 
def delete_pod(self, pod: V1Pod) -> None:
    """
    Delete POD.

    :param pod: the pod to delete, identified by ``metadata.name`` and ``metadata.namespace``
    :raises ApiException: for any API error whose status is not 404
    """
    try:
        self._client.delete_namespaced_pod(
            pod.metadata.name, pod.metadata.namespace, body=client.V1DeleteOptions()
        )
    except ApiException as e:
        # If the pod is already deleted
        if str(e.status) != "404":
            raise
138
357
 
139
358
  @tenacity.retry(
@@ -143,209 +362,725 @@ class PodManager(LoggingMixin):
143
362
  retry=tenacity.retry_if_exception(should_retry_start_pod),
144
363
  )
145
364
  def create_pod(self, pod: V1Pod) -> V1Pod:
146
- """Launches the pod asynchronously."""
365
+ """Launch the pod asynchronously."""
147
366
  return self.run_pod_async(pod)
148
367
 
149
- def await_pod_start(self, pod: V1Pod, startup_timeout: int = 120) -> None:
368
async def watch_pod_events(self, pod: V1Pod, check_interval: int = 1) -> None:
    """
    Read pod events and write them into the log.

    Delegates to the module-level ``watch_pod_events`` with this manager as ``pod_manager``.

    :param pod: the pod whose events are read
    :param check_interval: Interval (in seconds) between checks
    """
    await watch_pod_events(pod_manager=self, pod=pod, check_interval=check_interval)
371
+
372
async def await_pod_start(
    self, pod: V1Pod, schedule_timeout: int = 120, startup_timeout: int = 120, check_interval: int = 1
) -> None:
    """
    Wait for the pod to reach phase other than ``Pending``.

    Delegates to the module-level ``await_pod_start`` with this manager as ``pod_manager``.

    :param pod: the pod to wait for
    :param schedule_timeout: Timeout (in seconds) for pod stay in schedule state
        (if pod is taking too long in schedule state, fails task)
    :param startup_timeout: Timeout (in seconds) for startup of the pod
        (if pod is pending for too long after being scheduled, fails task)
    :param check_interval: Interval (in seconds) between checks
    """
    await await_pod_start(
        pod_manager=self,
        pod=pod,
        schedule_timeout=schedule_timeout,
        startup_timeout=startup_timeout,
        check_interval=check_interval,
    )
393
+
394
def _log_message(
    self,
    message: str,
    container_name: str,
    container_name_log_prefix_enabled: bool,
    log_formatter: Callable[[str, str], str] | None,
) -> None:
    """
    Log a message with appropriate formatting.

    :param message: the log text to emit
    :param container_name: name of the container the message came from
    :param container_name_log_prefix_enabled: prefix the message with ``[container_name]``
        when no ``log_formatter`` is given
    :param log_formatter: optional callable taking ``(container_name, message)``; when given,
        its result is logged instead and the prefix flag is not consulted
    """
    if is_log_group_marker(message):
        # Group markers are printed directly instead of going through the logger.
        print(message)
    elif log_formatter:
        formatted_message = log_formatter(container_name, message)
        self.log.info("%s", formatted_message)
    else:
        log_message = f"[{container_name}] {message}" if container_name_log_prefix_enabled else message
        self.log.info("%s", log_message)
172
413
 
173
- def follow_container_logs(self, pod: V1Pod, container_name: str) -> None:
414
def fetch_container_logs(
    self,
    pod: V1Pod,
    container_name: str,
    *,
    follow: bool = False,
    since_time: DateTime | None = None,
    post_termination_timeout: int = 120,
    container_name_log_prefix_enabled: bool = True,
    log_formatter: Callable[[str, str], str] | None = None,
) -> PodLoggingStatus:
    """
    Follow the logs of container and stream to airflow logging.

    Returns when container exits, or after a single read when ``follow`` is false.

    Between when the pod starts and logs being available, there might be a delay due to CSR not approved
    and signed yet. In such situation, ApiException is thrown. This is why we are retrying on this
    specific exception.

    :param pod: the pod whose container logs are read
    :param container_name: name of the container to read logs from
    :param follow: forwarded to ``read_pod_logs``; when false the method returns after one read
    :param since_time: only logs after this time are requested (converted to ``since_seconds``)
    :param post_termination_timeout: forwarded to ``read_pod_logs``
    :param container_name_log_prefix_enabled: forwarded to ``_log_message``
    :param log_formatter: forwarded to ``_log_message``
    :return: whether the container is still running, and the last log timestamp observed

    :meta private:
    """

    def consume_logs(*, since_time: DateTime | None = None) -> tuple[DateTime | None, Exception | None]:
        """
        Try to follow container logs until container completes.

        For a long-running container, sometimes the log read may be interrupted
        Such errors of this kind are suppressed.

        Returns the last timestamp observed in logs and the suppressed exception, if any.
        """
        exception = None
        last_captured_timestamp = None
        # We timeout connections after 30 minutes because otherwise they can get
        # stuck forever. The 30 is somewhat arbitrary.
        # As a consequence, a TimeoutError will be raised no more than 30 minutes
        # after starting read.
        connection_timeout = 60 * 30
        # We set a shorter read timeout because that helps reduce *connection* timeouts
        # (since the connection will be restarted periodically). And with read timeout,
        # we don't need to worry about either duplicate messages or losing messages; we
        # can safely resume from a few seconds later
        read_timeout = 60 * 5

        def emit_pending(lines: list[str], message: str | None) -> None:
            """Run the progress callbacks for ``lines``, then log the assembled ``message``."""
            # A dedicated loop variable keeps the caller's current ``line`` from being overwritten.
            for callback_line in lines:
                for callback in self._callbacks:
                    callback.progress_callback(
                        line=callback_line, client=self._client, mode=ExecutionMode.SYNC
                    )
            if message is not None:
                self._log_message(
                    message, container_name, container_name_log_prefix_enabled, log_formatter
                )

        try:
            since_seconds = None
            if since_time:
                try:
                    since_seconds = math.ceil((pendulum.now() - since_time).total_seconds())
                except TypeError:
                    self.log.warning(
                        "Error calculating since_seconds with since_time %s. Using None instead.",
                        since_time,
                    )
            logs = self.read_pod_logs(
                pod=pod,
                container_name=container_name,
                timestamps=True,
                since_seconds=since_seconds,
                follow=follow,
                post_termination_timeout=post_termination_timeout,
                _request_timeout=(connection_timeout, read_timeout),
            )
            # A log message may span several raw lines: a line carrying a timestamp starts
            # a new message, a line without one continues the message being assembled.
            message_to_log = None
            message_timestamp = None
            progress_callback_lines = []
            try:
                for raw_line in logs:
                    line = raw_line.decode("utf-8", errors="backslashreplace")
                    line_timestamp, message = parse_log_line(line)
                    if line_timestamp:  # detect new log line
                        if message_to_log is None:  # first line in the log
                            message_to_log = message
                            message_timestamp = line_timestamp
                            progress_callback_lines.append(line)
                        else:  # previous log line is complete
                            emit_pending(progress_callback_lines, message_to_log)
                            last_captured_timestamp = message_timestamp
                            message_to_log = message
                            message_timestamp = line_timestamp
                            progress_callback_lines = [line]
                    else:  # continuation of the previous log line
                        # A first line without a timestamp has nothing to continue; start the
                        # message with it rather than prefixing the text "None".
                        message_to_log = (
                            message if message_to_log is None else f"{message_to_log}\n{message}"
                        )
                        progress_callback_lines.append(line)
            finally:
                # log the last line and update the last_captured_timestamp
                emit_pending(progress_callback_lines, message_to_log)
                last_captured_timestamp = message_timestamp
        except TimeoutError as e:
            # Keep the timeout visible to the caller even when no timestamp is available,
            # so it is treated as a normal timeout rather than an interruption.
            exception = e
            # in case of timeout, increment return time by 2 seconds to avoid
            # duplicate log entries
            if val := (last_captured_timestamp or since_time):
                return val.add(seconds=2), e
        except HTTPError as e:
            exception = e
            # Keep only the error times of the last 60 seconds, then record this one.
            self._http_error_timestamps = getattr(self, "_http_error_timestamps", [])
            self._http_error_timestamps = [
                t for t in self._http_error_timestamps if t > utcnow() - timedelta(seconds=60)
            ]
            self._http_error_timestamps.append(utcnow())
            # Log only if more than 2 errors occurred in the last 60 seconds
            if len(self._http_error_timestamps) > 2:
                self.log.exception(
                    "Reading of logs interrupted for container %r; will retry.",
                    container_name,
                )
        return last_captured_timestamp or since_time, exception

    # note: `read_pod_logs` follows the logs, so we shouldn't necessarily *need* to
    # loop as we do here. But in a long-running process we might temporarily lose connectivity.
    # So the looping logic is there to let us resume following the logs.
    last_log_time = since_time
    while True:
        last_log_time, exc = consume_logs(since_time=last_log_time)
        if not self.container_is_running(pod, container_name=container_name):
            return PodLoggingStatus(running=False, last_log_time=last_log_time)
        if not follow:
            return PodLoggingStatus(running=True, last_log_time=last_log_time)
        # a timeout is a normal thing and we ignore it and resume following logs
        if not isinstance(exc, TimeoutError):
            self.log.warning(
                "Pod %s log read interrupted but container %s still running. Logs generated in the last one second might get duplicated.",
                pod.metadata.name,
                container_name,
            )
        time.sleep(1)
228
559
 
229
- def await_pod_completion(self, pod: V1Pod) -> V1Pod:
560
+ def _reconcile_requested_log_containers(
561
+ self, requested: Iterable[str] | str | bool | None, actual: list[str], pod_name
562
+ ) -> list[str]:
563
+ """Return actual containers based on requested."""
564
+ containers_to_log = []
565
+ if actual:
566
+ if isinstance(requested, str):
567
+ # fetch logs only for requested container if only one container is provided
568
+ if requested in actual:
569
+ containers_to_log.append(requested)
570
+ else:
571
+ self.log.error(
572
+ "container %s whose logs were requested not found in the pod %s",
573
+ requested,
574
+ pod_name,
575
+ )
576
+ elif isinstance(requested, bool):
577
+ # if True is provided, get logs for all the containers
578
+ if requested is True:
579
+ containers_to_log.extend(actual)
580
+ else:
581
+ self.log.error(
582
+ "False is not a valid value for container_logs",
583
+ )
584
+ else:
585
+ # if a sequence of containers are provided, iterate for every container in the pod
586
+ if isinstance(requested, Iterable):
587
+ for container in requested:
588
+ if container in actual:
589
+ containers_to_log.append(container)
590
+ else:
591
+ self.log.error(
592
+ "Container %s whose logs were requests not found in the pod %s",
593
+ container,
594
+ pod_name,
595
+ )
596
+ else:
597
+ self.log.error(
598
+ "Invalid type %s specified for container names input parameter", type(requested)
599
+ )
600
+ else:
601
+ self.log.error("Could not retrieve containers for the pod: %s", pod_name)
602
+ return containers_to_log
603
+
604
def fetch_requested_init_container_logs(
    self,
    pod: V1Pod,
    init_containers: Iterable[str] | str | Literal[True] | None,
    follow_logs=False,
    container_name_log_prefix_enabled: bool = True,
    log_formatter: Callable[[str, str], str] | None = None,
) -> list[PodLoggingStatus]:
    """
    Fetch the logs of the requested init containers, one container at a time.

    The selection is reconciled against the init containers of the pod, then each
    selected container is awaited and its logs are fetched through
    ``fetch_container_logs``.

    :param pod: pod whose init containers are read
    :param init_containers: one name, an iterable of names, or ``True`` for all
    :param follow_logs: forwarded as ``follow`` to ``fetch_container_logs``
    :param container_name_log_prefix_enabled: forwarded to ``fetch_container_logs``
    :param log_formatter: forwarded to ``fetch_container_logs``
    :return: one logging status per container processed, in processing order

    :meta private:
    """
    spec_order = self.get_init_container_names(pod)
    selected = self._reconcile_requested_log_containers(
        requested=init_containers,
        actual=spec_order,
        pod_name=pod.metadata.name,
    )
    statuses = []
    # Init containers run sequentially, so follow the order of the pod spec rather
    # than the order in which the caller listed them.
    for name in sorted(selected, key=spec_order.index):
        self._await_init_container_start(pod=pod, container_name=name)
        statuses.append(
            self.fetch_container_logs(
                pod=pod,
                container_name=name,
                follow=follow_logs,
                container_name_log_prefix_enabled=container_name_log_prefix_enabled,
                log_formatter=log_formatter,
            )
        )
    return statuses
639
+
640
def fetch_requested_container_logs(
    self,
    pod: V1Pod,
    containers: Iterable[str] | str | Literal[True],
    follow_logs=False,
    container_name_log_prefix_enabled: bool = True,
    log_formatter: Callable[[str, str], str] | None = None,
) -> list[PodLoggingStatus]:
    """
    Fetch the logs of the requested containers, one container at a time.

    The selection is reconciled against the container names reported by
    ``get_container_names`` and each match is handed to ``fetch_container_logs``.

    :param pod: pod whose containers are read
    :param containers: one name, an iterable of names, or ``True`` for all
    :param follow_logs: forwarded as ``follow`` to ``fetch_container_logs``
    :param container_name_log_prefix_enabled: forwarded to ``fetch_container_logs``
    :param log_formatter: forwarded to ``fetch_container_logs``
    :return: one logging status per container processed, in processing order

    :meta private:
    """
    selected = self._reconcile_requested_log_containers(
        requested=containers,
        actual=self.get_container_names(pod),
        pod_name=pod.metadata.name,
    )
    return [
        self.fetch_container_logs(
            pod=pod,
            container_name=name,
            follow=follow_logs,
            container_name_log_prefix_enabled=container_name_log_prefix_enabled,
            log_formatter=log_formatter,
        )
        for name in selected
    ]
672
+
673
def await_container_completion(self, pod: V1Pod, container_name: str, polling_time: float = 1) -> None:
    """
    Block until the given container of the given pod is reported as completed.

    The pod is re-read before every check; between checks the method logs a
    message and sleeps.

    :param pod: pod spec that will be monitored
    :param container_name: name of the container within the pod to monitor
    :param polling_time: seconds to sleep between two container status checks.
        Defaults to 1s.
    """
    while not container_is_completed(self.read_pod(pod), container_name):
        self.log.info("Waiting for container '%s' state to be completed", container_name)
        time.sleep(polling_time)
243
689
 
244
- def parse_log_line(self, line: str) -> Tuple[Optional[DateTime], str]:
690
def await_pod_completion(
    self, pod: V1Pod, istio_enabled: bool = False, container_name: str = "base"
) -> V1Pod:
    """
    Poll a pod until it is finished and return the last pod object that was read.

    The pod counts as finished when its phase is one of ``PodPhase.terminal_states``
    or, when ``istio_enabled`` is set, when ``container_name`` is reported as
    completed.

    :param pod: pod spec that will be monitored
    :param istio_enabled: whether istio is enabled in the namespace
    :param container_name: name of the container within the pod
    :return: the remote pod as read on the final poll
    """
    while True:
        remote_pod = self.read_pod(pod)
        phase_is_terminal = remote_pod.status.phase in PodPhase.terminal_states
        if phase_is_terminal or (istio_enabled and container_is_completed(remote_pod, container_name)):
            return remote_pod
        self.log.info("Pod %s has phase %s", pod.metadata.name, remote_pod.status.phase)
        time.sleep(2)
263
710
 
264
711
def container_is_running(self, pod: V1Pod, container_name: str) -> bool:
    """
    Re-read the pod and report whether the named container is running.

    :param pod: pod to read
    :param container_name: container to check
    :return: result of the module-level ``container_is_running`` check on the fresh pod
    """
    return container_is_running(pod=self.read_pod(pod), container_name=container_name)
268
715
 
269
- @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
716
def container_is_terminated(self, pod: V1Pod, container_name: str) -> bool:
    """
    Re-read the pod and report whether the named container is terminated.

    :param pod: pod to read
    :param container_name: container to check
    :return: result of the module-level ``container_is_terminated`` check on the fresh pod
    """
    return container_is_terminated(pod=self.read_pod(pod), container_name=container_name)
720
+
721
@tenacity.retry(stop=tenacity.stop_after_attempt(6), wait=tenacity.wait_exponential(max=15), reraise=True)
def read_pod_logs(
    self,
    pod: V1Pod,
    container_name: str,
    tail_lines: int | None = None,
    timestamps: bool = False,
    since_seconds: int | None = None,
    follow=True,
    post_termination_timeout: int = 120,
    **kwargs,
) -> PodLogsConsumer:
    """
    Open a log stream for one container of the pod.

    :param pod: pod to read from
    :param container_name: container whose logs are requested
    :param tail_lines: passed to the API only when truthy
    :param timestamps: passed to the API as given
    :param since_seconds: passed to the API only when truthy
    :param follow: passed to the API as given
    :param post_termination_timeout: handed to the returned ``PodLogsConsumer``
    :param kwargs: extra arguments for ``read_namespaced_pod_log``; they override
        ``since_seconds`` / ``tail_lines`` when the same key is given
    :return: a ``PodLogsConsumer`` wrapping the raw (non-preloaded) response
    :raises HTTPError: logged and re-raised when the API call fails
    """
    # Falsy values (None or 0) are left out of the request entirely.
    optional_args = {
        key: value
        for key, value in (("since_seconds", since_seconds), ("tail_lines", tail_lines))
        if value
    }
    optional_args.update(kwargs)

    try:
        response = self._client.read_namespaced_pod_log(
            name=pod.metadata.name,
            namespace=pod.metadata.namespace,
            container=container_name,
            follow=follow,
            timestamps=timestamps,
            _preload_content=False,
            **optional_args,
        )
    except HTTPError:
        self.log.exception("There was an error reading the kubernetes API.")
        raise

    return PodLogsConsumer(
        response=response,
        pod=pod,
        pod_manager=self,
        container_name=container_name,
        post_termination_timeout=post_termination_timeout,
    )
763
+
764
@tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
def get_init_container_names(self, pod: V1Pod) -> list[str]:
    """
    Return the names of the init containers declared in the given pod object.

    The names are taken from ``pod.spec.init_containers`` of the object passed in,
    in spec order; nothing is filtered out.

    :param pod: pod whose spec is inspected
    :return: init container names, or an empty list when the pod declares none

    :meta private:
    """
    # ``init_containers`` is None (not an empty list) on pods without init containers.
    return [container_spec.name for container_spec in pod.spec.init_containers or []]
772
+
300
773
@tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
def get_container_names(self, pod: V1Pod) -> list[str]:
    """
    Return container names from the POD except for the airflow-xcom-sidecar container.

    The pod is re-read through ``read_pod`` and the names come from that fresh copy.

    :param pod: pod to look up
    :return: container names in spec order, without the xcom sidecar

    :meta private:
    """
    names = []
    for container_spec in self.read_pod(pod).spec.containers:
        if container_spec.name == PodDefaults.SIDECAR_CONTAINER_NAME:
            continue
        names.append(container_spec.name)
    return names
786
+
787
@tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
def read_pod_events(self, pod: V1Pod) -> CoreV1EventList:
    """
    Read events from the POD.

    Events are listed in the pod's namespace and filtered on
    ``involvedObject.name`` equal to the pod name.

    :param pod: pod whose events are listed
    :return: the event list returned by the API
    :raises AirflowException: when the API call fails with ``HTTPError``; the
        original error is attached as the cause
    """
    try:
        return self._client.list_namespaced_event(
            namespace=pod.metadata.namespace, field_selector=f"involvedObject.name={pod.metadata.name}"
        )
    except HTTPError as e:
        # explicit chaining keeps the HTTP error visible as the direct cause
        raise AirflowException(f"There was an error reading the kubernetes API: {e}") from e
309
796
 
310
797
@tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(), reraise=True)
def read_pod(self, pod: V1Pod) -> V1Pod:
    """
    Read POD information.

    :param pod: pod identifying the name and namespace to read
    :return: the pod as returned by the API
    :raises AirflowException: when the API call fails with ``HTTPError``; the
        original error is attached as the cause
    """
    try:
        return self._client.read_namespaced_pod(pod.metadata.name, pod.metadata.namespace)
    except HTTPError as e:
        # explicit chaining keeps the HTTP error visible as the direct cause
        raise AirflowException(f"There was an error reading the kubernetes API: {e}") from e
804
+
805
def await_xcom_sidecar_container_start(
    self, pod: V1Pod, timeout: int = 900, log_interval: int = 30
) -> None:
    """
    Wait until the xcom sidecar container is running, polling once per second.

    :param pod: pod that carries the sidecar container
    :param timeout: maximum number of seconds to wait
    :param log_interval: minimum number of seconds between "still waiting" warnings
    :raises AirflowException: when the sidecar is already terminated, or when it
        has not started within ``timeout`` seconds
    """
    self.log.info("Checking if xcom sidecar container is started.")
    # A monotonic clock keeps the elapsed-time arithmetic immune to wall-clock jumps.
    start_time = time.monotonic()
    last_log_time = start_time

    while True:
        elapsed_time = time.monotonic() - start_time
        if self.container_is_running(pod, PodDefaults.SIDECAR_CONTAINER_NAME):
            self.log.info("The xcom sidecar container has started.")
            return
        if self.container_is_terminated(pod, PodDefaults.SIDECAR_CONTAINER_NAME):
            raise AirflowException(
                "Xcom sidecar container is already terminated! Not possible to read xcom output of task."
            )
        if (time.monotonic() - last_log_time) >= log_interval:
            self.log.warning(
                "Still waiting for the xcom sidecar container to start. Elapsed time: %d seconds.",
                int(elapsed_time),
            )
            last_log_time = time.monotonic()
        if elapsed_time > timeout:
            # Reported in seconds: integer minutes would read "0 minutes" for short timeouts.
            raise AirflowException(f"Xcom sidecar container did not start within {timeout} seconds.")
        time.sleep(1)
317
833
 
318
834
def extract_xcom(self, pod: V1Pod) -> str:
    """
    Retrieve XCom value and kill xcom sidecar container.

    The sidecar kill is attempted whether or not reading the value succeeded.

    :param pod: pod that carries the xcom sidecar
    :return: the string returned by ``extract_xcom_json``
    """
    try:
        return self.extract_xcom_json(pod)
    finally:
        self.extract_xcom_kill(pod)
841
+
842
@tenacity.retry(
    stop=tenacity.stop_after_attempt(5),
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
    reraise=True,
)
def extract_xcom_json(self, pod: V1Pod) -> str:
    """
    Retrieve XCom value and also check if xcom json is valid.

    A shell one-liner is executed in the sidecar container: it prints
    ``return.json`` from the xcom mount path when that file is non-empty and
    ``EMPTY_XCOM_RESULT`` otherwise.

    :param pod: pod that carries the xcom sidecar
    :return: the raw command output (still a string; the caller converts it)
    :raises AirflowException: when no output could be read at all
    """
    xcom_file = f"{PodDefaults.XCOM_MOUNT_PATH}/return.json"
    command = f"if [ -s {xcom_file} ]; then cat {xcom_file}; else echo {EMPTY_XCOM_RESULT}; fi"
    exec_stream = kubernetes_stream(
        self._client.connect_get_namespaced_pod_exec,
        pod.metadata.name,
        pod.metadata.namespace,
        container=PodDefaults.SIDECAR_CONTAINER_NAME,
        command=["/bin/sh", "-c", command],
        stdin=False,
        stdout=True,
        stderr=True,
        tty=False,
        _preload_content=False,
    )
    with closing(exec_stream) as client:
        self.log.info("Running command... %s", command)
        client.run_forever()
        if client.peek_stderr():
            self.log.error("stderr from command: %s", client.read_stderr())
        result = client.read_all()
        is_real_payload = result and result.rstrip() != EMPTY_XCOM_RESULT
        if is_real_payload:
            # Parsed only to validate: invalid JSON raises here, and the parsed
            # value itself is discarded.
            json.loads(result)

    if result is None:
        raise AirflowException(f"Failed to extract xcom from pod: {pod.metadata.name}")
    return result
339
886
 
340
- def _exec_pod_command(self, resp, command: str) -> Optional[str]:
341
- if resp.is_open():
342
- self.log.info('Running command... %s\n', command)
343
- resp.write_stdin(command + '\n')
344
- while resp.is_open():
345
- resp.update(timeout=1)
346
- if resp.peek_stdout():
347
- return resp.read_stdout()
348
- if resp.peek_stderr():
349
- self.log.info("stderr from command: %s", resp.read_stderr())
350
- break
887
@tenacity.retry(
    stop=tenacity.stop_after_attempt(5),
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
    reraise=True,
)
def extract_xcom_kill(self, pod: V1Pod):
    """
    Kill xcom sidecar container.

    Opens an interactive ``/bin/sh`` in the sidecar and writes a ``kill -2``
    command to it through ``_exec_pod_command``.

    :param pod: pod that carries the xcom sidecar
    """
    shell = kubernetes_stream(
        self._client.connect_get_namespaced_pod_exec,
        pod.metadata.name,
        pod.metadata.namespace,
        container=PodDefaults.SIDECAR_CONTAINER_NAME,
        command=["/bin/sh"],
        stdin=True,
        stdout=True,
        stderr=True,
        tty=False,
        _preload_content=False,
    )
    with closing(shell) as resp:
        self._exec_pod_command(resp, "kill -2 $(pgrep -u $(id -u) -f 'sh')")
909
+
910
+ def _exec_pod_command(self, resp, command: str) -> str | None:
911
+ res = ""
912
+ if not resp.is_open():
913
+ return None
914
+ self.log.info("Running command... %s", command)
915
+ resp.write_stdin(f"{command}\n")
916
+ while resp.is_open():
917
+ resp.update(timeout=1)
918
+ while resp.peek_stdout():
919
+ res += resp.read_stdout()
920
+ error_res = ""
921
+ while resp.peek_stderr():
922
+ error_res += resp.read_stderr()
923
+ if error_res:
924
+ self.log.info("stderr from command: %s", error_res)
925
+ break
926
+ if res:
927
+ return res
351
928
  return None
929
+
930
def _await_init_container_start(self, pod: V1Pod, container_name: str):
    """
    Poll the pod once per second until the named init container has left the waiting state.

    The wait ends when the pod has a status, its phase is not ``PodPhase.PENDING``,
    a status entry exists for the container and that container is not waiting.
    There is no timeout.

    :param pod: pod to poll
    :param container_name: init container to wait for
    """
    while True:
        remote_pod = self.read_pod(pod)
        started = (
            remote_pod.status is not None
            and remote_pod.status.phase != PodPhase.PENDING
            and get_container_status(remote_pod, container_name) is not None
            and not container_is_wait(remote_pod, container_name)
        )
        if started:
            return
        time.sleep(1)
943
+
944
+
945
class OnFinishAction(str, enum.Enum):
    """
    Action to take when the pod finishes.

    Members are strings, so they compare equal to their plain values.
    """

    # leave the pod in place
    KEEP_POD = "keep_pod"
    # delete the pod
    DELETE_POD = "delete_pod"
    # delete the pod only when it succeeded
    DELETE_SUCCEEDED_POD = "delete_succeeded_pod"
951
+
952
+
953
def is_log_group_marker(line: str) -> bool:
    """
    Tell whether a log line opens or closes a log group.

    :param line: log line to inspect
    :return: ``True`` when the line starts with ``::group::`` or ``::endgroup::``
    """
    return line.startswith(("::group::", "::endgroup::"))
956
+
957
+
958
def parse_log_line(line: str) -> tuple[DateTime | None, str]:
    """
    Split a K8s log line into its leading timestamp and the message.

    The stripped line is cut at the first space and the first part is parsed
    with ``pendulum.parse``.

    :param line: k8s log line
    :return: ``(timestamp, message)`` when the line starts with a parseable
        timestamp; otherwise ``(None, line)`` with the line exactly as received
    """
    head, separator, rest = line.strip().partition(" ")
    if separator:
        try:
            return cast("DateTime", pendulum.parse(head)), rest
        except ParserError:
            # not a timestamp: fall through and hand the line back untouched
            pass
    return None, line
973
+
974
+
975
+ class AsyncPodManager(LoggingMixin):
976
+ """Create, monitor, and otherwise interact with Kubernetes pods for use with the KubernetesPodTriggerer."""
977
+
978
def __init__(
    self,
    async_hook: AsyncKubernetesHook,
    callbacks: list[type[KubernetesPodOperatorCallback]] | None = None,
):
    """
    Create the async pod manager.

    :param async_hook: hook through which the manager reads pods, events and logs
    :param callbacks: optional callback classes; stored as an empty list when omitted
    """
    super().__init__()
    self._hook = async_hook
    self._watch = watch.Watch()
    self._callbacks = callbacks if callbacks else []
    self.stop_watching_events = False
994
+
995
@tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(), reraise=True)
async def read_pod(self, pod: V1Pod) -> V1Pod:
    """
    Read POD information.

    :param pod: pod identifying the name and namespace to read
    :return: the pod returned by the hook's ``get_pod``
    """
    metadata = pod.metadata
    return await self._hook.get_pod(metadata.name, metadata.namespace)
1002
+
1003
@tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(), reraise=True)
async def read_pod_events(self, pod: V1Pod) -> CoreV1EventList:
    """
    Get pod's events.

    :param pod: pod identifying the name and namespace to query
    :return: the event list returned by the hook's ``get_pod_events``
    """
    metadata = pod.metadata
    return await self._hook.get_pod_events(metadata.name, metadata.namespace)
1010
+
1011
async def watch_pod_events(self, pod: V1Pod, check_interval: float = 1) -> None:
    """
    Read pod events and write them into the log.

    Thin wrapper around the module-level ``watch_pod_events`` coroutine, with this
    manager passed as ``pod_manager``.

    :param pod: pod whose events are watched
    :param check_interval: forwarded unchanged to the module-level coroutine
    """
    await watch_pod_events(
        pod_manager=self,
        pod=pod,
        check_interval=check_interval,
    )
1014
+
1015
async def await_pod_start(
    self, pod: V1Pod, schedule_timeout: int = 120, startup_timeout: int = 120, check_interval: float = 1
) -> None:
    """
    Wait for the pod to reach phase other than ``Pending``.

    Thin wrapper around the module-level ``await_pod_start`` coroutine, with this
    manager passed as ``pod_manager``.

    :param pod: pod to wait for
    :param schedule_timeout: Timeout (in seconds) for the pod to stay in the schedule state
        (if the pod takes too long to be scheduled, the task fails)
    :param startup_timeout: Timeout (in seconds) for startup of the pod
        (if the pod is pending for too long after being scheduled, the task fails)
    :param check_interval: Interval (in seconds) between checks
    """
    options = {
        "schedule_timeout": schedule_timeout,
        "startup_timeout": startup_timeout,
        "check_interval": check_interval,
    }
    await await_pod_start(pod_manager=self, pod=pod, **options)
1036
+
1037
@tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_exponential(), reraise=True)
async def fetch_container_logs_before_current_sec(
    self, pod: V1Pod, container_name: str, since_time: DateTime | None = None
) -> DateTime | None:
    """
    Asynchronously read a container's logs and publish the lines older than the current second.

    Lines without a timestamp are treated as continuations of the previous entry.
    Processing stops at the first line stamped with the current second, so the
    next read (which can only be bounded by whole seconds) does not repeat it.

    :param pod: The pod specification to monitor.
    :param container_name: The name of the container within the pod.
    :param since_time: The timestamp from which to start reading logs.
    :return: The time captured just before the read, to be used as ``since_time``
        for the next call.
    """

    def publish(entry: str) -> None:
        # group markers are printed as-is; everything else goes through the logger
        if is_log_group_marker(entry):
            print(entry)
        else:
            self.log.info("[%s] %s", container_name, entry)

    now = pendulum.now()
    logs = await self._hook.read_logs(
        name=pod.metadata.name,
        namespace=pod.metadata.namespace,
        container_name=container_name,
        since_seconds=(math.ceil((now - since_time).total_seconds()) if since_time else None),
    )
    pending = None  # entry being assembled; published once the next entry starts
    try:
        current_second = now.replace(microsecond=0)
        for line in logs:
            line_timestamp, message = parse_log_line(line)
            if line_timestamp:
                # Skip log lines from the current second to prevent duplicate entries on the next read.
                # The API only allows specifying 'since_seconds', not an exact timestamp.
                if line_timestamp.replace(microsecond=0) == current_second:
                    break
                # a timestamp marks a new entry, so the previous one is complete
                if pending is not None:
                    publish(pending)
                pending = message
            elif pending:
                # continuation of the previous log line
                pending = f"{pending}\n{message}"
    finally:
        # flush the last assembled entry
        if pending is not None:
            publish(pending)
        # NOTE(review): returning from ``finally`` discards any exception raised while
        # iterating the logs; this looks deliberate (interruptions are suppressed) - confirm.
        return now  # the next fetch starts from here, so current-second lines are read then