apache-airflow-providers-cncf-kubernetes 10.9.0rc1__py3-none-any.whl → 10.11.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. airflow/providers/cncf/kubernetes/__init__.py +3 -3
  2. airflow/providers/cncf/kubernetes/exceptions.py +9 -3
  3. airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +24 -5
  4. airflow/providers/cncf/kubernetes/get_provider_info.py +6 -0
  5. airflow/providers/cncf/kubernetes/hooks/kubernetes.py +58 -21
  6. airflow/providers/cncf/kubernetes/kube_config.py +24 -1
  7. airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +63 -16
  8. airflow/providers/cncf/kubernetes/operators/job.py +9 -3
  9. airflow/providers/cncf/kubernetes/operators/pod.py +36 -45
  10. airflow/providers/cncf/kubernetes/operators/resource.py +2 -8
  11. airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +18 -3
  12. airflow/providers/cncf/kubernetes/secret.py +3 -0
  13. airflow/providers/cncf/kubernetes/triggers/pod.py +56 -24
  14. airflow/providers/cncf/kubernetes/utils/pod_manager.py +256 -111
  15. airflow/providers/cncf/kubernetes/version_compat.py +5 -1
  16. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/METADATA +19 -17
  17. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/RECORD +21 -20
  18. apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info/licenses/NOTICE +5 -0
  19. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/WHEEL +0 -0
  20. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/entry_points.txt +0 -0
  21. {airflow/providers/cncf/kubernetes → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info/licenses}/LICENSE +0 -0
--- a/airflow/providers/cncf/kubernetes/operators/pod.py
+++ b/airflow/providers/cncf/kubernetes/operators/pod.py
@@ -41,11 +41,6 @@ from kubernetes.stream import stream
 from urllib3.exceptions import HTTPError
 
 from airflow.configuration import conf
-from airflow.exceptions import (
-    AirflowException,
-    AirflowSkipException,
-    TaskDeferred,
-)
 from airflow.providers.cncf.kubernetes import pod_generator
 from airflow.providers.cncf.kubernetes.backcompat.backwards_compat_converters import (
     convert_affinity,
@@ -65,6 +60,7 @@ from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import (
     POD_NAME_MAX_LENGTH,
     add_unique_suffix,
     create_unique_id,
+    generic_api_retry,
 )
 from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator
 from airflow.providers.cncf.kubernetes.triggers.pod import KubernetesPodTrigger
@@ -82,12 +78,13 @@ from airflow.providers.cncf.kubernetes.utils.pod_manager import (
     PodPhase,
 )
 from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_1_PLUS
-from airflow.providers.common.compat.sdk import XCOM_RETURN_KEY
+from airflow.providers.common.compat.sdk import XCOM_RETURN_KEY, AirflowSkipException, TaskDeferred
 
 if AIRFLOW_V_3_1_PLUS:
     from airflow.sdk import BaseOperator
 else:
     from airflow.models import BaseOperator
+from airflow.exceptions import AirflowException
 from airflow.settings import pod_mutation_hook
 from airflow.utils import yaml
 from airflow.utils.helpers import prune_dict, validate_key
@@ -126,6 +123,10 @@ class PodCredentialsExpiredFailure(AirflowException):
     """When pod fails to refresh credentials."""
 
 
+class FoundMoreThanOnePodFailure(AirflowException):
+    """When during reconnect more than one matching pod was found."""
+
+
 class KubernetesPodOperator(BaseOperator):
     """
     Execute a task in a Kubernetes Pod.
@@ -563,6 +564,7 @@ class KubernetesPodOperator(BaseOperator):
             callback.on_sync_client_creation(client=client, operator=self)
         return client
 
+    @generic_api_retry
     def find_pod(self, namespace: str, context: Context, *, exclude_checked: bool = True) -> k8s.V1Pod | None:
         """Return an already-running pod for this task instance if one exists."""
         label_selector = self._build_find_pod_label_selector(context, exclude_checked=exclude_checked)
@@ -579,7 +581,7 @@ class KubernetesPodOperator(BaseOperator):
             self.log_matching_pod(pod=pod, context=context)
         elif num_pods > 1:
             if self.reattach_on_restart:
-                raise AirflowException(f"More than one pod running with labels {label_selector}")
+                raise FoundMoreThanOnePodFailure(f"More than one pod running with labels {label_selector}")
             self.log.warning("Found more than one pod running with labels %s, resolving ...", label_selector)
             pod = self.process_duplicate_label_pods(pod_list)
             self.log_matching_pod(pod=pod, context=context)
@@ -868,6 +870,7 @@ class KubernetesPodOperator(BaseOperator):
                 get_logs=self.get_logs,
                 startup_timeout=self.startup_timeout_seconds,
                 startup_check_interval=self.startup_check_interval_seconds,
+                schedule_timeout=self.schedule_timeout_seconds,
                 base_container_name=self.base_container_name,
                 on_finish_action=self.on_finish_action.value,
                 last_log_time=last_log_time,
@@ -898,17 +901,6 @@ class KubernetesPodOperator(BaseOperator):
             if not self.pod:
                 raise PodNotFoundException("Could not find pod after resuming from deferral")
 
-            if event["status"] != "running":
-                for callback in self.callbacks:
-                    callback.on_operator_resuming(
-                        pod=self.pod,
-                        event=event,
-                        client=self.client,
-                        mode=ExecutionMode.SYNC,
-                        context=context,
-                        operator=self,
-                    )
-
             follow = self.logging_interval is None
             last_log_time = event.get("last_log_time")
 
@@ -941,33 +933,17 @@ class KubernetesPodOperator(BaseOperator):
                     )
                 message = event.get("stack_trace", event["message"])
                 raise AirflowException(message)
-
-                return xcom_sidecar_output
-
-            if event["status"] == "running":
-                if self.get_logs:
-                    self.log.info("Resuming logs read from time %r", last_log_time)
-
-                    pod_log_status = self.pod_manager.fetch_container_logs(
-                        pod=self.pod,
-                        container_name=self.base_container_name,
-                        follow=follow,
-                        since_time=last_log_time,
-                        container_name_log_prefix_enabled=self.container_name_log_prefix_enabled,
-                        log_formatter=self.log_formatter,
-                    )
-
-                    self.invoke_defer_method(pod_log_status.last_log_time)
-                else:
-                    self.invoke_defer_method()
         except TaskDeferred:
             raise
         finally:
             self._clean(event=event, context=context, result=xcom_sidecar_output)
+        if self.do_xcom_push:
+            return xcom_sidecar_output
 
     def _clean(self, event: dict[str, Any], result: dict | None, context: Context) -> None:
-        if event["status"] == "running":
+        if self.pod is None:
             return
+
         istio_enabled = self.is_istio_enabled(self.pod)
         # Skip await_pod_completion when the event is 'timeout' due to the pod can hang
         # on the ErrImagePull or ContainerCreating step and it will never complete
@@ -1208,6 +1184,7 @@ class KubernetesPodOperator(BaseOperator):
             **self.labels,
             **self._get_ti_pod_labels(context, include_try_number=False),
         }
+        labels = _normalize_labels_dict(labels)
         label_strings = [f"{label_id}={label}" for label_id, label in sorted(labels.items())]
         labels_value = ",".join(label_strings)
         if exclude_checked:
@@ -1225,11 +1202,16 @@ class KubernetesPodOperator(BaseOperator):
     def patch_already_checked(self, pod: k8s.V1Pod, *, reraise=True):
         """Add an "already checked" label to ensure we don't reattach on retries."""
         with _optionally_suppress(reraise=reraise):
-            self.client.patch_namespaced_pod(
-                name=pod.metadata.name,
-                namespace=pod.metadata.namespace,
-                body={"metadata": {"labels": {self.POD_CHECKED_KEY: "True"}}},
-            )
+
+            @generic_api_retry
+            def _patch_with_retry():
+                self.client.patch_namespaced_pod(
+                    name=pod.metadata.name,
+                    namespace=pod.metadata.namespace,
+                    body={"metadata": {"labels": {self.POD_CHECKED_KEY: "True"}}},
+                )
+
+            _patch_with_retry()
 
     def on_kill(self) -> None:
         self._killed = True
@@ -1242,8 +1224,12 @@ class KubernetesPodOperator(BaseOperator):
             if self.termination_grace_period is not None:
                 kwargs.update(grace_period_seconds=self.termination_grace_period)
 
-            try:
+            @generic_api_retry
+            def _delete_with_retry():
                 self.client.delete_namespaced_pod(**kwargs)
+
+            try:
+                _delete_with_retry()
             except kubernetes.client.exceptions.ApiException:
                 self.log.exception("Unable to delete pod %s", self.pod.metadata.name)
 
@@ -1280,7 +1266,7 @@ class KubernetesPodOperator(BaseOperator):
             kind="Pod",
             metadata=k8s.V1ObjectMeta(
                 namespace=self.namespace,
-                labels=self.labels,
+                labels=_normalize_labels_dict(self.labels),
                 name=self.name,
                 annotations=self.annotations,
             ),
@@ -1437,3 +1423,8 @@ class _optionally_suppress(AbstractContextManager):
             logger = logging.getLogger(__name__)
             logger.exception(excinst)
         return True
+
+
+def _normalize_labels_dict(labels: dict) -> dict:
+    """Return a copy of the labels dict with all None values replaced by empty strings."""
+    return {k: ("" if v is None else v) for k, v in labels.items()}
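The new helper is small enough to illustrate in isolation; a minimal sketch of its effect (the function body is copied from the hunk above, the sample labels are made up):

# Sketch: effect of the new label-normalization helper.
def _normalize_labels_dict(labels: dict) -> dict:
    """Return a copy of the labels dict with all None values replaced by empty strings."""
    return {k: ("" if v is None else v) for k, v in labels.items()}

print(_normalize_labels_dict({"release": None, "dag_id": "example_dag"}))
# -> {'release': '', 'dag_id': 'example_dag'}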
--- a/airflow/providers/cncf/kubernetes/operators/resource.py
+++ b/airflow/providers/cncf/kubernetes/operators/resource.py
@@ -23,13 +23,12 @@ from collections.abc import Sequence
 from functools import cached_property
 from typing import TYPE_CHECKING
 
-import tenacity
 import yaml
 from kubernetes.utils import create_from_yaml
 
 from airflow.exceptions import AirflowException
 from airflow.providers.cncf.kubernetes.hooks.kubernetes import KubernetesHook
-from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import should_retry_creation
+from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import generic_api_retry
 from airflow.providers.cncf.kubernetes.utils.delete_from import delete_from_yaml
 from airflow.providers.cncf.kubernetes.utils.k8s_resource_iterator import k8s_resource_iterator
 from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_1_PLUS
@@ -132,12 +131,7 @@ class KubernetesCreateResourceOperator(KubernetesResourceBaseOperator):
         else:
             self.custom_object_client.create_cluster_custom_object(group, version, plural, body)
 
-    @tenacity.retry(
-        stop=tenacity.stop_after_attempt(3),
-        wait=tenacity.wait_random_exponential(),
-        reraise=True,
-        retry=tenacity.retry_if_exception(should_retry_creation),
-    )
+    @generic_api_retry
     def _create_objects(self, objects):
         self.log.info("Starting resource creation")
         if not self.custom_resource_definition:
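The `generic_api_retry` decorator referenced here and throughout `operators/pod.py` above is defined in `kubernetes_helper_functions.py`, whose diff is not shown. A rough, hypothetical sketch of such a decorator, inferred only from the tenacity parameters it replaces (the predicate name and the retried status codes are assumptions, not the provider's actual code):

# Hypothetical sketch of a reusable Kubernetes-API retry decorator built on tenacity.
import tenacity
from kubernetes.client.exceptions import ApiException

def _is_transient_api_error(exc: BaseException) -> bool:
    # Assumption: treat throttling and server-side errors as retryable.
    return isinstance(exc, ApiException) and exc.status in (429, 500, 502, 503, 504)

generic_api_retry = tenacity.retry(
    stop=tenacity.stop_after_attempt(3),
    wait=tenacity.wait_random_exponential(),
    reraise=True,
    retry=tenacity.retry_if_exception(_is_transient_api_error),
)

It is then applied as a bare decorator (`@generic_api_retry`), exactly as the hunks above do.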
--- a/airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py
+++ b/airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py
@@ -286,6 +286,16 @@ class SparkKubernetesOperator(KubernetesPodOperator):
     def custom_obj_api(self) -> CustomObjectsApi:
         return CustomObjectsApi()
 
+    @cached_property
+    def launcher(self) -> CustomObjectLauncher:
+        return CustomObjectLauncher(
+            name=self.name,
+            namespace=self.namespace,
+            kube_client=self.client,
+            custom_obj_api=self.custom_obj_api,
+            template_body=self.template_body,
+        )
+
     def get_or_create_spark_crd(self, launcher: CustomObjectLauncher, context) -> k8s.V1Pod:
         if self.reattach_on_restart:
             driver_pod = self.find_spark_job(context)
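Exposing the launcher as a cached property (presumably `functools.cached_property`) means it is built lazily on first access and then reused for the lifetime of the operator instance, which is what lets the reworked `on_kill` below rely on `self.launcher` unconditionally. A generic illustration of that behaviour, with made-up names unrelated to the provider:

# Generic functools.cached_property behaviour; Example/value are illustrative names only.
from functools import cached_property

class Example:
    @cached_property
    def value(self) -> list:
        print("computed once")
        return []

e = Example()
assert e.value is e.value  # second access reuses the cached object, no recompute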
@@ -323,6 +333,8 @@ class SparkKubernetesOperator(KubernetesPodOperator):
                 )
                 self.pod = existing_pod
                 self.pod_request_obj = None
+                if self.pod.metadata.name.endswith("-driver"):
+                    self.name = self.pod.metadata.name.removesuffix("-driver")
                 return
 
         if "spark" not in template_body:
@@ -361,9 +373,12 @@ class SparkKubernetesOperator(KubernetesPodOperator):
         return self.find_spark_job(context, exclude_checked=exclude_checked)
 
     def on_kill(self) -> None:
-        if self.launcher:
-            self.log.debug("Deleting spark job for task %s", self.task_id)
-            self.launcher.delete_spark_job()
+        self.log.debug("Deleting spark job for task %s", self.task_id)
+        job_name = self.name
+        if self.pod and self.pod.metadata and self.pod.metadata.name:
+            if self.pod.metadata.name.endswith("-driver"):
+                job_name = self.pod.metadata.name.removesuffix("-driver")
+        self.launcher.delete_spark_job(spark_job_name=job_name)
 
     def patch_already_checked(self, pod: k8s.V1Pod, *, reraise=True):
         """Add an "already checked" annotation to ensure we don't reattach on retries."""
--- a/airflow/providers/cncf/kubernetes/secret.py
+++ b/airflow/providers/cncf/kubernetes/secret.py
@@ -121,5 +121,8 @@ class Secret(K8SModel):
             and self.key == other.key
         )
 
+    def __hash__(self):
+        return hash((self.deploy_type, self.deploy_target, self.secret, self.key))
+
     def __repr__(self):
         return f"Secret({self.deploy_type}, {self.deploy_target}, {self.secret}, {self.key})"
--- a/airflow/providers/cncf/kubernetes/triggers/pod.py
+++ b/airflow/providers/cncf/kubernetes/triggers/pod.py
@@ -26,8 +26,10 @@ from typing import TYPE_CHECKING, Any, cast
 
 import tenacity
 
+from airflow.providers.cncf.kubernetes.exceptions import KubernetesApiPermissionError
 from airflow.providers.cncf.kubernetes.hooks.kubernetes import AsyncKubernetesHook
 from airflow.providers.cncf.kubernetes.utils.pod_manager import (
+    AsyncPodManager,
     OnFinishAction,
     PodLaunchTimeoutException,
     PodPhase,
@@ -69,6 +71,7 @@ class KubernetesPodTrigger(BaseTrigger):
     :param get_logs: get the stdout of the container as logs of the tasks.
     :param startup_timeout: timeout in seconds to start up the pod.
     :param startup_check_interval: interval in seconds to check if the pod has already started.
+    :param schedule_timeout: timeout in seconds to schedule pod in cluster.
     :param on_finish_action: What to do when the pod reaches its final state, or the execution is interrupted.
         If "delete_pod", the pod will be deleted regardless its state; if "delete_succeeded_pod",
         only succeeded pod will be deleted. You can set to "keep_pod" to keep the pod.
@@ -91,7 +94,8 @@ class KubernetesPodTrigger(BaseTrigger):
         in_cluster: bool | None = None,
         get_logs: bool = True,
         startup_timeout: int = 120,
-        startup_check_interval: int = 5,
+        startup_check_interval: float = 5,
+        schedule_timeout: int = 120,
         on_finish_action: str = "delete_pod",
         last_log_time: DateTime | None = None,
         logging_interval: int | None = None,
@@ -110,11 +114,11 @@ class KubernetesPodTrigger(BaseTrigger):
         self.get_logs = get_logs
         self.startup_timeout = startup_timeout
         self.startup_check_interval = startup_check_interval
+        self.schedule_timeout = schedule_timeout
         self.last_log_time = last_log_time
         self.logging_interval = logging_interval
         self.on_finish_action = OnFinishAction(on_finish_action)
         self.trigger_kwargs = trigger_kwargs or {}
-
         self._since_time = None
 
     def serialize(self) -> tuple[str, dict[str, Any]]:
@@ -133,6 +137,7 @@ class KubernetesPodTrigger(BaseTrigger):
                 "get_logs": self.get_logs,
                 "startup_timeout": self.startup_timeout,
                 "startup_check_interval": self.startup_check_interval,
+                "schedule_timeout": self.schedule_timeout,
                 "trigger_start_time": self.trigger_start_time,
                 "on_finish_action": self.on_finish_action.value,
                 "last_log_time": self.last_log_time,
@@ -143,7 +148,12 @@ class KubernetesPodTrigger(BaseTrigger):
 
     async def run(self) -> AsyncIterator[TriggerEvent]:
         """Get current pod status and yield a TriggerEvent."""
-        self.log.info("Checking pod %r in namespace %r.", self.pod_name, self.pod_namespace)
+        self.log.info(
+            "Checking pod %r in namespace %r with poll interval %r.",
+            self.pod_name,
+            self.pod_namespace,
+            self.poll_interval,
+        )
         try:
             state = await self._wait_for_pod_start()
             if state == ContainerState.TERMINATED:
@@ -182,7 +192,28 @@ class KubernetesPodTrigger(BaseTrigger):
                 }
             )
             return
+        except KubernetesApiPermissionError as e:
+            message = (
+                "Kubernetes API permission error: The triggerer may not have sufficient permissions to monitor or delete pods. "
+                "Please ensure the triggerer's service account is included in the 'pod-launcher-role' as defined in the latest Airflow Helm chart. "
+                f"Original error: {e}"
+            )
+            yield TriggerEvent(
+                {
+                    "name": self.pod_name,
+                    "namespace": self.pod_namespace,
+                    "status": "error",
+                    "message": message,
+                    **self.trigger_kwargs,
+                }
+            )
+            return
         except Exception as e:
+            self.log.exception(
+                "Unexpected error while waiting for pod %s in namespace %s",
+                self.pod_name,
+                self.pod_namespace,
+            )
             yield TriggerEvent(
                 {
                     "name": self.pod_name,
@@ -209,17 +240,16 @@ class KubernetesPodTrigger(BaseTrigger):
 
     async def _wait_for_pod_start(self) -> ContainerState:
         """Loops until pod phase leaves ``PENDING`` If timeout is reached, throws error."""
-        while True:
-            pod = await self._get_pod()
-            if not pod.status.phase == "Pending":
-                return self.define_container_state(pod)
-
-            delta = datetime.datetime.now(tz=datetime.timezone.utc) - self.trigger_start_time
-            if self.startup_timeout < delta.total_seconds():
-                raise PodLaunchTimeoutException("Pod did not leave 'Pending' phase within specified timeout")
-
-            self.log.info("Still waiting for pod to start. The pod state is %s", pod.status.phase)
-            await asyncio.sleep(self.startup_check_interval)
+        pod = await self._get_pod()
+        events_task = self.pod_manager.watch_pod_events(pod, self.startup_check_interval)
+        pod_start_task = self.pod_manager.await_pod_start(
+            pod=pod,
+            schedule_timeout=self.schedule_timeout,
+            startup_timeout=self.startup_timeout,
+            check_interval=self.startup_check_interval,
+        )
+        await asyncio.gather(pod_start_task, events_task)
+        return self.define_container_state(await self._get_pod())
 
     async def _wait_for_container_completion(self) -> TriggerEvent:
         """
@@ -257,16 +287,14 @@ class KubernetesPodTrigger(BaseTrigger):
                     }
                 )
             self.log.debug("Container is not completed and still working.")
-            if time_get_more_logs and datetime.datetime.now(tz=datetime.timezone.utc) > time_get_more_logs:
-                return TriggerEvent(
-                    {
-                        "status": "running",
-                        "last_log_time": self.last_log_time,
-                        "namespace": self.pod_namespace,
-                        "name": self.pod_name,
-                        **self.trigger_kwargs,
-                    }
-                )
+            now = datetime.datetime.now(tz=datetime.timezone.utc)
+            if time_get_more_logs and now >= time_get_more_logs:
+                if self.get_logs and self.logging_interval:
+                    self.last_log_time = await self.pod_manager.fetch_container_logs_before_current_sec(
+                        pod, container_name=self.base_container_name, since_time=self.last_log_time
+                    )
+                time_get_more_logs = now + datetime.timedelta(seconds=self.logging_interval)
+
             self.log.debug("Sleeping for %s seconds.", self.poll_interval)
             await asyncio.sleep(self.poll_interval)
 
@@ -287,6 +315,10 @@ class KubernetesPodTrigger(BaseTrigger):
             cluster_context=self.cluster_context,
         )
 
+    @cached_property
+    def pod_manager(self) -> AsyncPodManager:
+        return AsyncPodManager(async_hook=self.hook)
+
     def define_container_state(self, pod: V1Pod) -> ContainerState:
         pod_containers = pod.status.container_statuses
 