apache-airflow-providers-cncf-kubernetes 10.10.0rc1__py3-none-any.whl → 10.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. airflow/providers/cncf/kubernetes/__init__.py +3 -3
  2. airflow/providers/cncf/kubernetes/backcompat/backwards_compat_converters.py +1 -1
  3. airflow/providers/cncf/kubernetes/callbacks.py +1 -1
  4. airflow/providers/cncf/kubernetes/decorators/kubernetes.py +8 -3
  5. airflow/providers/cncf/kubernetes/decorators/kubernetes_cmd.py +6 -3
  6. airflow/providers/cncf/kubernetes/exceptions.py +7 -3
  7. airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +1 -2
  8. airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py +1 -1
  9. airflow/providers/cncf/kubernetes/hooks/kubernetes.py +118 -18
  10. airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +65 -20
  11. airflow/providers/cncf/kubernetes/operators/custom_object_launcher.py +1 -1
  12. airflow/providers/cncf/kubernetes/operators/job.py +13 -7
  13. airflow/providers/cncf/kubernetes/operators/kueue.py +1 -1
  14. airflow/providers/cncf/kubernetes/operators/pod.py +86 -34
  15. airflow/providers/cncf/kubernetes/operators/resource.py +3 -9
  16. airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +20 -9
  17. airflow/providers/cncf/kubernetes/resource_convert/env_variable.py +1 -1
  18. airflow/providers/cncf/kubernetes/sensors/spark_kubernetes.py +2 -3
  19. airflow/providers/cncf/kubernetes/template_rendering.py +1 -1
  20. airflow/providers/cncf/kubernetes/triggers/pod.py +23 -8
  21. airflow/providers/cncf/kubernetes/utils/pod_manager.py +98 -86
  22. airflow/providers/cncf/kubernetes/version_compat.py +5 -1
  23. {apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.12.0.dist-info}/METADATA +12 -10
  24. {apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.12.0.dist-info}/RECORD +28 -28
  25. {apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.12.0.dist-info}/WHEEL +0 -0
  26. {apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.12.0.dist-info}/entry_points.txt +0 -0
  27. {apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.12.0.dist-info}/licenses/LICENSE +0 -0
  28. {apache_airflow_providers_cncf_kubernetes-10.10.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.12.0.dist-info}/licenses/NOTICE +0 -0
@@ -29,11 +29,11 @@ from airflow import __version__ as airflow_version
29
29
 
30
30
  __all__ = ["__version__"]
31
31
 
32
- __version__ = "10.10.0"
32
+ __version__ = "10.12.0"
33
33
 
34
34
  if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
35
- "2.10.0"
35
+ "2.11.0"
36
36
  ):
37
37
  raise RuntimeError(
38
- f"The package `apache-airflow-providers-cncf-kubernetes:{__version__}` needs Apache Airflow 2.10.0+"
38
+ f"The package `apache-airflow-providers-cncf-kubernetes:{__version__}` needs Apache Airflow 2.11.0+"
39
39
  )
@@ -20,7 +20,7 @@ from __future__ import annotations
20
20
 
21
21
  from kubernetes.client import ApiClient, models as k8s
22
22
 
23
- from airflow.exceptions import AirflowException
23
+ from airflow.providers.common.compat.sdk import AirflowException
24
24
 
25
25
 
26
26
  def _convert_kube_model_object(obj, new_class):
@@ -24,7 +24,7 @@ import kubernetes_asyncio.client as async_k8s
24
24
 
25
25
  if TYPE_CHECKING:
26
26
  from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
27
- from airflow.utils.context import Context
27
+ from airflow.sdk import Context
28
28
 
29
29
  client_type: TypeAlias = k8s.CoreV1Api | async_k8s.CoreV1Api
30
30
 
@@ -38,7 +38,7 @@ from airflow.providers.common.compat.sdk import (
38
38
  )
39
39
 
40
40
  if TYPE_CHECKING:
41
- from airflow.utils.context import Context
41
+ from airflow.sdk import Context
42
42
 
43
43
  _PYTHON_SCRIPT_ENV = "__PYTHON_SCRIPT"
44
44
  _PYTHON_INPUT_ENV = "__PYTHON_INPUT"
@@ -87,7 +87,13 @@ class _KubernetesDecoratedOperator(DecoratedOperator, KubernetesPodOperator):
87
87
  def _generate_cmds(self) -> list[str]:
88
88
  script_filename = "/tmp/script.py"
89
89
  input_filename = "/tmp/script.in"
90
- output_filename = "/airflow/xcom/return.json"
90
+
91
+ if getattr(self, "do_xcom_push", False):
92
+ output_filename = "/airflow/xcom/return.json"
93
+ make_xcom_dir_cmd = "mkdir -p /airflow/xcom"
94
+ else:
95
+ output_filename = "/dev/null"
96
+ make_xcom_dir_cmd = ":" # shell no-op
91
97
 
92
98
  write_local_script_file_cmd = (
93
99
  f"{_generate_decoded_command(quote(_PYTHON_SCRIPT_ENV), quote(script_filename))}"
@@ -95,7 +101,6 @@ class _KubernetesDecoratedOperator(DecoratedOperator, KubernetesPodOperator):
95
101
  write_local_input_file_cmd = (
96
102
  f"{_generate_decoded_command(quote(_PYTHON_INPUT_ENV), quote(input_filename))}"
97
103
  )
98
- make_xcom_dir_cmd = "mkdir -p /airflow/xcom"
99
104
  exec_python_cmd = f"python {script_filename} {input_filename} {output_filename}"
100
105
  return [
101
106
  "bash",
@@ -30,13 +30,14 @@ from airflow.providers.common.compat.sdk import (
30
30
  from airflow.utils.operator_helpers import determine_kwargs
31
31
 
32
32
  if TYPE_CHECKING:
33
- from airflow.utils.context import Context
33
+ from airflow.sdk import Context
34
34
 
35
35
 
36
36
  class _KubernetesCmdDecoratedOperator(DecoratedOperator, KubernetesPodOperator):
37
37
  custom_operator_name = "@task.kubernetes_cmd"
38
38
 
39
- template_fields: Sequence[str] = KubernetesPodOperator.template_fields
39
+ template_fields: Sequence[str] = tuple({"op_args", "op_kwargs", *KubernetesPodOperator.template_fields})
40
+
40
41
  overwrite_rtif_after_execution: bool = True
41
42
 
42
43
  def __init__(self, *, python_callable: Callable, args_only: bool = False, **kwargs) -> None:
@@ -69,6 +70,8 @@ class _KubernetesCmdDecoratedOperator(DecoratedOperator, KubernetesPodOperator):
69
70
  )
70
71
 
71
72
  def execute(self, context: Context):
73
+ self.render_template_fields(context)
74
+
72
75
  generated = self._generate_cmds(context)
73
76
  if self.args_only:
74
77
  self.cmds = []
@@ -76,7 +79,7 @@ class _KubernetesCmdDecoratedOperator(DecoratedOperator, KubernetesPodOperator):
76
79
  else:
77
80
  self.cmds = generated
78
81
  self.arguments = []
79
- context["ti"].render_templates() # type: ignore[attr-defined]
82
+ self.render_template_fields(context)
80
83
  return super().execute(context)
81
84
 
82
85
  def _generate_cmds(self, context: Context) -> list[str]:
@@ -16,9 +16,13 @@
16
16
  # under the License.
17
17
  from __future__ import annotations
18
18
 
19
- from airflow.exceptions import (
20
- AirflowException,
21
- )
19
+ from airflow.exceptions import AirflowException
20
+
21
+ # Todo: we cannot have a backcompat import for AirflowException yet
22
+ # because PodMutationHookException is redefined in airflow.exceptions
23
+ # Remove this and either import AirflowException from common.sdk or
24
+ # import it from airflow.sdk.exceptions when PodMutationHookException
25
+ # is removed from airflow.exceptions
22
26
 
23
27
 
24
28
  class PodMutationHookException(AirflowException):
@@ -30,7 +30,6 @@ import logging
30
30
  import multiprocessing
31
31
  import time
32
32
  from collections import Counter, defaultdict
33
- from collections.abc import Sequence
34
33
  from contextlib import suppress
35
34
  from datetime import datetime
36
35
  from queue import Empty, Queue
@@ -71,7 +70,7 @@ from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_types impor
71
70
  )
72
71
  from airflow.providers.cncf.kubernetes.kube_config import KubeConfig
73
72
  from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import annotations_to_key
74
- from airflow.stats import Stats
73
+ from airflow.providers.common.compat.sdk import Stats
75
74
  from airflow.utils.log.logging_mixin import remove_escape_codes
76
75
  from airflow.utils.session import NEW_SESSION, provide_session
77
76
  from airflow.utils.state import TaskInstanceState
@@ -27,7 +27,6 @@ from kubernetes import client, watch
27
27
  from kubernetes.client.rest import ApiException
28
28
  from urllib3.exceptions import ReadTimeoutError
29
29
 
30
- from airflow.exceptions import AirflowException
31
30
  from airflow.providers.cncf.kubernetes.backcompat import get_logical_date_key
32
31
  from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_types import (
33
32
  ADOPTED,
@@ -46,6 +45,7 @@ from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import (
46
45
  create_unique_id,
47
46
  )
48
47
  from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator, workload_to_command_args
48
+ from airflow.providers.common.compat.sdk import AirflowException
49
49
  from airflow.utils.log.logging_mixin import LoggingMixin
50
50
  from airflow.utils.singleton import Singleton
51
51
  from airflow.utils.state import TaskInstanceState
@@ -20,36 +20,35 @@ import asyncio
20
20
  import contextlib
21
21
  import json
22
22
  import tempfile
23
- from collections.abc import Generator
24
23
  from functools import cached_property
25
24
  from time import sleep
26
25
  from typing import TYPE_CHECKING, Any, Protocol
27
26
 
28
27
  import aiofiles
29
28
  import requests
30
- import tenacity
31
29
  from asgiref.sync import sync_to_async
32
30
  from kubernetes import client, config, utils, watch
33
31
  from kubernetes.client.models import V1Deployment
34
32
  from kubernetes.config import ConfigException
35
- from kubernetes_asyncio import client as async_client, config as async_config
33
+ from kubernetes_asyncio import client as async_client, config as async_config, watch as async_watch
36
34
  from urllib3.exceptions import HTTPError
37
35
 
38
- from airflow.exceptions import AirflowException, AirflowNotFoundException
39
36
  from airflow.models import Connection
40
37
  from airflow.providers.cncf.kubernetes.exceptions import KubernetesApiError, KubernetesApiPermissionError
41
38
  from airflow.providers.cncf.kubernetes.kube_client import _disable_verify_ssl, _enable_tcp_keepalive
42
- from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import should_retry_creation
39
+ from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import generic_api_retry
43
40
  from airflow.providers.cncf.kubernetes.utils.container import (
44
41
  container_is_completed,
45
42
  container_is_running,
46
43
  )
47
- from airflow.providers.common.compat.sdk import BaseHook
44
+ from airflow.providers.common.compat.sdk import AirflowException, AirflowNotFoundException, BaseHook
48
45
  from airflow.utils import yaml
49
46
 
50
47
  if TYPE_CHECKING:
48
+ from collections.abc import AsyncGenerator, Generator
49
+
51
50
  from kubernetes.client import V1JobList
52
- from kubernetes.client.models import CoreV1EventList, V1Job, V1Pod
51
+ from kubernetes.client.models import CoreV1Event, CoreV1EventList, V1Job, V1Pod
53
52
 
54
53
  LOADING_KUBE_CONFIG_FILE_RESOURCE = "Loading Kubernetes configuration file kube_config from {}..."
55
54
 
@@ -390,6 +389,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
390
389
  self.log.debug("Response: %s", response)
391
390
  return response
392
391
 
392
+ @generic_api_retry
393
393
  def get_custom_object(
394
394
  self, group: str, version: str, plural: str, name: str, namespace: str | None = None
395
395
  ):
@@ -412,6 +412,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
412
412
  )
413
413
  return response
414
414
 
415
+ @generic_api_retry
415
416
  def delete_custom_object(
416
417
  self, group: str, version: str, plural: str, name: str, namespace: str | None = None, **kwargs
417
418
  ):
@@ -540,12 +541,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
540
541
  name=name, namespace=namespace, pretty=True, **kwargs
541
542
  )
542
543
 
543
- @tenacity.retry(
544
- stop=tenacity.stop_after_attempt(3),
545
- wait=tenacity.wait_random_exponential(),
546
- reraise=True,
547
- retry=tenacity.retry_if_exception(should_retry_creation),
548
- )
544
+ @generic_api_retry
549
545
  def create_job(
550
546
  self,
551
547
  job: V1Job,
@@ -572,6 +568,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
572
568
  raise e
573
569
  return resp
574
570
 
571
+ @generic_api_retry
575
572
  def get_job(self, job_name: str, namespace: str) -> V1Job:
576
573
  """
577
574
  Get Job of specified name and namespace.
@@ -582,6 +579,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
582
579
  """
583
580
  return self.batch_v1_client.read_namespaced_job(name=job_name, namespace=namespace, pretty=True)
584
581
 
582
+ @generic_api_retry
585
583
  def get_job_status(self, job_name: str, namespace: str) -> V1Job:
586
584
  """
587
585
  Get job with status of specified name and namespace.
@@ -611,6 +609,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
611
609
  self.log.info("The job '%s' is incomplete. Sleeping for %i sec.", job_name, job_poll_interval)
612
610
  sleep(job_poll_interval)
613
611
 
612
+ @generic_api_retry
614
613
  def list_jobs_all_namespaces(self) -> V1JobList:
615
614
  """
616
615
  Get list of Jobs from all namespaces.
@@ -619,6 +618,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
619
618
  """
620
619
  return self.batch_v1_client.list_job_for_all_namespaces(pretty=True)
621
620
 
621
+ @generic_api_retry
622
622
  def list_jobs_from_namespace(self, namespace: str) -> V1JobList:
623
623
  """
624
624
  Get list of Jobs from dedicated namespace.
@@ -674,6 +674,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
674
674
  return bool(next((c for c in conditions if c.type == "Complete" and c.status), None))
675
675
  return False
676
676
 
677
+ @generic_api_retry
677
678
  def patch_namespaced_job(self, job_name: str, namespace: str, body: object) -> V1Job:
678
679
  """
679
680
  Update the specified Job.
@@ -777,11 +778,14 @@ def _get_bool(val) -> bool | None:
777
778
  class AsyncKubernetesHook(KubernetesHook):
778
779
  """Hook to use Kubernetes SDK asynchronously."""
779
780
 
780
- def __init__(self, config_dict: dict | None = None, *args, **kwargs):
781
+ def __init__(
782
+ self, config_dict: dict | None = None, connection_extras: dict | None = None, *args, **kwargs
783
+ ):
781
784
  super().__init__(*args, **kwargs)
782
785
 
783
786
  self.config_dict = config_dict
784
- self._extras: dict | None = None
787
+ self._extras: dict | None = connection_extras
788
+ self._event_polling_fallback = False
785
789
 
786
790
  async def _load_config(self):
787
791
  """Return Kubernetes API session for use with requests."""
@@ -831,6 +835,13 @@ class AsyncKubernetesHook(KubernetesHook):
831
835
  "Reading kubernetes configuration file from connection "
832
836
  "object and writing temporary config file with its content",
833
837
  )
838
+ if isinstance(kubeconfig, dict):
839
+ self.log.debug(
840
+ LOADING_KUBE_CONFIG_FILE_RESOURCE.format(
841
+ "connection kube_config dictionary (serializing)"
842
+ )
843
+ )
844
+ kubeconfig = json.dumps(kubeconfig)
834
845
  await temp_config.write(kubeconfig.encode())
835
846
  await temp_config.flush()
836
847
  self._is_in_cluster = False
@@ -872,6 +883,7 @@ class AsyncKubernetesHook(KubernetesHook):
872
883
  if kube_client is not None:
873
884
  await kube_client.close()
874
885
 
886
+ @generic_api_retry
875
887
  async def get_pod(self, name: str, namespace: str) -> V1Pod:
876
888
  """
877
889
  Get pod's object.
@@ -892,6 +904,7 @@ class AsyncKubernetesHook(KubernetesHook):
892
904
  raise KubernetesApiPermissionError("Permission denied (403) from Kubernetes API.") from e
893
905
  raise KubernetesApiError from e
894
906
 
907
+ @generic_api_retry
895
908
  async def delete_pod(self, name: str, namespace: str):
896
909
  """
897
910
  Delete pod's object.
@@ -910,6 +923,7 @@ class AsyncKubernetesHook(KubernetesHook):
910
923
  if str(e.status) != "404":
911
924
  raise
912
925
 
926
+ @generic_api_retry
913
927
  async def read_logs(
914
928
  self, name: str, namespace: str, container_name: str | None = None, since_seconds: int | None = None
915
929
  ) -> list[str]:
@@ -932,7 +946,7 @@ class AsyncKubernetesHook(KubernetesHook):
932
946
  logs = await v1_api.read_namespaced_pod_log(
933
947
  name=name,
934
948
  namespace=namespace,
935
- container_name=container_name,
949
+ container=container_name,
936
950
  follow=False,
937
951
  timestamps=True,
938
952
  since_seconds=since_seconds,
@@ -942,14 +956,25 @@ class AsyncKubernetesHook(KubernetesHook):
942
956
  except HTTPError as e:
943
957
  raise KubernetesApiError from e
944
958
 
945
- async def get_pod_events(self, name: str, namespace: str) -> CoreV1EventList:
946
- """Get pod's events."""
959
+ @generic_api_retry
960
+ async def get_pod_events(
961
+ self, name: str, namespace: str, resource_version: str | None = None
962
+ ) -> CoreV1EventList:
963
+ """
964
+ Get pod events.
965
+
966
+ :param name: Pod name to get events for
967
+ :param namespace: Kubernetes namespace
968
+ :param resource_version: Only return events not older than this resource version
969
+ """
947
970
  async with self.get_conn() as connection:
948
971
  try:
949
972
  v1_api = async_client.CoreV1Api(connection)
950
973
  events: CoreV1EventList = await v1_api.list_namespaced_event(
951
974
  field_selector=f"involvedObject.name={name}",
952
975
  namespace=namespace,
976
+ resource_version=resource_version,
977
+ resource_version_match="NotOlderThan" if resource_version else None,
953
978
  )
954
979
  return events
955
980
  except HTTPError as e:
@@ -957,6 +982,81 @@ class AsyncKubernetesHook(KubernetesHook):
957
982
  raise KubernetesApiPermissionError("Permission denied (403) from Kubernetes API.") from e
958
983
  raise KubernetesApiError from e
959
984
 
985
+ @generic_api_retry
986
+ async def watch_pod_events(
987
+ self,
988
+ name: str,
989
+ namespace: str,
990
+ resource_version: str | None = None,
991
+ timeout_seconds: int = 30,
992
+ ) -> AsyncGenerator[CoreV1Event]:
993
+ """
994
+ Watch pod events using Kubernetes Watch API.
995
+
996
+ :param name: Pod name to watch events for
997
+ :param namespace: Kubernetes namespace
998
+ :param resource_version: Only return events not older than this resource version
999
+ :param timeout_seconds: Timeout in seconds for the watch stream
1000
+ """
1001
+ if self._event_polling_fallback:
1002
+ async for event_polled in self.watch_pod_events_polling_fallback(
1003
+ name, namespace, resource_version, timeout_seconds
1004
+ ):
1005
+ yield event_polled
1006
+
1007
+ try:
1008
+ w = async_watch.Watch()
1009
+ async with self.get_conn() as connection:
1010
+ v1_api = async_client.CoreV1Api(connection)
1011
+
1012
+ async for event_watched in w.stream(
1013
+ v1_api.list_namespaced_event,
1014
+ namespace=namespace,
1015
+ field_selector=f"involvedObject.name={name}",
1016
+ resource_version=resource_version,
1017
+ timeout_seconds=timeout_seconds,
1018
+ ):
1019
+ event: CoreV1Event = event_watched.get("object")
1020
+ yield event
1021
+
1022
+ except async_client.exceptions.ApiException as e:
1023
+ if hasattr(e, "status") and e.status == 403:
1024
+ self.log.warning(
1025
+ "Triggerer does not have Kubernetes API permission to 'watch' events: %s Falling back to polling.",
1026
+ str(e),
1027
+ )
1028
+ self._event_polling_fallback = True
1029
+ async for event_polled in self.watch_pod_events_polling_fallback(
1030
+ name, namespace, resource_version, timeout_seconds
1031
+ ):
1032
+ yield event_polled
1033
+
1034
+ finally:
1035
+ w.stop()
1036
+
1037
+ async def watch_pod_events_polling_fallback(
1038
+ self,
1039
+ name: str,
1040
+ namespace: str,
1041
+ resource_version: str | None = None,
1042
+ interval: int = 30,
1043
+ ) -> AsyncGenerator[CoreV1Event]:
1044
+ """
1045
+ Fallback method to poll pod events at regular intervals.
1046
+
1047
+ This is required when the Airflow triggerer does not have permission to watch events.
1048
+
1049
+ :param name: Pod name to watch events for
1050
+ :param namespace: Kubernetes namespace
1051
+ :param resource_version: Only return events not older than this resource version
1052
+ :param interval: Polling interval in seconds
1053
+ """
1054
+ events: CoreV1EventList = await self.get_pod_events(name, namespace, resource_version)
1055
+ for event in events.items:
1056
+ yield event
1057
+ await asyncio.sleep(interval)
1058
+
1059
+ @generic_api_retry
960
1060
  async def get_job_status(self, name: str, namespace: str) -> V1Job:
961
1061
  """
962
1062
  Get job's status object.
@@ -23,11 +23,16 @@ from functools import cache
23
23
  from typing import TYPE_CHECKING
24
24
 
25
25
  import pendulum
26
- from kubernetes.client.rest import ApiException
26
+ import tenacity
27
+ from kubernetes.client.rest import ApiException as SyncApiException
28
+ from kubernetes_asyncio.client.exceptions import ApiException as AsyncApiException
27
29
  from slugify import slugify
30
+ from sqlalchemy import select
31
+ from urllib3.exceptions import HTTPError
28
32
 
29
33
  from airflow.configuration import conf
30
34
  from airflow.providers.cncf.kubernetes.backcompat import get_logical_date_key
35
+ from airflow.providers.common.compat.sdk import AirflowException
31
36
 
32
37
  if TYPE_CHECKING:
33
38
  from airflow.models.taskinstancekey import TaskInstanceKey
@@ -39,6 +44,62 @@ alphanum_lower = string.ascii_lowercase + string.digits
39
44
  POD_NAME_MAX_LENGTH = 63 # Matches Linux kernel's HOST_NAME_MAX default value minus 1.
40
45
 
41
46
 
47
+ class PodLaunchFailedException(AirflowException):
48
+ """When pod launching fails in KubernetesPodOperator."""
49
+
50
+
51
+ class KubernetesApiException(AirflowException):
52
+ """When communication with kubernetes API fails."""
53
+
54
+
55
+ API_RETRIES = conf.getint("workers", "api_retries", fallback=5)
56
+ API_RETRY_WAIT_MIN = conf.getfloat("workers", "api_retry_wait_min", fallback=1)
57
+ API_RETRY_WAIT_MAX = conf.getfloat("workers", "api_retry_wait_max", fallback=15)
58
+
59
+ _default_wait = tenacity.wait_exponential(min=API_RETRY_WAIT_MIN, max=API_RETRY_WAIT_MAX)
60
+
61
+ TRANSIENT_STATUS_CODES = {409, 429, 500, 502, 503, 504}
62
+
63
+
64
+ def _should_retry_api(exc: BaseException) -> bool:
65
+ """Retry on selected ApiException status codes, plus plain HTTP/timeout errors."""
66
+ if isinstance(exc, (SyncApiException, AsyncApiException)):
67
+ return exc.status in TRANSIENT_STATUS_CODES
68
+ return isinstance(exc, (HTTPError, KubernetesApiException))
69
+
70
+
71
+ class WaitRetryAfterOrExponential(tenacity.wait.wait_base):
72
+ """Wait strategy that honors Retry-After header on 429, else falls back to exponential backoff."""
73
+
74
+ def __call__(self, retry_state):
75
+ exc = retry_state.outcome.exception() if retry_state.outcome else None
76
+ if isinstance(exc, (SyncApiException, AsyncApiException)) and exc.status == 429:
77
+ retry_after = (exc.headers or {}).get("Retry-After")
78
+ if retry_after:
79
+ try:
80
+ return float(int(retry_after))
81
+ except ValueError:
82
+ pass
83
+ # Inline exponential fallback
84
+ return _default_wait(retry_state)
85
+
86
+
87
+ def generic_api_retry(func):
88
+ """
89
+ Retry calls to the Kubernetes API.
90
+
91
+ - Retries only transient ApiException status codes.
92
+ - Honors Retry-After on 429.
93
+ """
94
+ return tenacity.retry(
95
+ stop=tenacity.stop_after_attempt(API_RETRIES),
96
+ wait=WaitRetryAfterOrExponential(),
97
+ retry=tenacity.retry_if_exception(_should_retry_api),
98
+ reraise=True,
99
+ before_sleep=tenacity.before_sleep_log(log, logging.WARNING),
100
+ )(func)
101
+
102
+
42
103
  def rand_str(num):
43
104
  """
44
105
  Generate random lowercase alphanumeric string of length num.
@@ -115,15 +176,14 @@ def annotations_to_key(annotations: dict[str, str]) -> TaskInstanceKey:
115
176
  raise RuntimeError("Session not configured. Call configure_orm() first.")
116
177
  session = Session()
117
178
 
118
- task_instance_run_id = (
119
- session.query(TaskInstance.run_id)
179
+ task_instance_run_id = session.scalar(
180
+ select(TaskInstance.run_id)
120
181
  .join(TaskInstance.dag_run)
121
- .filter(
182
+ .where(
122
183
  TaskInstance.dag_id == dag_id,
123
184
  TaskInstance.task_id == task_id,
124
185
  getattr(DagRun, logical_date_key) == logical_date,
125
186
  )
126
- .scalar()
127
187
  )
128
188
  else:
129
189
  task_instance_run_id = annotation_run_id
@@ -148,18 +208,3 @@ def annotations_for_logging_task_metadata(annotation_set):
148
208
  else:
149
209
  annotations_for_logging = "<omitted>"
150
210
  return annotations_for_logging
151
-
152
-
153
- def should_retry_creation(exception: BaseException) -> bool:
154
- """
155
- Check if an Exception indicates a transient error and warrants retrying.
156
-
157
- This function is needed for preventing 'No agent available' error. The error appears time to time
158
- when users try to create a Resource or Job. This issue is inside kubernetes and in the current moment
159
- has no solution. Like a temporary solution we decided to retry Job or Resource creation request each
160
- time when this error appears.
161
- More about this issue here: https://github.com/cert-manager/cert-manager/issues/6457
162
- """
163
- if isinstance(exception, ApiException):
164
- return str(exception.status) == "500"
165
- return False
@@ -28,7 +28,6 @@ import tenacity
28
28
  from kubernetes.client import CoreV1Api, CustomObjectsApi, models as k8s
29
29
  from kubernetes.client.rest import ApiException
30
30
 
31
- from airflow.exceptions import AirflowException
32
31
  from airflow.providers.cncf.kubernetes.resource_convert.configmap import (
33
32
  convert_configmap,
34
33
  convert_configmap_to_volume,
@@ -39,6 +38,7 @@ from airflow.providers.cncf.kubernetes.resource_convert.secret import (
39
38
  convert_secret,
40
39
  )
41
40
  from airflow.providers.cncf.kubernetes.utils.pod_manager import PodManager
41
+ from airflow.providers.common.compat.sdk import AirflowException
42
42
  from airflow.utils.log.logging_mixin import LoggingMixin
43
43
 
44
44
 
@@ -32,9 +32,10 @@ from kubernetes.client.api_client import ApiClient
32
32
  from kubernetes.client.rest import ApiException
33
33
 
34
34
  from airflow.configuration import conf
35
- from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
35
+ from airflow.exceptions import AirflowProviderDeprecationWarning
36
36
  from airflow.providers.cncf.kubernetes.hooks.kubernetes import KubernetesHook
37
37
  from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import (
38
+ POD_NAME_MAX_LENGTH,
38
39
  add_unique_suffix,
39
40
  create_unique_id,
40
41
  )
@@ -43,19 +44,21 @@ from airflow.providers.cncf.kubernetes.pod_generator import PodGenerator, merge_
43
44
  from airflow.providers.cncf.kubernetes.triggers.job import KubernetesJobTrigger
44
45
  from airflow.providers.cncf.kubernetes.utils.pod_manager import EMPTY_XCOM_RESULT, PodNotFoundException
45
46
  from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_1_PLUS
47
+ from airflow.providers.common.compat.sdk import AirflowException
48
+ from airflow.utils import yaml
46
49
 
47
50
  if AIRFLOW_V_3_1_PLUS:
48
51
  from airflow.sdk import BaseOperator
49
52
  else:
50
53
  from airflow.models import BaseOperator
51
- from airflow.utils import yaml
52
- from airflow.utils.context import Context
53
54
 
54
55
  if TYPE_CHECKING:
55
- from airflow.utils.context import Context
56
+ from airflow.sdk import Context
56
57
 
57
58
  log = logging.getLogger(__name__)
58
59
 
60
+ JOB_NAME_PREFIX = "job-"
61
+
59
62
 
60
63
  class KubernetesJobOperator(KubernetesPodOperator):
61
64
  """
@@ -378,15 +381,18 @@ class KubernetesJobOperator(KubernetesPodOperator):
378
381
 
379
382
  job = self.reconcile_jobs(job_template, job)
380
383
 
384
+ # Account for job name prefix when generating/truncating the name
385
+ max_base_length = POD_NAME_MAX_LENGTH - len(JOB_NAME_PREFIX)
386
+
381
387
  if not job.metadata.name:
382
388
  job.metadata.name = create_unique_id(
383
- task_id=self.task_id, unique=self.random_name_suffix, max_length=80
389
+ task_id=self.task_id, unique=self.random_name_suffix, max_length=max_base_length
384
390
  )
385
391
  elif self.random_name_suffix:
386
392
  # user has supplied job name, we're just adding suffix
387
- job.metadata.name = add_unique_suffix(name=job.metadata.name)
393
+ job.metadata.name = add_unique_suffix(name=job.metadata.name, max_len=max_base_length)
388
394
 
389
- job.metadata.name = f"job-{job.metadata.name}"
395
+ job.metadata.name = f"{JOB_NAME_PREFIX}{job.metadata.name}"
390
396
 
391
397
  if not job.metadata.namespace:
392
398
  hook_namespace = self.hook.get_namespace()
@@ -24,10 +24,10 @@ from functools import cached_property
24
24
 
25
25
  from kubernetes.utils import FailToCreateError
26
26
 
27
- from airflow.exceptions import AirflowException
28
27
  from airflow.providers.cncf.kubernetes.hooks.kubernetes import KubernetesHook
29
28
  from airflow.providers.cncf.kubernetes.operators.job import KubernetesJobOperator
30
29
  from airflow.providers.cncf.kubernetes.version_compat import AIRFLOW_V_3_1_PLUS
30
+ from airflow.providers.common.compat.sdk import AirflowException
31
31
 
32
32
  if AIRFLOW_V_3_1_PLUS:
33
33
  from airflow.sdk import BaseOperator