apache-airflow-providers-cncf-kubernetes 10.9.0rc1__py3-none-any.whl → 10.11.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. airflow/providers/cncf/kubernetes/__init__.py +3 -3
  2. airflow/providers/cncf/kubernetes/exceptions.py +9 -3
  3. airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +24 -5
  4. airflow/providers/cncf/kubernetes/get_provider_info.py +6 -0
  5. airflow/providers/cncf/kubernetes/hooks/kubernetes.py +58 -21
  6. airflow/providers/cncf/kubernetes/kube_config.py +24 -1
  7. airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +63 -16
  8. airflow/providers/cncf/kubernetes/operators/job.py +9 -3
  9. airflow/providers/cncf/kubernetes/operators/pod.py +36 -45
  10. airflow/providers/cncf/kubernetes/operators/resource.py +2 -8
  11. airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +18 -3
  12. airflow/providers/cncf/kubernetes/secret.py +3 -0
  13. airflow/providers/cncf/kubernetes/triggers/pod.py +56 -24
  14. airflow/providers/cncf/kubernetes/utils/pod_manager.py +256 -111
  15. airflow/providers/cncf/kubernetes/version_compat.py +5 -1
  16. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/METADATA +19 -17
  17. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/RECORD +21 -20
  18. apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info/licenses/NOTICE +5 -0
  19. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/WHEEL +0 -0
  20. {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/entry_points.txt +0 -0
  21. {airflow/providers/cncf/kubernetes → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info/licenses}/LICENSE +0 -0
@@ -29,11 +29,11 @@ from airflow import __version__ as airflow_version
29
29
 
30
30
  __all__ = ["__version__"]
31
31
 
32
- __version__ = "10.9.0"
32
+ __version__ = "10.11.0"
33
33
 
34
34
  if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
35
- "2.10.0"
35
+ "2.11.0"
36
36
  ):
37
37
  raise RuntimeError(
38
- f"The package `apache-airflow-providers-cncf-kubernetes:{__version__}` needs Apache Airflow 2.10.0+"
38
+ f"The package `apache-airflow-providers-cncf-kubernetes:{__version__}` needs Apache Airflow 2.11.0+"
39
39
  )
@@ -16,9 +16,7 @@
16
16
  # under the License.
17
17
  from __future__ import annotations
18
18
 
19
- from airflow.exceptions import (
20
- AirflowException,
21
- )
19
+ from airflow.exceptions import AirflowException
22
20
 
23
21
 
24
22
  class PodMutationHookException(AirflowException):
@@ -27,3 +25,11 @@ class PodMutationHookException(AirflowException):
27
25
 
28
26
  class PodReconciliationError(AirflowException):
29
27
  """Raised when an error is encountered while trying to merge pod configs."""
28
+
29
+
30
+ class KubernetesApiError(AirflowException):
31
+ """Raised when an error is encountered while trying to access the Kubernetes API."""
32
+
33
+
34
+ class KubernetesApiPermissionError(AirflowException):
35
+ """Raised when a permission (403) error is encountered while trying to access the Kubernetes API."""
@@ -165,6 +165,7 @@ class KubernetesExecutor(BaseExecutor):
165
165
  self.task_publish_max_retries = conf.getint(
166
166
  "kubernetes_executor", "task_publish_max_retries", fallback=0
167
167
  )
168
+ self.completed: set[KubernetesResults] = set()
168
169
  super().__init__(parallelism=self.kube_config.parallelism)
169
170
 
170
171
  def _list_pods(self, query_kwargs):
@@ -343,6 +344,9 @@ class KubernetesExecutor(BaseExecutor):
343
344
  finally:
344
345
  self.result_queue.task_done()
345
346
 
347
+ for result in self.completed:
348
+ self._change_state(result)
349
+
346
350
  from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import ResourceVersion
347
351
 
348
352
  resource_instance = ResourceVersion()
@@ -385,6 +389,7 @@ class KubernetesExecutor(BaseExecutor):
385
389
  if (
386
390
  (str(e.status) == "403" and "exceeded quota" in message)
387
391
  or (str(e.status) == "409" and "object has been modified" in message)
392
+ or str(e.status) == "500"
388
393
  ) and (self.task_publish_max_retries == -1 or retries < self.task_publish_max_retries):
389
394
  self.log.warning(
390
395
  "[Try %s of %s] Kube ApiException for Task: (%s). Reason: %r. Message: %s",
@@ -501,7 +506,11 @@ class KubernetesExecutor(BaseExecutor):
501
506
  if state is None:
502
507
  from airflow.models.taskinstance import TaskInstance
503
508
 
504
- state = session.scalar(select(TaskInstance.state).where(TaskInstance.filter_for_tis([key])))
509
+ filter_for_tis = TaskInstance.filter_for_tis([key])
510
+ if filter_for_tis is not None:
511
+ state = session.scalar(select(TaskInstance.state).where(filter_for_tis))
512
+ else:
513
+ state = None
505
514
  state = TaskInstanceState(state) if state else None
506
515
 
507
516
  self.event_buffer[key] = state, None
@@ -511,7 +520,8 @@ class KubernetesExecutor(BaseExecutor):
511
520
  pod_override = ti.executor_config.get("pod_override")
512
521
  namespace = None
513
522
  with suppress(Exception):
514
- namespace = pod_override.metadata.namespace
523
+ if pod_override is not None:
524
+ namespace = pod_override.metadata.namespace
515
525
  return namespace or conf.get("kubernetes_executor", "namespace")
516
526
 
517
527
  def get_task_log(self, ti: TaskInstance, try_number: int) -> tuple[list[str], list[str]]:
@@ -565,7 +575,7 @@ class KubernetesExecutor(BaseExecutor):
565
575
  tis_to_flush_by_key = {ti.key: ti for ti in tis if ti.queued_by_job_id}
566
576
  kube_client: client.CoreV1Api = self.kube_client
567
577
  for scheduler_job_id in scheduler_job_ids:
568
- scheduler_job_id = self._make_safe_label_value(str(scheduler_job_id))
578
+ scheduler_job_id_safe_label = self._make_safe_label_value(str(scheduler_job_id))
569
579
  # We will look for any pods owned by the no-longer-running scheduler,
570
580
  # but will exclude only successful pods, as those TIs will have a terminal state
571
581
  # and not be up for adoption!
@@ -575,7 +585,7 @@ class KubernetesExecutor(BaseExecutor):
575
585
  "field_selector": "status.phase!=Succeeded",
576
586
  "label_selector": (
577
587
  "kubernetes_executor=True,"
578
- f"airflow-worker={scheduler_job_id},{POD_EXECUTOR_DONE_KEY}!=True"
588
+ f"airflow-worker={scheduler_job_id_safe_label},{POD_EXECUTOR_DONE_KEY}!=True"
579
589
  ),
580
590
  }
581
591
  pod_list = self._list_pods(query_kwargs)
@@ -720,7 +730,16 @@ class KubernetesExecutor(BaseExecutor):
720
730
  continue
721
731
 
722
732
  ti_id = annotations_to_key(pod.metadata.annotations)
723
- self.running.add(ti_id)
733
+ self.completed.add(
734
+ KubernetesResults(
735
+ key=ti_id,
736
+ state="completed",
737
+ pod_name=pod.metadata.name,
738
+ namespace=pod.metadata.namespace,
739
+ resource_version=pod.metadata.resource_version,
740
+ failure_details=None,
741
+ )
742
+ )
724
743
 
725
744
  def _flush_task_queue(self) -> None:
726
745
  if TYPE_CHECKING:
@@ -135,6 +135,8 @@ def get_provider_info():
135
135
  "type": "string",
136
136
  "example": None,
137
137
  "default": "",
138
+ "deprecated": True,
139
+ "deprecation_reason": "This configuration is deprecated. Use `pod_template_file` to specify container image instead.\n",
138
140
  },
139
141
  "worker_container_tag": {
140
142
  "description": "The tag of the Kubernetes Image for the Worker to Run\n",
@@ -142,6 +144,8 @@ def get_provider_info():
142
144
  "type": "string",
143
145
  "example": None,
144
146
  "default": "",
147
+ "deprecated": True,
148
+ "deprecation_reason": "This configuration is deprecated. Use `pod_template_file` to specify the image tag instead.\n",
145
149
  },
146
150
  "namespace": {
147
151
  "description": "The Kubernetes namespace where airflow workers should be created. Defaults to ``default``\n",
@@ -149,6 +153,8 @@ def get_provider_info():
149
153
  "type": "string",
150
154
  "example": None,
151
155
  "default": "default",
156
+ "deprecated": True,
157
+ "deprecation_reason": "This configuration is deprecated. Use `pod_template_file` to specify namespace instead.\n",
152
158
  },
153
159
  "delete_worker_pods": {
154
160
  "description": "If True, all worker pods will be deleted upon termination\n",
@@ -27,7 +27,6 @@ from typing import TYPE_CHECKING, Any, Protocol
27
27
 
28
28
  import aiofiles
29
29
  import requests
30
- import tenacity
31
30
  from asgiref.sync import sync_to_async
32
31
  from kubernetes import client, config, utils, watch
33
32
  from kubernetes.client.models import V1Deployment
@@ -37,8 +36,9 @@ from urllib3.exceptions import HTTPError
37
36
 
38
37
  from airflow.exceptions import AirflowException, AirflowNotFoundException
39
38
  from airflow.models import Connection
39
+ from airflow.providers.cncf.kubernetes.exceptions import KubernetesApiError, KubernetesApiPermissionError
40
40
  from airflow.providers.cncf.kubernetes.kube_client import _disable_verify_ssl, _enable_tcp_keepalive
41
- from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import should_retry_creation
41
+ from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import generic_api_retry
42
42
  from airflow.providers.cncf.kubernetes.utils.container import (
43
43
  container_is_completed,
44
44
  container_is_running,
@@ -48,7 +48,7 @@ from airflow.utils import yaml
48
48
 
49
49
  if TYPE_CHECKING:
50
50
  from kubernetes.client import V1JobList
51
- from kubernetes.client.models import V1Job, V1Pod
51
+ from kubernetes.client.models import CoreV1EventList, V1Job, V1Pod
52
52
 
53
53
  LOADING_KUBE_CONFIG_FILE_RESOURCE = "Loading Kubernetes configuration file kube_config from {}..."
54
54
 
@@ -389,6 +389,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
389
389
  self.log.debug("Response: %s", response)
390
390
  return response
391
391
 
392
+ @generic_api_retry
392
393
  def get_custom_object(
393
394
  self, group: str, version: str, plural: str, name: str, namespace: str | None = None
394
395
  ):
@@ -411,6 +412,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
411
412
  )
412
413
  return response
413
414
 
415
+ @generic_api_retry
414
416
  def delete_custom_object(
415
417
  self, group: str, version: str, plural: str, name: str, namespace: str | None = None, **kwargs
416
418
  ):
@@ -539,12 +541,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
539
541
  name=name, namespace=namespace, pretty=True, **kwargs
540
542
  )
541
543
 
542
- @tenacity.retry(
543
- stop=tenacity.stop_after_attempt(3),
544
- wait=tenacity.wait_random_exponential(),
545
- reraise=True,
546
- retry=tenacity.retry_if_exception(should_retry_creation),
547
- )
544
+ @generic_api_retry
548
545
  def create_job(
549
546
  self,
550
547
  job: V1Job,
@@ -571,6 +568,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
571
568
  raise e
572
569
  return resp
573
570
 
571
+ @generic_api_retry
574
572
  def get_job(self, job_name: str, namespace: str) -> V1Job:
575
573
  """
576
574
  Get Job of specified name and namespace.
@@ -581,6 +579,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
581
579
  """
582
580
  return self.batch_v1_client.read_namespaced_job(name=job_name, namespace=namespace, pretty=True)
583
581
 
582
+ @generic_api_retry
584
583
  def get_job_status(self, job_name: str, namespace: str) -> V1Job:
585
584
  """
586
585
  Get job with status of specified name and namespace.
@@ -610,6 +609,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
610
609
  self.log.info("The job '%s' is incomplete. Sleeping for %i sec.", job_name, job_poll_interval)
611
610
  sleep(job_poll_interval)
612
611
 
612
+ @generic_api_retry
613
613
  def list_jobs_all_namespaces(self) -> V1JobList:
614
614
  """
615
615
  Get list of Jobs from all namespaces.
@@ -618,6 +618,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
618
618
  """
619
619
  return self.batch_v1_client.list_job_for_all_namespaces(pretty=True)
620
620
 
621
+ @generic_api_retry
621
622
  def list_jobs_from_namespace(self, namespace: str) -> V1JobList:
622
623
  """
623
624
  Get list of Jobs from dedicated namespace.
@@ -673,6 +674,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
673
674
  return bool(next((c for c in conditions if c.type == "Complete" and c.status), None))
674
675
  return False
675
676
 
677
+ @generic_api_retry
676
678
  def patch_namespaced_job(self, job_name: str, namespace: str, body: object) -> V1Job:
677
679
  """
678
680
  Update the specified Job.
@@ -830,6 +832,13 @@ class AsyncKubernetesHook(KubernetesHook):
830
832
  "Reading kubernetes configuration file from connection "
831
833
  "object and writing temporary config file with its content",
832
834
  )
835
+ if isinstance(kubeconfig, dict):
836
+ self.log.debug(
837
+ LOADING_KUBE_CONFIG_FILE_RESOURCE.format(
838
+ "connection kube_config dictionary (serializing)"
839
+ )
840
+ )
841
+ kubeconfig = json.dumps(kubeconfig)
833
842
  await temp_config.write(kubeconfig.encode())
834
843
  await temp_config.flush()
835
844
  self._is_in_cluster = False
@@ -871,6 +880,7 @@ class AsyncKubernetesHook(KubernetesHook):
871
880
  if kube_client is not None:
872
881
  await kube_client.close()
873
882
 
883
+ @generic_api_retry
874
884
  async def get_pod(self, name: str, namespace: str) -> V1Pod:
875
885
  """
876
886
  Get pod's object.
@@ -879,13 +889,19 @@ class AsyncKubernetesHook(KubernetesHook):
879
889
  :param namespace: Name of the pod's namespace.
880
890
  """
881
891
  async with self.get_conn() as connection:
882
- v1_api = async_client.CoreV1Api(connection)
883
- pod: V1Pod = await v1_api.read_namespaced_pod(
884
- name=name,
885
- namespace=namespace,
886
- )
887
- return pod
892
+ try:
893
+ v1_api = async_client.CoreV1Api(connection)
894
+ pod: V1Pod = await v1_api.read_namespaced_pod(
895
+ name=name,
896
+ namespace=namespace,
897
+ )
898
+ return pod
899
+ except HTTPError as e:
900
+ if hasattr(e, "status") and e.status == 403:
901
+ raise KubernetesApiPermissionError("Permission denied (403) from Kubernetes API.") from e
902
+ raise KubernetesApiError from e
888
903
 
904
+ @generic_api_retry
889
905
  async def delete_pod(self, name: str, namespace: str):
890
906
  """
891
907
  Delete pod's object.
@@ -904,7 +920,10 @@ class AsyncKubernetesHook(KubernetesHook):
904
920
  if str(e.status) != "404":
905
921
  raise
906
922
 
907
- async def read_logs(self, name: str, namespace: str):
923
+ @generic_api_retry
924
+ async def read_logs(
925
+ self, name: str, namespace: str, container_name: str | None = None, since_seconds: int | None = None
926
+ ) -> list[str]:
908
927
  """
909
928
  Read logs inside the pod while starting containers inside.
910
929
 
@@ -915,6 +934,8 @@ class AsyncKubernetesHook(KubernetesHook):
915
934
 
916
935
  :param name: Name of the pod.
917
936
  :param namespace: Name of the pod's namespace.
937
+ :param container_name: Name of the container inside the pod.
938
+ :param since_seconds: Only return logs newer than a relative duration in seconds.
918
939
  """
919
940
  async with self.get_conn() as connection:
920
941
  try:
@@ -922,17 +943,33 @@ class AsyncKubernetesHook(KubernetesHook):
922
943
  logs = await v1_api.read_namespaced_pod_log(
923
944
  name=name,
924
945
  namespace=namespace,
946
+ container=container_name,
925
947
  follow=False,
926
948
  timestamps=True,
949
+ since_seconds=since_seconds,
927
950
  )
928
951
  logs = logs.splitlines()
929
- for line in logs:
930
- self.log.info("Container logs from %s", line)
931
952
  return logs
932
- except HTTPError:
933
- self.log.exception("There was an error reading the kubernetes API.")
934
- raise
953
+ except HTTPError as e:
954
+ raise KubernetesApiError from e
955
+
956
+ @generic_api_retry
957
+ async def get_pod_events(self, name: str, namespace: str) -> CoreV1EventList:
958
+ """Get pod's events."""
959
+ async with self.get_conn() as connection:
960
+ try:
961
+ v1_api = async_client.CoreV1Api(connection)
962
+ events: CoreV1EventList = await v1_api.list_namespaced_event(
963
+ field_selector=f"involvedObject.name={name}",
964
+ namespace=namespace,
965
+ )
966
+ return events
967
+ except HTTPError as e:
968
+ if hasattr(e, "status") and e.status == 403:
969
+ raise KubernetesApiPermissionError("Permission denied (403) from Kubernetes API.") from e
970
+ raise KubernetesApiError from e
935
971
 
972
+ @generic_api_retry
936
973
  async def get_job_status(self, name: str, namespace: str) -> V1Job:
937
974
  """
938
975
  Get job's status object.
@@ -16,8 +16,10 @@
16
16
  # under the License.
17
17
  from __future__ import annotations
18
18
 
19
+ import warnings
20
+
19
21
  from airflow.configuration import conf
20
- from airflow.exceptions import AirflowConfigException
22
+ from airflow.exceptions import AirflowConfigException, AirflowProviderDeprecationWarning
21
23
  from airflow.settings import AIRFLOW_HOME
22
24
 
23
25
 
@@ -53,7 +55,21 @@ class KubeConfig:
53
55
  self.kubernetes_section, "worker_pods_creation_batch_size"
54
56
  )
55
57
  self.worker_container_repository = conf.get(self.kubernetes_section, "worker_container_repository")
58
+ if self.worker_container_repository:
59
+ warnings.warn(
60
+ "Configuration 'worker_container_repository' is deprecated. "
61
+ "Use 'pod_template_file' to specify the container image repository instead.",
62
+ AirflowProviderDeprecationWarning,
63
+ stacklevel=2,
64
+ )
56
65
  self.worker_container_tag = conf.get(self.kubernetes_section, "worker_container_tag")
66
+ if self.worker_container_tag:
67
+ warnings.warn(
68
+ "Configuration 'worker_container_tag' is deprecated. "
69
+ "Use 'pod_template_file' to specify the container image tag instead.",
70
+ AirflowProviderDeprecationWarning,
71
+ stacklevel=2,
72
+ )
57
73
  if self.worker_container_repository and self.worker_container_tag:
58
74
  self.kube_image = f"{self.worker_container_repository}:{self.worker_container_tag}"
59
75
  else:
@@ -64,6 +80,13 @@ class KubeConfig:
64
80
  # cluster has RBAC enabled, your scheduler may need service account permissions to
65
81
  # create, watch, get, and delete pods in this namespace.
66
82
  self.kube_namespace = conf.get(self.kubernetes_section, "namespace")
83
+ if self.kube_namespace and self.kube_namespace != "default":
84
+ warnings.warn(
85
+ "Configuration 'namespace' is deprecated. "
86
+ "Use 'pod_template_file' to specify the namespace instead.",
87
+ AirflowProviderDeprecationWarning,
88
+ stacklevel=2,
89
+ )
67
90
  self.multi_namespace_mode = conf.getboolean(self.kubernetes_section, "multi_namespace_mode")
68
91
  if self.multi_namespace_mode and conf.get(
69
92
  self.kubernetes_section, "multi_namespace_mode_namespace_list"
@@ -23,10 +23,14 @@ from functools import cache
23
23
  from typing import TYPE_CHECKING
24
24
 
25
25
  import pendulum
26
- from kubernetes.client.rest import ApiException
26
+ import tenacity
27
+ from kubernetes.client.rest import ApiException as SyncApiException
28
+ from kubernetes_asyncio.client.exceptions import ApiException as AsyncApiException
27
29
  from slugify import slugify
30
+ from urllib3.exceptions import HTTPError
28
31
 
29
32
  from airflow.configuration import conf
33
+ from airflow.exceptions import AirflowException
30
34
  from airflow.providers.cncf.kubernetes.backcompat import get_logical_date_key
31
35
 
32
36
  if TYPE_CHECKING:
@@ -39,6 +43,62 @@ alphanum_lower = string.ascii_lowercase + string.digits
39
43
  POD_NAME_MAX_LENGTH = 63 # Matches Linux kernel's HOST_NAME_MAX default value minus 1.
40
44
 
41
45
 
46
+ class PodLaunchFailedException(AirflowException):
47
+ """When pod launching fails in KubernetesPodOperator."""
48
+
49
+
50
+ class KubernetesApiException(AirflowException):
51
+ """When communication with kubernetes API fails."""
52
+
53
+
54
+ API_RETRIES = conf.getint("workers", "api_retries", fallback=5)
55
+ API_RETRY_WAIT_MIN = conf.getfloat("workers", "api_retry_wait_min", fallback=1)
56
+ API_RETRY_WAIT_MAX = conf.getfloat("workers", "api_retry_wait_max", fallback=15)
57
+
58
+ _default_wait = tenacity.wait_exponential(min=API_RETRY_WAIT_MIN, max=API_RETRY_WAIT_MAX)
59
+
60
+ TRANSIENT_STATUS_CODES = {409, 429, 500, 502, 503, 504}
61
+
62
+
63
+ def _should_retry_api(exc: BaseException) -> bool:
64
+ """Retry on selected ApiException status codes, plus plain HTTP/timeout errors."""
65
+ if isinstance(exc, (SyncApiException, AsyncApiException)):
66
+ return exc.status in TRANSIENT_STATUS_CODES
67
+ return isinstance(exc, (HTTPError, KubernetesApiException))
68
+
69
+
70
+ class WaitRetryAfterOrExponential(tenacity.wait.wait_base):
71
+ """Wait strategy that honors Retry-After header on 429, else falls back to exponential backoff."""
72
+
73
+ def __call__(self, retry_state):
74
+ exc = retry_state.outcome.exception() if retry_state.outcome else None
75
+ if isinstance(exc, (SyncApiException, AsyncApiException)) and exc.status == 429:
76
+ retry_after = (exc.headers or {}).get("Retry-After")
77
+ if retry_after:
78
+ try:
79
+ return float(int(retry_after))
80
+ except ValueError:
81
+ pass
82
+ # Inline exponential fallback
83
+ return _default_wait(retry_state)
84
+
85
+
86
+ def generic_api_retry(func):
87
+ """
88
+ Retry to Kubernetes API calls.
89
+
90
+ - Retries only transient ApiException status codes.
91
+ - Honors Retry-After on 429.
92
+ """
93
+ return tenacity.retry(
94
+ stop=tenacity.stop_after_attempt(API_RETRIES),
95
+ wait=WaitRetryAfterOrExponential(),
96
+ retry=tenacity.retry_if_exception(_should_retry_api),
97
+ reraise=True,
98
+ before_sleep=tenacity.before_sleep_log(log, logging.WARNING),
99
+ )(func)
100
+
101
+
42
102
  def rand_str(num):
43
103
  """
44
104
  Generate random lowercase alphanumeric string of length num.
@@ -111,6 +171,8 @@ def annotations_to_key(annotations: dict[str, str]) -> TaskInstanceKey:
111
171
  if not annotation_run_id and logical_date_key in annotations:
112
172
  logical_date = pendulum.parse(annotations[logical_date_key])
113
173
  # Do _not_ use create-session, we don't want to expunge
174
+ if Session is None:
175
+ raise RuntimeError("Session not configured. Call configure_orm() first.")
114
176
  session = Session()
115
177
 
116
178
  task_instance_run_id = (
@@ -146,18 +208,3 @@ def annotations_for_logging_task_metadata(annotation_set):
146
208
  else:
147
209
  annotations_for_logging = "<omitted>"
148
210
  return annotations_for_logging
149
-
150
-
151
- def should_retry_creation(exception: BaseException) -> bool:
152
- """
153
- Check if an Exception indicates a transient error and warrants retrying.
154
-
155
- This function is needed for preventing 'No agent available' error. The error appears time to time
156
- when users try to create a Resource or Job. This issue is inside kubernetes and in the current moment
157
- has no solution. Like a temporary solution we decided to retry Job or Resource creation request each
158
- time when this error appears.
159
- More about this issue here: https://github.com/cert-manager/cert-manager/issues/6457
160
- """
161
- if isinstance(exception, ApiException):
162
- return str(exception.status) == "500"
163
- return False
@@ -35,6 +35,7 @@ from airflow.configuration import conf
35
35
  from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
36
36
  from airflow.providers.cncf.kubernetes.hooks.kubernetes import KubernetesHook
37
37
  from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import (
38
+ POD_NAME_MAX_LENGTH,
38
39
  add_unique_suffix,
39
40
  create_unique_id,
40
41
  )
@@ -56,6 +57,8 @@ if TYPE_CHECKING:
56
57
 
57
58
  log = logging.getLogger(__name__)
58
59
 
60
+ JOB_NAME_PREFIX = "job-"
61
+
59
62
 
60
63
  class KubernetesJobOperator(KubernetesPodOperator):
61
64
  """
@@ -378,15 +381,18 @@ class KubernetesJobOperator(KubernetesPodOperator):
378
381
 
379
382
  job = self.reconcile_jobs(job_template, job)
380
383
 
384
+ # Account for job name prefix when generating/truncating the name
385
+ max_base_length = POD_NAME_MAX_LENGTH - len(JOB_NAME_PREFIX)
386
+
381
387
  if not job.metadata.name:
382
388
  job.metadata.name = create_unique_id(
383
- task_id=self.task_id, unique=self.random_name_suffix, max_length=80
389
+ task_id=self.task_id, unique=self.random_name_suffix, max_length=max_base_length
384
390
  )
385
391
  elif self.random_name_suffix:
386
392
  # user has supplied job name, we're just adding suffix
387
- job.metadata.name = add_unique_suffix(name=job.metadata.name)
393
+ job.metadata.name = add_unique_suffix(name=job.metadata.name, max_len=max_base_length)
388
394
 
389
- job.metadata.name = f"job-{job.metadata.name}"
395
+ job.metadata.name = f"{JOB_NAME_PREFIX}{job.metadata.name}"
390
396
 
391
397
  if not job.metadata.namespace:
392
398
  hook_namespace = self.hook.get_namespace()