apache-airflow-providers-cncf-kubernetes 10.9.0rc1__py3-none-any.whl → 10.11.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/cncf/kubernetes/__init__.py +3 -3
- airflow/providers/cncf/kubernetes/exceptions.py +9 -3
- airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +24 -5
- airflow/providers/cncf/kubernetes/get_provider_info.py +6 -0
- airflow/providers/cncf/kubernetes/hooks/kubernetes.py +58 -21
- airflow/providers/cncf/kubernetes/kube_config.py +24 -1
- airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py +63 -16
- airflow/providers/cncf/kubernetes/operators/job.py +9 -3
- airflow/providers/cncf/kubernetes/operators/pod.py +36 -45
- airflow/providers/cncf/kubernetes/operators/resource.py +2 -8
- airflow/providers/cncf/kubernetes/operators/spark_kubernetes.py +18 -3
- airflow/providers/cncf/kubernetes/secret.py +3 -0
- airflow/providers/cncf/kubernetes/triggers/pod.py +56 -24
- airflow/providers/cncf/kubernetes/utils/pod_manager.py +256 -111
- airflow/providers/cncf/kubernetes/version_compat.py +5 -1
- {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/METADATA +19 -17
- {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/RECORD +21 -20
- apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info/licenses/NOTICE +5 -0
- {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_cncf_kubernetes-10.9.0rc1.dist-info → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info}/entry_points.txt +0 -0
- {airflow/providers/cncf/kubernetes → apache_airflow_providers_cncf_kubernetes-10.11.0rc2.dist-info/licenses}/LICENSE +0 -0
|
@@ -29,11 +29,11 @@ from airflow import __version__ as airflow_version
|
|
|
29
29
|
|
|
30
30
|
__all__ = ["__version__"]
|
|
31
31
|
|
|
32
|
-
__version__ = "10.9.0"
|
|
32
|
+
__version__ = "10.11.0"
|
|
33
33
|
|
|
34
34
|
if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
|
|
35
|
-
"2.
|
|
35
|
+
"2.11.0"
|
|
36
36
|
):
|
|
37
37
|
raise RuntimeError(
|
|
38
|
-
f"The package `apache-airflow-providers-cncf-kubernetes:{__version__}` needs Apache Airflow 2.
|
|
38
|
+
f"The package `apache-airflow-providers-cncf-kubernetes:{__version__}` needs Apache Airflow 2.11.0+"
|
|
39
39
|
)
|
|
@@ -16,9 +16,7 @@
|
|
|
16
16
|
# under the License.
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
|
-
from airflow.exceptions import (
|
|
20
|
-
AirflowException,
|
|
21
|
-
)
|
|
19
|
+
from airflow.exceptions import AirflowException
|
|
22
20
|
|
|
23
21
|
|
|
24
22
|
class PodMutationHookException(AirflowException):
|
|
@@ -27,3 +25,11 @@ class PodMutationHookException(AirflowException):
|
|
|
27
25
|
|
|
28
26
|
class PodReconciliationError(AirflowException):
|
|
29
27
|
"""Raised when an error is encountered while trying to merge pod configs."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class KubernetesApiError(AirflowException):
|
|
31
|
+
"""Raised when an error is encountered while trying to access the Kubernetes API."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class KubernetesApiPermissionError(AirflowException):
|
|
35
|
+
"""Raised when permission is denied (HTTP 403) while trying to access the Kubernetes API."""
|
|
@@ -165,6 +165,7 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
165
165
|
self.task_publish_max_retries = conf.getint(
|
|
166
166
|
"kubernetes_executor", "task_publish_max_retries", fallback=0
|
|
167
167
|
)
|
|
168
|
+
self.completed: set[KubernetesResults] = set()
|
|
168
169
|
super().__init__(parallelism=self.kube_config.parallelism)
|
|
169
170
|
|
|
170
171
|
def _list_pods(self, query_kwargs):
|
|
@@ -343,6 +344,9 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
343
344
|
finally:
|
|
344
345
|
self.result_queue.task_done()
|
|
345
346
|
|
|
347
|
+
for result in self.completed:
|
|
348
|
+
self._change_state(result)
|
|
349
|
+
|
|
346
350
|
from airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils import ResourceVersion
|
|
347
351
|
|
|
348
352
|
resource_instance = ResourceVersion()
|
|
@@ -385,6 +389,7 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
385
389
|
if (
|
|
386
390
|
(str(e.status) == "403" and "exceeded quota" in message)
|
|
387
391
|
or (str(e.status) == "409" and "object has been modified" in message)
|
|
392
|
+
or str(e.status) == "500"
|
|
388
393
|
) and (self.task_publish_max_retries == -1 or retries < self.task_publish_max_retries):
|
|
389
394
|
self.log.warning(
|
|
390
395
|
"[Try %s of %s] Kube ApiException for Task: (%s). Reason: %r. Message: %s",
|
|
@@ -501,7 +506,11 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
501
506
|
if state is None:
|
|
502
507
|
from airflow.models.taskinstance import TaskInstance
|
|
503
508
|
|
|
504
|
-
|
|
509
|
+
filter_for_tis = TaskInstance.filter_for_tis([key])
|
|
510
|
+
if filter_for_tis is not None:
|
|
511
|
+
state = session.scalar(select(TaskInstance.state).where(filter_for_tis))
|
|
512
|
+
else:
|
|
513
|
+
state = None
|
|
505
514
|
state = TaskInstanceState(state) if state else None
|
|
506
515
|
|
|
507
516
|
self.event_buffer[key] = state, None
|
|
@@ -511,7 +520,8 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
511
520
|
pod_override = ti.executor_config.get("pod_override")
|
|
512
521
|
namespace = None
|
|
513
522
|
with suppress(Exception):
|
|
514
|
-
|
|
523
|
+
if pod_override is not None:
|
|
524
|
+
namespace = pod_override.metadata.namespace
|
|
515
525
|
return namespace or conf.get("kubernetes_executor", "namespace")
|
|
516
526
|
|
|
517
527
|
def get_task_log(self, ti: TaskInstance, try_number: int) -> tuple[list[str], list[str]]:
|
|
@@ -565,7 +575,7 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
565
575
|
tis_to_flush_by_key = {ti.key: ti for ti in tis if ti.queued_by_job_id}
|
|
566
576
|
kube_client: client.CoreV1Api = self.kube_client
|
|
567
577
|
for scheduler_job_id in scheduler_job_ids:
|
|
568
|
-
|
|
578
|
+
scheduler_job_id_safe_label = self._make_safe_label_value(str(scheduler_job_id))
|
|
569
579
|
# We will look for any pods owned by the no-longer-running scheduler,
|
|
570
580
|
# but will exclude only successful pods, as those TIs will have a terminal state
|
|
571
581
|
# and not be up for adoption!
|
|
@@ -575,7 +585,7 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
575
585
|
"field_selector": "status.phase!=Succeeded",
|
|
576
586
|
"label_selector": (
|
|
577
587
|
"kubernetes_executor=True,"
|
|
578
|
-
f"airflow-worker={
|
|
588
|
+
f"airflow-worker={scheduler_job_id_safe_label},{POD_EXECUTOR_DONE_KEY}!=True"
|
|
579
589
|
),
|
|
580
590
|
}
|
|
581
591
|
pod_list = self._list_pods(query_kwargs)
|
|
@@ -720,7 +730,16 @@ class KubernetesExecutor(BaseExecutor):
|
|
|
720
730
|
continue
|
|
721
731
|
|
|
722
732
|
ti_id = annotations_to_key(pod.metadata.annotations)
|
|
723
|
-
self.
|
|
733
|
+
self.completed.add(
|
|
734
|
+
KubernetesResults(
|
|
735
|
+
key=ti_id,
|
|
736
|
+
state="completed",
|
|
737
|
+
pod_name=pod.metadata.name,
|
|
738
|
+
namespace=pod.metadata.namespace,
|
|
739
|
+
resource_version=pod.metadata.resource_version,
|
|
740
|
+
failure_details=None,
|
|
741
|
+
)
|
|
742
|
+
)
|
|
724
743
|
|
|
725
744
|
def _flush_task_queue(self) -> None:
|
|
726
745
|
if TYPE_CHECKING:
|
|
@@ -135,6 +135,8 @@ def get_provider_info():
|
|
|
135
135
|
"type": "string",
|
|
136
136
|
"example": None,
|
|
137
137
|
"default": "",
|
|
138
|
+
"deprecated": True,
|
|
139
|
+
"deprecation_reason": "This configuration is deprecated. Use `pod_template_file` to specify container image instead.\n",
|
|
138
140
|
},
|
|
139
141
|
"worker_container_tag": {
|
|
140
142
|
"description": "The tag of the Kubernetes Image for the Worker to Run\n",
|
|
@@ -142,6 +144,8 @@ def get_provider_info():
|
|
|
142
144
|
"type": "string",
|
|
143
145
|
"example": None,
|
|
144
146
|
"default": "",
|
|
147
|
+
"deprecated": True,
|
|
148
|
+
"deprecation_reason": "This configuration is deprecated. Use `pod_template_file` to specify the image tag instead.\n",
|
|
145
149
|
},
|
|
146
150
|
"namespace": {
|
|
147
151
|
"description": "The Kubernetes namespace where airflow workers should be created. Defaults to ``default``\n",
|
|
@@ -149,6 +153,8 @@ def get_provider_info():
|
|
|
149
153
|
"type": "string",
|
|
150
154
|
"example": None,
|
|
151
155
|
"default": "default",
|
|
156
|
+
"deprecated": True,
|
|
157
|
+
"deprecation_reason": "This configuration is deprecated. Use `pod_template_file` to specify namespace instead.\n",
|
|
152
158
|
},
|
|
153
159
|
"delete_worker_pods": {
|
|
154
160
|
"description": "If True, all worker pods will be deleted upon termination\n",
|
|
@@ -27,7 +27,6 @@ from typing import TYPE_CHECKING, Any, Protocol
|
|
|
27
27
|
|
|
28
28
|
import aiofiles
|
|
29
29
|
import requests
|
|
30
|
-
import tenacity
|
|
31
30
|
from asgiref.sync import sync_to_async
|
|
32
31
|
from kubernetes import client, config, utils, watch
|
|
33
32
|
from kubernetes.client.models import V1Deployment
|
|
@@ -37,8 +36,9 @@ from urllib3.exceptions import HTTPError
|
|
|
37
36
|
|
|
38
37
|
from airflow.exceptions import AirflowException, AirflowNotFoundException
|
|
39
38
|
from airflow.models import Connection
|
|
39
|
+
from airflow.providers.cncf.kubernetes.exceptions import KubernetesApiError, KubernetesApiPermissionError
|
|
40
40
|
from airflow.providers.cncf.kubernetes.kube_client import _disable_verify_ssl, _enable_tcp_keepalive
|
|
41
|
-
from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import should_retry_creation
|
|
41
|
+
from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import generic_api_retry
|
|
42
42
|
from airflow.providers.cncf.kubernetes.utils.container import (
|
|
43
43
|
container_is_completed,
|
|
44
44
|
container_is_running,
|
|
@@ -48,7 +48,7 @@ from airflow.utils import yaml
|
|
|
48
48
|
|
|
49
49
|
if TYPE_CHECKING:
|
|
50
50
|
from kubernetes.client import V1JobList
|
|
51
|
-
from kubernetes.client.models import V1Job, V1Pod
|
|
51
|
+
from kubernetes.client.models import CoreV1EventList, V1Job, V1Pod
|
|
52
52
|
|
|
53
53
|
LOADING_KUBE_CONFIG_FILE_RESOURCE = "Loading Kubernetes configuration file kube_config from {}..."
|
|
54
54
|
|
|
@@ -389,6 +389,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
389
389
|
self.log.debug("Response: %s", response)
|
|
390
390
|
return response
|
|
391
391
|
|
|
392
|
+
@generic_api_retry
|
|
392
393
|
def get_custom_object(
|
|
393
394
|
self, group: str, version: str, plural: str, name: str, namespace: str | None = None
|
|
394
395
|
):
|
|
@@ -411,6 +412,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
411
412
|
)
|
|
412
413
|
return response
|
|
413
414
|
|
|
415
|
+
@generic_api_retry
|
|
414
416
|
def delete_custom_object(
|
|
415
417
|
self, group: str, version: str, plural: str, name: str, namespace: str | None = None, **kwargs
|
|
416
418
|
):
|
|
@@ -539,12 +541,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
539
541
|
name=name, namespace=namespace, pretty=True, **kwargs
|
|
540
542
|
)
|
|
541
543
|
|
|
542
|
-
@tenacity.retry(
|
|
543
|
-
stop=tenacity.stop_after_attempt(3),
|
|
544
|
-
wait=tenacity.wait_random_exponential(),
|
|
545
|
-
reraise=True,
|
|
546
|
-
retry=tenacity.retry_if_exception(should_retry_creation),
|
|
547
|
-
)
|
|
544
|
+
@generic_api_retry
|
|
548
545
|
def create_job(
|
|
549
546
|
self,
|
|
550
547
|
job: V1Job,
|
|
@@ -571,6 +568,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
571
568
|
raise e
|
|
572
569
|
return resp
|
|
573
570
|
|
|
571
|
+
@generic_api_retry
|
|
574
572
|
def get_job(self, job_name: str, namespace: str) -> V1Job:
|
|
575
573
|
"""
|
|
576
574
|
Get Job of specified name and namespace.
|
|
@@ -581,6 +579,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
581
579
|
"""
|
|
582
580
|
return self.batch_v1_client.read_namespaced_job(name=job_name, namespace=namespace, pretty=True)
|
|
583
581
|
|
|
582
|
+
@generic_api_retry
|
|
584
583
|
def get_job_status(self, job_name: str, namespace: str) -> V1Job:
|
|
585
584
|
"""
|
|
586
585
|
Get job with status of specified name and namespace.
|
|
@@ -610,6 +609,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
610
609
|
self.log.info("The job '%s' is incomplete. Sleeping for %i sec.", job_name, job_poll_interval)
|
|
611
610
|
sleep(job_poll_interval)
|
|
612
611
|
|
|
612
|
+
@generic_api_retry
|
|
613
613
|
def list_jobs_all_namespaces(self) -> V1JobList:
|
|
614
614
|
"""
|
|
615
615
|
Get list of Jobs from all namespaces.
|
|
@@ -618,6 +618,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
618
618
|
"""
|
|
619
619
|
return self.batch_v1_client.list_job_for_all_namespaces(pretty=True)
|
|
620
620
|
|
|
621
|
+
@generic_api_retry
|
|
621
622
|
def list_jobs_from_namespace(self, namespace: str) -> V1JobList:
|
|
622
623
|
"""
|
|
623
624
|
Get list of Jobs from dedicated namespace.
|
|
@@ -673,6 +674,7 @@ class KubernetesHook(BaseHook, PodOperatorHookProtocol):
|
|
|
673
674
|
return bool(next((c for c in conditions if c.type == "Complete" and c.status), None))
|
|
674
675
|
return False
|
|
675
676
|
|
|
677
|
+
@generic_api_retry
|
|
676
678
|
def patch_namespaced_job(self, job_name: str, namespace: str, body: object) -> V1Job:
|
|
677
679
|
"""
|
|
678
680
|
Update the specified Job.
|
|
@@ -830,6 +832,13 @@ class AsyncKubernetesHook(KubernetesHook):
|
|
|
830
832
|
"Reading kubernetes configuration file from connection "
|
|
831
833
|
"object and writing temporary config file with its content",
|
|
832
834
|
)
|
|
835
|
+
if isinstance(kubeconfig, dict):
|
|
836
|
+
self.log.debug(
|
|
837
|
+
LOADING_KUBE_CONFIG_FILE_RESOURCE.format(
|
|
838
|
+
"connection kube_config dictionary (serializing)"
|
|
839
|
+
)
|
|
840
|
+
)
|
|
841
|
+
kubeconfig = json.dumps(kubeconfig)
|
|
833
842
|
await temp_config.write(kubeconfig.encode())
|
|
834
843
|
await temp_config.flush()
|
|
835
844
|
self._is_in_cluster = False
|
|
@@ -871,6 +880,7 @@ class AsyncKubernetesHook(KubernetesHook):
|
|
|
871
880
|
if kube_client is not None:
|
|
872
881
|
await kube_client.close()
|
|
873
882
|
|
|
883
|
+
@generic_api_retry
|
|
874
884
|
async def get_pod(self, name: str, namespace: str) -> V1Pod:
|
|
875
885
|
"""
|
|
876
886
|
Get pod's object.
|
|
@@ -879,13 +889,19 @@ class AsyncKubernetesHook(KubernetesHook):
|
|
|
879
889
|
:param namespace: Name of the pod's namespace.
|
|
880
890
|
"""
|
|
881
891
|
async with self.get_conn() as connection:
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
892
|
+
try:
|
|
893
|
+
v1_api = async_client.CoreV1Api(connection)
|
|
894
|
+
pod: V1Pod = await v1_api.read_namespaced_pod(
|
|
895
|
+
name=name,
|
|
896
|
+
namespace=namespace,
|
|
897
|
+
)
|
|
898
|
+
return pod
|
|
899
|
+
except HTTPError as e:
|
|
900
|
+
if hasattr(e, "status") and e.status == 403:
|
|
901
|
+
raise KubernetesApiPermissionError("Permission denied (403) from Kubernetes API.") from e
|
|
902
|
+
raise KubernetesApiError from e
|
|
888
903
|
|
|
904
|
+
@generic_api_retry
|
|
889
905
|
async def delete_pod(self, name: str, namespace: str):
|
|
890
906
|
"""
|
|
891
907
|
Delete pod's object.
|
|
@@ -904,7 +920,10 @@ class AsyncKubernetesHook(KubernetesHook):
|
|
|
904
920
|
if str(e.status) != "404":
|
|
905
921
|
raise
|
|
906
922
|
|
|
907
|
-
|
|
923
|
+
@generic_api_retry
|
|
924
|
+
async def read_logs(
|
|
925
|
+
self, name: str, namespace: str, container_name: str | None = None, since_seconds: int | None = None
|
|
926
|
+
) -> list[str]:
|
|
908
927
|
"""
|
|
909
928
|
Read logs inside the pod while starting containers inside.
|
|
910
929
|
|
|
@@ -915,6 +934,8 @@ class AsyncKubernetesHook(KubernetesHook):
|
|
|
915
934
|
|
|
916
935
|
:param name: Name of the pod.
|
|
917
936
|
:param namespace: Name of the pod's namespace.
|
|
937
|
+
:param container_name: Name of the container inside the pod.
|
|
938
|
+
:param since_seconds: Only return logs newer than a relative duration in seconds.
|
|
918
939
|
"""
|
|
919
940
|
async with self.get_conn() as connection:
|
|
920
941
|
try:
|
|
@@ -922,17 +943,33 @@ class AsyncKubernetesHook(KubernetesHook):
|
|
|
922
943
|
logs = await v1_api.read_namespaced_pod_log(
|
|
923
944
|
name=name,
|
|
924
945
|
namespace=namespace,
|
|
946
|
+
container=container_name,
|
|
925
947
|
follow=False,
|
|
926
948
|
timestamps=True,
|
|
949
|
+
since_seconds=since_seconds,
|
|
927
950
|
)
|
|
928
951
|
logs = logs.splitlines()
|
|
929
|
-
for line in logs:
|
|
930
|
-
self.log.info("Container logs from %s", line)
|
|
931
952
|
return logs
|
|
932
|
-
except HTTPError:
|
|
933
|
-
|
|
934
|
-
|
|
953
|
+
except HTTPError as e:
|
|
954
|
+
raise KubernetesApiError from e
|
|
955
|
+
|
|
956
|
+
@generic_api_retry
|
|
957
|
+
async def get_pod_events(self, name: str, namespace: str) -> CoreV1EventList:
|
|
958
|
+
"""Get pod's events."""
|
|
959
|
+
async with self.get_conn() as connection:
|
|
960
|
+
try:
|
|
961
|
+
v1_api = async_client.CoreV1Api(connection)
|
|
962
|
+
events: CoreV1EventList = await v1_api.list_namespaced_event(
|
|
963
|
+
field_selector=f"involvedObject.name={name}",
|
|
964
|
+
namespace=namespace,
|
|
965
|
+
)
|
|
966
|
+
return events
|
|
967
|
+
except HTTPError as e:
|
|
968
|
+
if hasattr(e, "status") and e.status == 403:
|
|
969
|
+
raise KubernetesApiPermissionError("Permission denied (403) from Kubernetes API.") from e
|
|
970
|
+
raise KubernetesApiError from e
|
|
935
971
|
|
|
972
|
+
@generic_api_retry
|
|
936
973
|
async def get_job_status(self, name: str, namespace: str) -> V1Job:
|
|
937
974
|
"""
|
|
938
975
|
Get job's status object.
|
|
@@ -16,8 +16,10 @@
|
|
|
16
16
|
# under the License.
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
|
+
import warnings
|
|
20
|
+
|
|
19
21
|
from airflow.configuration import conf
|
|
20
|
-
from airflow.exceptions import AirflowConfigException
|
|
22
|
+
from airflow.exceptions import AirflowConfigException, AirflowProviderDeprecationWarning
|
|
21
23
|
from airflow.settings import AIRFLOW_HOME
|
|
22
24
|
|
|
23
25
|
|
|
@@ -53,7 +55,21 @@ class KubeConfig:
|
|
|
53
55
|
self.kubernetes_section, "worker_pods_creation_batch_size"
|
|
54
56
|
)
|
|
55
57
|
self.worker_container_repository = conf.get(self.kubernetes_section, "worker_container_repository")
|
|
58
|
+
if self.worker_container_repository:
|
|
59
|
+
warnings.warn(
|
|
60
|
+
"Configuration 'worker_container_repository' is deprecated. "
|
|
61
|
+
"Use 'pod_template_file' to specify the container image repository instead.",
|
|
62
|
+
AirflowProviderDeprecationWarning,
|
|
63
|
+
stacklevel=2,
|
|
64
|
+
)
|
|
56
65
|
self.worker_container_tag = conf.get(self.kubernetes_section, "worker_container_tag")
|
|
66
|
+
if self.worker_container_tag:
|
|
67
|
+
warnings.warn(
|
|
68
|
+
"Configuration 'worker_container_tag' is deprecated. "
|
|
69
|
+
"Use 'pod_template_file' to specify the container image tag instead.",
|
|
70
|
+
AirflowProviderDeprecationWarning,
|
|
71
|
+
stacklevel=2,
|
|
72
|
+
)
|
|
57
73
|
if self.worker_container_repository and self.worker_container_tag:
|
|
58
74
|
self.kube_image = f"{self.worker_container_repository}:{self.worker_container_tag}"
|
|
59
75
|
else:
|
|
@@ -64,6 +80,13 @@ class KubeConfig:
|
|
|
64
80
|
# cluster has RBAC enabled, your scheduler may need service account permissions to
|
|
65
81
|
# create, watch, get, and delete pods in this namespace.
|
|
66
82
|
self.kube_namespace = conf.get(self.kubernetes_section, "namespace")
|
|
83
|
+
if self.kube_namespace and self.kube_namespace != "default":
|
|
84
|
+
warnings.warn(
|
|
85
|
+
"Configuration 'namespace' is deprecated. "
|
|
86
|
+
"Use 'pod_template_file' to specify the namespace instead.",
|
|
87
|
+
AirflowProviderDeprecationWarning,
|
|
88
|
+
stacklevel=2,
|
|
89
|
+
)
|
|
67
90
|
self.multi_namespace_mode = conf.getboolean(self.kubernetes_section, "multi_namespace_mode")
|
|
68
91
|
if self.multi_namespace_mode and conf.get(
|
|
69
92
|
self.kubernetes_section, "multi_namespace_mode_namespace_list"
|
|
@@ -23,10 +23,14 @@ from functools import cache
|
|
|
23
23
|
from typing import TYPE_CHECKING
|
|
24
24
|
|
|
25
25
|
import pendulum
|
|
26
|
-
|
|
26
|
+
import tenacity
|
|
27
|
+
from kubernetes.client.rest import ApiException as SyncApiException
|
|
28
|
+
from kubernetes_asyncio.client.exceptions import ApiException as AsyncApiException
|
|
27
29
|
from slugify import slugify
|
|
30
|
+
from urllib3.exceptions import HTTPError
|
|
28
31
|
|
|
29
32
|
from airflow.configuration import conf
|
|
33
|
+
from airflow.exceptions import AirflowException
|
|
30
34
|
from airflow.providers.cncf.kubernetes.backcompat import get_logical_date_key
|
|
31
35
|
|
|
32
36
|
if TYPE_CHECKING:
|
|
@@ -39,6 +43,62 @@ alphanum_lower = string.ascii_lowercase + string.digits
|
|
|
39
43
|
POD_NAME_MAX_LENGTH = 63 # Matches Linux kernel's HOST_NAME_MAX default value minus 1.
|
|
40
44
|
|
|
41
45
|
|
|
46
|
+
class PodLaunchFailedException(AirflowException):
|
|
47
|
+
"""When pod launching fails in KubernetesPodOperator."""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class KubernetesApiException(AirflowException):
|
|
51
|
+
"""When communication with kubernetes API fails."""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
API_RETRIES = conf.getint("workers", "api_retries", fallback=5)
|
|
55
|
+
API_RETRY_WAIT_MIN = conf.getfloat("workers", "api_retry_wait_min", fallback=1)
|
|
56
|
+
API_RETRY_WAIT_MAX = conf.getfloat("workers", "api_retry_wait_max", fallback=15)
|
|
57
|
+
|
|
58
|
+
_default_wait = tenacity.wait_exponential(min=API_RETRY_WAIT_MIN, max=API_RETRY_WAIT_MAX)
|
|
59
|
+
|
|
60
|
+
TRANSIENT_STATUS_CODES = {409, 429, 500, 502, 503, 504}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _should_retry_api(exc: BaseException) -> bool:
|
|
64
|
+
"""Retry on selected ApiException status codes, plus plain HTTP/timeout errors."""
|
|
65
|
+
if isinstance(exc, (SyncApiException, AsyncApiException)):
|
|
66
|
+
return exc.status in TRANSIENT_STATUS_CODES
|
|
67
|
+
return isinstance(exc, (HTTPError, KubernetesApiException))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class WaitRetryAfterOrExponential(tenacity.wait.wait_base):
|
|
71
|
+
"""Wait strategy that honors Retry-After header on 429, else falls back to exponential backoff."""
|
|
72
|
+
|
|
73
|
+
def __call__(self, retry_state):
|
|
74
|
+
exc = retry_state.outcome.exception() if retry_state.outcome else None
|
|
75
|
+
if isinstance(exc, (SyncApiException, AsyncApiException)) and exc.status == 429:
|
|
76
|
+
retry_after = (exc.headers or {}).get("Retry-After")
|
|
77
|
+
if retry_after:
|
|
78
|
+
try:
|
|
79
|
+
return float(int(retry_after))
|
|
80
|
+
except ValueError:
|
|
81
|
+
pass
|
|
82
|
+
# Inline exponential fallback
|
|
83
|
+
return _default_wait(retry_state)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def generic_api_retry(func):
|
|
87
|
+
"""
|
|
88
|
+
Retry Kubernetes API calls on transient failures.
|
|
89
|
+
|
|
90
|
+
- Retries only transient ApiException status codes.
|
|
91
|
+
- Honors Retry-After on 429.
|
|
92
|
+
"""
|
|
93
|
+
return tenacity.retry(
|
|
94
|
+
stop=tenacity.stop_after_attempt(API_RETRIES),
|
|
95
|
+
wait=WaitRetryAfterOrExponential(),
|
|
96
|
+
retry=tenacity.retry_if_exception(_should_retry_api),
|
|
97
|
+
reraise=True,
|
|
98
|
+
before_sleep=tenacity.before_sleep_log(log, logging.WARNING),
|
|
99
|
+
)(func)
|
|
100
|
+
|
|
101
|
+
|
|
42
102
|
def rand_str(num):
|
|
43
103
|
"""
|
|
44
104
|
Generate random lowercase alphanumeric string of length num.
|
|
@@ -111,6 +171,8 @@ def annotations_to_key(annotations: dict[str, str]) -> TaskInstanceKey:
|
|
|
111
171
|
if not annotation_run_id and logical_date_key in annotations:
|
|
112
172
|
logical_date = pendulum.parse(annotations[logical_date_key])
|
|
113
173
|
# Do _not_ use create-session, we don't want to expunge
|
|
174
|
+
if Session is None:
|
|
175
|
+
raise RuntimeError("Session not configured. Call configure_orm() first.")
|
|
114
176
|
session = Session()
|
|
115
177
|
|
|
116
178
|
task_instance_run_id = (
|
|
@@ -146,18 +208,3 @@ def annotations_for_logging_task_metadata(annotation_set):
|
|
|
146
208
|
else:
|
|
147
209
|
annotations_for_logging = "<omitted>"
|
|
148
210
|
return annotations_for_logging
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def should_retry_creation(exception: BaseException) -> bool:
|
|
152
|
-
"""
|
|
153
|
-
Check if an Exception indicates a transient error and warrants retrying.
|
|
154
|
-
|
|
155
|
-
This function is needed for preventing 'No agent available' error. The error appears time to time
|
|
156
|
-
when users try to create a Resource or Job. This issue is inside kubernetes and in the current moment
|
|
157
|
-
has no solution. Like a temporary solution we decided to retry Job or Resource creation request each
|
|
158
|
-
time when this error appears.
|
|
159
|
-
More about this issue here: https://github.com/cert-manager/cert-manager/issues/6457
|
|
160
|
-
"""
|
|
161
|
-
if isinstance(exception, ApiException):
|
|
162
|
-
return str(exception.status) == "500"
|
|
163
|
-
return False
|
|
@@ -35,6 +35,7 @@ from airflow.configuration import conf
|
|
|
35
35
|
from airflow.exceptions import AirflowException, AirflowProviderDeprecationWarning
|
|
36
36
|
from airflow.providers.cncf.kubernetes.hooks.kubernetes import KubernetesHook
|
|
37
37
|
from airflow.providers.cncf.kubernetes.kubernetes_helper_functions import (
|
|
38
|
+
POD_NAME_MAX_LENGTH,
|
|
38
39
|
add_unique_suffix,
|
|
39
40
|
create_unique_id,
|
|
40
41
|
)
|
|
@@ -56,6 +57,8 @@ if TYPE_CHECKING:
|
|
|
56
57
|
|
|
57
58
|
log = logging.getLogger(__name__)
|
|
58
59
|
|
|
60
|
+
JOB_NAME_PREFIX = "job-"
|
|
61
|
+
|
|
59
62
|
|
|
60
63
|
class KubernetesJobOperator(KubernetesPodOperator):
|
|
61
64
|
"""
|
|
@@ -378,15 +381,18 @@ class KubernetesJobOperator(KubernetesPodOperator):
|
|
|
378
381
|
|
|
379
382
|
job = self.reconcile_jobs(job_template, job)
|
|
380
383
|
|
|
384
|
+
# Account for job name prefix when generating/truncating the name
|
|
385
|
+
max_base_length = POD_NAME_MAX_LENGTH - len(JOB_NAME_PREFIX)
|
|
386
|
+
|
|
381
387
|
if not job.metadata.name:
|
|
382
388
|
job.metadata.name = create_unique_id(
|
|
383
|
-
task_id=self.task_id, unique=self.random_name_suffix, max_length=
|
|
389
|
+
task_id=self.task_id, unique=self.random_name_suffix, max_length=max_base_length
|
|
384
390
|
)
|
|
385
391
|
elif self.random_name_suffix:
|
|
386
392
|
# user has supplied job name, we're just adding suffix
|
|
387
|
-
job.metadata.name = add_unique_suffix(name=job.metadata.name)
|
|
393
|
+
job.metadata.name = add_unique_suffix(name=job.metadata.name, max_len=max_base_length)
|
|
388
394
|
|
|
389
|
-
job.metadata.name = f"
|
|
395
|
+
job.metadata.name = f"{JOB_NAME_PREFIX}{job.metadata.name}"
|
|
390
396
|
|
|
391
397
|
if not job.metadata.namespace:
|
|
392
398
|
hook_namespace = self.hook.get_namespace()
|