polyaxon 2.0.6rc9__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polyaxon/_cli/config.py +1 -1
- polyaxon/_cli/run.py +8 -0
- polyaxon/_cli/services/clean_artifacts.py +1 -1
- polyaxon/_client/client.py +17 -0
- polyaxon/_client/run.py +12 -0
- polyaxon/_compiler/resolver/agent.py +1 -1
- polyaxon/_compiler/resolver/runtime.py +1 -1
- polyaxon/_deploy/schemas/service.py +3 -0
- polyaxon/_docker/executor.py +10 -4
- polyaxon/_env_vars/getters/run.py +3 -0
- polyaxon/_env_vars/keys.py +3 -0
- polyaxon/_flow/__init__.py +2 -0
- polyaxon/_flow/builds/__init__.py +19 -6
- polyaxon/_flow/component/base.py +1 -0
- polyaxon/_flow/component/component.py +14 -0
- polyaxon/_flow/environment/__init__.py +5 -5
- polyaxon/_flow/hooks/__init__.py +19 -6
- polyaxon/_flow/matrix/tuner.py +18 -6
- polyaxon/_flow/operations/operation.py +19 -0
- polyaxon/_flow/run/__init__.py +2 -2
- polyaxon/_flow/run/kubeflow/paddle_job.py +34 -2
- polyaxon/_flow/run/kubeflow/pytorch_job.py +50 -3
- polyaxon/_flow/run/kubeflow/scheduling_policy.py +4 -0
- polyaxon/_flow/run/kubeflow/tf_job.py +2 -1
- polyaxon/_fs/fs.py +5 -0
- polyaxon/_k8s/converter/converters/job.py +1 -1
- polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
- polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
- polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
- polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
- polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
- polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
- polyaxon/_k8s/executor/base.py +23 -6
- polyaxon/_k8s/logging/async_monitor.py +73 -12
- polyaxon/_k8s/manager/async_manager.py +81 -23
- polyaxon/_k8s/manager/base.py +4 -0
- polyaxon/_k8s/manager/manager.py +266 -133
- polyaxon/_operations/tuner.py +1 -0
- polyaxon/_polyaxonfile/check.py +2 -0
- polyaxon/_polyaxonfile/manager/operations.py +3 -0
- polyaxon/_polyaxonfile/manager/workflows.py +2 -0
- polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
- polyaxon/_polyaxonfile/specs/operation.py +1 -0
- polyaxon/_polyaxonfile/specs/sections.py +3 -0
- polyaxon/_runner/agent/async_agent.py +94 -18
- polyaxon/_runner/agent/base_agent.py +25 -7
- polyaxon/_runner/agent/client.py +15 -1
- polyaxon/_runner/agent/sync_agent.py +83 -18
- polyaxon/_runner/executor.py +13 -7
- polyaxon/_schemas/agent.py +27 -1
- polyaxon/_schemas/client.py +30 -3
- polyaxon/_sdk/api/agents_v1_api.py +875 -51
- polyaxon/_sdk/api/service_accounts_v1_api.py +12 -12
- polyaxon/_sdk/schemas/__init__.py +3 -0
- polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
- polyaxon/_sidecar/container/__init__.py +1 -1
- polyaxon/_sidecar/container/monitors/spec.py +1 -1
- polyaxon/pkg.py +1 -1
- {polyaxon-2.0.6rc9.dist-info → polyaxon-2.1.0.dist-info}/METADATA +6 -6
- {polyaxon-2.0.6rc9.dist-info → polyaxon-2.1.0.dist-info}/RECORD +64 -63
- {polyaxon-2.0.6rc9.dist-info → polyaxon-2.1.0.dist-info}/LICENSE +0 -0
- {polyaxon-2.0.6rc9.dist-info → polyaxon-2.1.0.dist-info}/WHEEL +0 -0
- {polyaxon-2.0.6rc9.dist-info → polyaxon-2.1.0.dist-info}/entry_points.txt +0 -0
- {polyaxon-2.0.6rc9.dist-info → polyaxon-2.1.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
|
|
1
|
-
from typing import Optional, Union
|
1
|
+
from typing import Dict, List, Optional, Union
|
2
2
|
from typing_extensions import Literal
|
3
3
|
|
4
|
-
from clipped.compact.pydantic import Field
|
5
|
-
from clipped.types.ref_or_obj import RefField
|
4
|
+
from clipped.compact.pydantic import Field, StrictStr
|
5
|
+
from clipped.types.ref_or_obj import BoolOrRef, IntOrRef, RefField
|
6
6
|
|
7
7
|
from polyaxon._flow.run.base import BaseRun
|
8
8
|
from polyaxon._flow.run.enums import V1RunKind
|
@@ -12,6 +12,39 @@ from polyaxon._flow.run.kubeflow.scheduling_policy import V1SchedulingPolicy
|
|
12
12
|
from polyaxon._flow.run.resources import V1RunResources
|
13
13
|
from polyaxon._flow.run.utils import DestinationImageMixin
|
14
14
|
from polyaxon._k8s.k8s_schemas import V1Container
|
15
|
+
from polyaxon._schemas.base import BaseSchemaModel
|
16
|
+
|
17
|
+
|
18
|
+
class V1PytorchElasticPolicy(BaseSchemaModel):
|
19
|
+
"""Elastic policy for Pytorch distributed runs.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
min_replicas: int, optional
|
23
|
+
max_replicas: int, optional
|
24
|
+
rdvz_backend: str, optional
|
25
|
+
rdvz_port: int, optional
|
26
|
+
rdvz_host: str, optional
|
27
|
+
rdvz_id: str, optional
|
28
|
+
rdvz_conf: List[Dict], optional
|
29
|
+
standalone: bool, optional
|
30
|
+
n_proc_per_node: int, optional
|
31
|
+
max_restarts: int, optional
|
32
|
+
metrics: List[Dict], optional
|
33
|
+
"""
|
34
|
+
|
35
|
+
_IDENTIFIER = "elasticPolicy"
|
36
|
+
|
37
|
+
min_replicas: Optional[IntOrRef] = Field(alias="minReplicas")
|
38
|
+
max_replicas: Optional[IntOrRef] = Field(alias="maxReplicas")
|
39
|
+
rdvz_backend: Optional[StrictStr] = Field(alias="rdvzBackend")
|
40
|
+
rdvz_port: Optional[IntOrRef] = Field(alias="rdvzPort")
|
41
|
+
rdvz_host: Optional[StrictStr] = Field(alias="rdvzHost")
|
42
|
+
rdvz_id: Optional[StrictStr] = Field(alias="rdvzId")
|
43
|
+
rdvz_conf: Optional[List[Dict]] = Field(alias="rdvzConf")
|
44
|
+
standalone: Optional[BoolOrRef]
|
45
|
+
n_proc_per_node: Optional[IntOrRef] = Field(alias="nProcPerNode")
|
46
|
+
max_restarts: Optional[IntOrRef] = Field(alias="maxRestarts")
|
47
|
+
metrics: Optional[List[Dict]] = Field(alias="Metrics")
|
15
48
|
|
16
49
|
|
17
50
|
class V1PytorchJob(BaseRun, DestinationImageMixin):
|
@@ -88,6 +121,18 @@ class V1PytorchJob(BaseRun, DestinationImageMixin):
|
|
88
121
|
>>> ...
|
89
122
|
```
|
90
123
|
|
124
|
+
### elasticPolicy
|
125
|
+
|
126
|
+
ElasticPolicy encapsulates various policies for elastic distributed training job.
|
127
|
+
|
128
|
+
```yaml
|
129
|
+
>>> run:
|
130
|
+
>>> kind: pytorchjob
|
131
|
+
>>> elasticPolicy:
|
132
|
+
>>> ...
|
133
|
+
>>> ...
|
134
|
+
```
|
135
|
+
|
91
136
|
### master
|
92
137
|
|
93
138
|
The master replica in the distributed PytorchJob
|
@@ -122,6 +167,8 @@ class V1PytorchJob(BaseRun, DestinationImageMixin):
|
|
122
167
|
kind: Literal[_IDENTIFIER] = _IDENTIFIER
|
123
168
|
clean_pod_policy: Optional[V1CleanPodPolicy] = Field(alias="cleanPodPolicy")
|
124
169
|
scheduling_policy: Optional[V1SchedulingPolicy] = Field(alias="schedulingPolicy")
|
170
|
+
elastic_policy: Optional[V1PytorchElasticPolicy] = Field(alias="elasticPolicy")
|
171
|
+
n_proc_per_node: Optional[IntOrRef] = Field(alias="nProcPerNode")
|
125
172
|
master: Optional[Union[V1KFReplica, RefField]]
|
126
173
|
worker: Optional[Union[V1KFReplica, RefField]]
|
127
174
|
|
@@ -12,11 +12,15 @@ class V1SchedulingPolicy(BaseSchemaModel):
|
|
12
12
|
Args:
|
13
13
|
min_available: int, optional
|
14
14
|
queue: str, optional
|
15
|
+
min_resources: int, optional
|
15
16
|
priority_class: str, optional
|
17
|
+
schedule_timeout_seconds: int, optional
|
16
18
|
"""
|
17
19
|
|
18
20
|
_IDENTIFIER = "schedulingPolicy"
|
19
21
|
|
20
22
|
min_available: Optional[IntOrRef] = Field(alias="minAvailable")
|
21
23
|
queue: Optional[StrictStr]
|
24
|
+
min_resources: Optional[IntOrRef] = Field(alias="minResources")
|
22
25
|
priority_class: Optional[StrictStr] = Field(alias="priorityClass")
|
26
|
+
schedule_timeout_seconds: Optional[IntOrRef] = Field(alias="scheduleTimeoutSeconds")
|
@@ -172,8 +172,9 @@ class V1TFJob(BaseRun, DestinationImageMixin):
|
|
172
172
|
|
173
173
|
kind: Literal[_IDENTIFIER] = _IDENTIFIER
|
174
174
|
clean_pod_policy: Optional[V1CleanPodPolicy] = Field(alias="cleanPodPolicy")
|
175
|
-
enable_dynamic_worker: Optional[bool] = Field(alias="enableDynamicWorker")
|
176
175
|
scheduling_policy: Optional[V1SchedulingPolicy] = Field(alias="schedulingPolicy")
|
176
|
+
enable_dynamic_worker: Optional[bool] = Field(alias="enableDynamicWorker")
|
177
|
+
success_policy: Optional[str] = Field(alias="successPolicy")
|
177
178
|
chief: Optional[Union[V1KFReplica, RefField]]
|
178
179
|
ps: Optional[Union[V1KFReplica, RefField]]
|
179
180
|
worker: Optional[Union[V1KFReplica, RefField]]
|
polyaxon/_fs/fs.py
CHANGED
@@ -72,6 +72,11 @@ def get_fs_from_name(connection_name: str, asynchronous: bool = False, **kwargs)
|
|
72
72
|
)
|
73
73
|
|
74
74
|
|
75
|
+
def get_sync_default_fs(**kwargs):
|
76
|
+
connection = get_artifacts_connection()
|
77
|
+
return get_sync_fs_from_connection(connection=connection, **kwargs)
|
78
|
+
|
79
|
+
|
75
80
|
async def get_default_fs(**kwargs):
|
76
81
|
connection = get_artifacts_connection()
|
77
82
|
return await get_async_fs_from_connection(connection=connection, **kwargs)
|
@@ -39,7 +39,7 @@ class JobConverter(JobMixin, BaseConverter):
|
|
39
39
|
default_sa=default_sa,
|
40
40
|
)
|
41
41
|
return get_job_custom_resource(
|
42
|
-
namespace=self.namespace,
|
42
|
+
namespace=compiled_operation.namespace or self.namespace,
|
43
43
|
main_container=replica_spec.main_container,
|
44
44
|
sidecar_containers=replica_spec.sidecar_containers,
|
45
45
|
init_containers=replica_spec.init_containers,
|
@@ -58,6 +58,7 @@ class PaddleJobConverter(PaddleJobMixin, BaseConverter):
|
|
58
58
|
termination=compiled_operation.termination,
|
59
59
|
clean_pod_policy=job.clean_pod_policy,
|
60
60
|
scheduling_policy=job.scheduling_policy,
|
61
|
+
elastic_policy=job.elastic_policy,
|
61
62
|
collect_logs=plugins.collect_logs,
|
62
63
|
sync_statuses=plugins.sync_statuses,
|
63
64
|
notifications=plugins.notifications,
|
@@ -58,6 +58,8 @@ class PytorchJobConverter(PytorchJobMixin, BaseConverter):
|
|
58
58
|
termination=compiled_operation.termination,
|
59
59
|
clean_pod_policy=job.clean_pod_policy,
|
60
60
|
scheduling_policy=job.scheduling_policy,
|
61
|
+
elastic_policy=job.elastic_policy,
|
62
|
+
n_proc_per_node=job.n_proc_per_node,
|
61
63
|
collect_logs=plugins.collect_logs,
|
62
64
|
sync_statuses=plugins.sync_statuses,
|
63
65
|
notifications=plugins.notifications,
|
@@ -63,6 +63,7 @@ class TfJobConverter(TFJobMixin, BaseConverter):
|
|
63
63
|
collect_logs=plugins.collect_logs,
|
64
64
|
clean_pod_policy=job.clean_pod_policy,
|
65
65
|
scheduling_policy=job.scheduling_policy,
|
66
|
+
success_policy=job.success_policy,
|
66
67
|
enable_dynamic_worker=job.enable_dynamic_worker,
|
67
68
|
sync_statuses=plugins.sync_statuses,
|
68
69
|
notifications=plugins.notifications,
|
@@ -1,6 +1,11 @@
|
|
1
1
|
from typing import Dict, List, Optional
|
2
2
|
|
3
|
-
from polyaxon._flow import
|
3
|
+
from polyaxon._flow import (
|
4
|
+
V1Notification,
|
5
|
+
V1PaddleElasticPolicy,
|
6
|
+
V1SchedulingPolicy,
|
7
|
+
V1Termination,
|
8
|
+
)
|
4
9
|
from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
|
5
10
|
from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
|
6
11
|
from polyaxon._k8s.custom_resources.setter import (
|
@@ -25,6 +30,7 @@ def get_paddle_job_custom_resource(
|
|
25
30
|
notifications: List[V1Notification],
|
26
31
|
clean_pod_policy: Optional[str],
|
27
32
|
scheduling_policy: Optional[V1SchedulingPolicy],
|
33
|
+
elastic_policy: Optional[V1PaddleElasticPolicy],
|
28
34
|
labels: Dict[str, str],
|
29
35
|
annotations: Dict[str, str],
|
30
36
|
) -> Dict:
|
@@ -58,6 +64,9 @@ def get_paddle_job_custom_resource(
|
|
58
64
|
template_spec=template_spec, scheduling_policy=scheduling_policy
|
59
65
|
)
|
60
66
|
|
67
|
+
if elastic_policy:
|
68
|
+
template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
|
69
|
+
|
61
70
|
custom_object = {"paddleJobSpec": template_spec}
|
62
71
|
custom_object = set_termination(
|
63
72
|
custom_object=custom_object, termination=termination
|
@@ -1,6 +1,11 @@
|
|
1
1
|
from typing import Dict, List, Optional
|
2
2
|
|
3
|
-
from polyaxon._flow import
|
3
|
+
from polyaxon._flow import (
|
4
|
+
V1Notification,
|
5
|
+
V1PytorchElasticPolicy,
|
6
|
+
V1SchedulingPolicy,
|
7
|
+
V1Termination,
|
8
|
+
)
|
4
9
|
from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
|
5
10
|
from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
|
6
11
|
from polyaxon._k8s.custom_resources.setter import (
|
@@ -25,6 +30,8 @@ def get_pytorch_job_custom_resource(
|
|
25
30
|
notifications: List[V1Notification],
|
26
31
|
clean_pod_policy: Optional[str],
|
27
32
|
scheduling_policy: Optional[V1SchedulingPolicy],
|
33
|
+
elastic_policy: Optional[V1PytorchElasticPolicy],
|
34
|
+
n_proc_per_node: Optional[int],
|
28
35
|
labels: Dict[str, str],
|
29
36
|
annotations: Dict[str, str],
|
30
37
|
) -> Dict:
|
@@ -58,6 +65,12 @@ def get_pytorch_job_custom_resource(
|
|
58
65
|
template_spec=template_spec, scheduling_policy=scheduling_policy
|
59
66
|
)
|
60
67
|
|
68
|
+
if elastic_policy:
|
69
|
+
template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
|
70
|
+
|
71
|
+
if n_proc_per_node is not None:
|
72
|
+
template_spec["nProcPerNode"] = str(n_proc_per_node)
|
73
|
+
|
61
74
|
custom_object = {"pytorchJobSpec": template_spec}
|
62
75
|
custom_object = set_termination(
|
63
76
|
custom_object=custom_object, termination=termination
|
@@ -28,6 +28,7 @@ def get_tf_job_custom_resource(
|
|
28
28
|
clean_pod_policy: Optional[str],
|
29
29
|
scheduling_policy: Optional[V1SchedulingPolicy],
|
30
30
|
enable_dynamic_worker: bool,
|
31
|
+
success_policy: Optional[str],
|
31
32
|
labels: Dict[str, str],
|
32
33
|
annotations: Dict[str, str],
|
33
34
|
) -> Dict:
|
@@ -73,6 +74,9 @@ def get_tf_job_custom_resource(
|
|
73
74
|
if enable_dynamic_worker:
|
74
75
|
template_spec["enableDynamicWorker"] = enable_dynamic_worker
|
75
76
|
|
77
|
+
if success_policy:
|
78
|
+
template_spec["successPolicy"] = success_policy
|
79
|
+
|
76
80
|
template_spec = {"replicaSpecs": template_spec}
|
77
81
|
|
78
82
|
template_spec = set_clean_pod_policy(
|
polyaxon/_k8s/executor/base.py
CHANGED
@@ -5,7 +5,7 @@ from kubernetes.client import Configuration
|
|
5
5
|
|
6
6
|
from polyaxon import settings
|
7
7
|
from polyaxon._k8s.converter.converters import CONVERTERS
|
8
|
-
from polyaxon._k8s.converter.mixins import MIXIN_MAPPING
|
8
|
+
from polyaxon._k8s.converter.mixins import MIXIN_MAPPING, BaseMixin
|
9
9
|
from polyaxon._runner.executor import BaseExecutor as _BaseExecutor
|
10
10
|
from polyaxon._runner.kinds import RunnerKind
|
11
11
|
from polyaxon._utils.fqn_utils import get_resource_name
|
@@ -56,7 +56,9 @@ class BaseExecutor(_BaseExecutor):
|
|
56
56
|
api = k8s_client.ApiClient()
|
57
57
|
return api.sanitize_for_serialization(resource)
|
58
58
|
|
59
|
-
def create(
|
59
|
+
def create(
|
60
|
+
self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
|
61
|
+
) -> Dict:
|
60
62
|
mixin = self._get_mixin_for_kind(kind=run_kind)
|
61
63
|
resource_name = get_resource_name(run_uuid)
|
62
64
|
return self.manager.create_custom_object(
|
@@ -65,9 +67,12 @@ class BaseExecutor(_BaseExecutor):
|
|
65
67
|
version=mixin.API_VERSION,
|
66
68
|
plural=mixin.PLURAL,
|
67
69
|
body=resource,
|
70
|
+
namespace=namespace,
|
68
71
|
)
|
69
72
|
|
70
|
-
def apply(
|
73
|
+
def apply(
|
74
|
+
self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
|
75
|
+
) -> Dict:
|
71
76
|
mixin = self._get_mixin_for_kind(kind=run_kind)
|
72
77
|
resource_name = get_resource_name(run_uuid)
|
73
78
|
return self.manager.update_custom_object(
|
@@ -76,9 +81,10 @@ class BaseExecutor(_BaseExecutor):
|
|
76
81
|
version=mixin.API_VERSION,
|
77
82
|
plural=mixin.PLURAL,
|
78
83
|
body=resource,
|
84
|
+
namespace=namespace,
|
79
85
|
)
|
80
86
|
|
81
|
-
def stop(self, run_uuid: str, run_kind: str):
|
87
|
+
def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
|
82
88
|
mixin = self._get_mixin_for_kind(kind=run_kind)
|
83
89
|
resource_name = get_resource_name(run_uuid)
|
84
90
|
return self.manager.delete_custom_object(
|
@@ -86,16 +92,18 @@ class BaseExecutor(_BaseExecutor):
|
|
86
92
|
group=mixin.GROUP,
|
87
93
|
version=mixin.API_VERSION,
|
88
94
|
plural=mixin.PLURAL,
|
95
|
+
namespace=namespace,
|
89
96
|
)
|
90
97
|
|
91
|
-
def clean(self, run_uuid: str, run_kind: str):
|
98
|
+
def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
|
92
99
|
return self.apply(
|
93
100
|
run_uuid=run_uuid,
|
94
101
|
run_kind=run_kind,
|
95
102
|
resource={"metadata": {"finalizers": None}},
|
103
|
+
namespace=namespace,
|
96
104
|
)
|
97
105
|
|
98
|
-
def get(self, run_uuid: str, run_kind: str):
|
106
|
+
def get(self, run_uuid: str, run_kind: str, namespace: str = None):
|
99
107
|
mixin = self._get_mixin_for_kind(kind=run_kind)
|
100
108
|
resource_name = get_resource_name(run_uuid)
|
101
109
|
return self.manager.get_custom_object(
|
@@ -103,4 +111,13 @@ class BaseExecutor(_BaseExecutor):
|
|
103
111
|
group=mixin.GROUP,
|
104
112
|
version=mixin.API_VERSION,
|
105
113
|
plural=mixin.PLURAL,
|
114
|
+
namespace=namespace,
|
115
|
+
)
|
116
|
+
|
117
|
+
def list_ops(self, namespace: str = None):
|
118
|
+
return self.manager.list_custom_objects(
|
119
|
+
group=BaseMixin.GROUP,
|
120
|
+
version=BaseMixin.API_VERSION,
|
121
|
+
plural=BaseMixin.PLURAL,
|
122
|
+
namespace=namespace,
|
106
123
|
)
|
@@ -86,6 +86,20 @@ async def query_k8s_operation_logs(
|
|
86
86
|
return logs, new_time
|
87
87
|
|
88
88
|
|
89
|
+
async def collect_agent_service_logs(
|
90
|
+
k8s_manager: AsyncK8sManager, pod: V1Pod
|
91
|
+
) -> List[V1Log]:
|
92
|
+
if not pod or not pod.spec.containers:
|
93
|
+
return []
|
94
|
+
container = pod.spec.containers[0]
|
95
|
+
return await handle_container_logs(
|
96
|
+
k8s_manager=k8s_manager,
|
97
|
+
pod=pod,
|
98
|
+
container_name=container.name,
|
99
|
+
tail_lines=V1Logs._CHUNK_SIZE * 3,
|
100
|
+
)
|
101
|
+
|
102
|
+
|
89
103
|
async def query_k8s_pod_logs(
|
90
104
|
k8s_manager: AsyncK8sManager,
|
91
105
|
pod: V1Pod,
|
@@ -107,7 +121,7 @@ async def query_k8s_pod_logs(
|
|
107
121
|
return logs, last_time
|
108
122
|
|
109
123
|
|
110
|
-
async def
|
124
|
+
async def get_op_pos_and_services(
|
111
125
|
k8s_manager: AsyncK8sManager,
|
112
126
|
run_uuid: str,
|
113
127
|
run_kind: str,
|
@@ -115,20 +129,67 @@ async def get_op_spec(
|
|
115
129
|
pods = await k8s_manager.list_pods(
|
116
130
|
label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
|
117
131
|
)
|
132
|
+
services = []
|
133
|
+
if V1RunKind.has_service(run_kind):
|
134
|
+
services = await k8s_manager.list_services(
|
135
|
+
label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
|
136
|
+
)
|
137
|
+
|
138
|
+
return pods, services
|
139
|
+
|
140
|
+
|
141
|
+
async def get_op_spec(
|
142
|
+
k8s_manager: AsyncK8sManager,
|
143
|
+
run_uuid: str,
|
144
|
+
run_kind: str,
|
145
|
+
):
|
146
|
+
pods, services = await get_op_pos_and_services(
|
147
|
+
k8s_manager=k8s_manager,
|
148
|
+
run_uuid=run_uuid,
|
149
|
+
run_kind=run_kind,
|
150
|
+
)
|
151
|
+
pods_list = {}
|
152
|
+
for pod in pods or []:
|
153
|
+
pods_list[
|
154
|
+
pod.metadata.name
|
155
|
+
] = k8s_manager.api_client.sanitize_for_serialization(pod)
|
156
|
+
services_list = {}
|
157
|
+
for service in services or []:
|
158
|
+
services_list[
|
159
|
+
service.metadata.name
|
160
|
+
] = k8s_manager.api_client.sanitize_for_serialization(service)
|
161
|
+
data = {"pods": pods_list, "services": services_list}
|
162
|
+
return data, pods, services
|
163
|
+
|
164
|
+
|
165
|
+
async def get_agent_pods_and_services(
|
166
|
+
k8s_manager: AsyncK8sManager,
|
167
|
+
):
|
168
|
+
pods = await k8s_manager.list_pods(
|
169
|
+
label_selector=k8s_manager.get_core_polyaxon(),
|
170
|
+
)
|
171
|
+
services = await k8s_manager.list_services(
|
172
|
+
label_selector=k8s_manager.get_core_polyaxon(),
|
173
|
+
)
|
174
|
+
return pods, services
|
175
|
+
|
176
|
+
|
177
|
+
async def get_agent_spec(
|
178
|
+
k8s_manager: AsyncK8sManager,
|
179
|
+
):
|
180
|
+
pods, services = await get_agent_pods_and_services(
|
181
|
+
k8s_manager=k8s_manager,
|
182
|
+
)
|
118
183
|
pods_list = {}
|
119
184
|
for pod in pods or []:
|
120
185
|
pods_list[
|
121
186
|
pod.metadata.name
|
122
187
|
] = k8s_manager.api_client.sanitize_for_serialization(pod)
|
123
188
|
data = {"pods": pods_list}
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
service.metadata.name
|
132
|
-
] = k8s_manager.api_client.sanitize_for_serialization(service)
|
133
|
-
data["services"] = services_list
|
134
|
-
return data
|
189
|
+
services_list = {}
|
190
|
+
for service in services or []:
|
191
|
+
services_list[
|
192
|
+
service.metadata.name
|
193
|
+
] = k8s_manager.api_client.sanitize_for_serialization(service)
|
194
|
+
data["services"] = services_list
|
195
|
+
return data, pods, services
|
@@ -38,7 +38,7 @@ class AsyncK8sManager(BaseK8sManager):
|
|
38
38
|
if self.api_client:
|
39
39
|
await self.api_client.close()
|
40
40
|
|
41
|
-
async def get_version(self, reraise=False):
|
41
|
+
async def get_version(self, reraise: bool = False):
|
42
42
|
try:
|
43
43
|
version = await self.k8s_version_api.get_code()
|
44
44
|
return version.to_dict()
|
@@ -47,48 +47,68 @@ class AsyncK8sManager(BaseK8sManager):
|
|
47
47
|
if reraise:
|
48
48
|
raise PolyaxonK8sError("Connection error: %s" % e) from e
|
49
49
|
|
50
|
-
async def get_pod(
|
50
|
+
async def get_pod(
|
51
|
+
self, name, reraise: bool = False, namespace: str = None
|
52
|
+
) -> Optional[client.V1Pod]:
|
51
53
|
try:
|
52
54
|
return await self.k8s_api.read_namespaced_pod( # type: ignore[attr-defined]
|
53
|
-
name=name, namespace=self.namespace
|
55
|
+
name=name, namespace=namespace or self.namespace
|
54
56
|
)
|
55
57
|
except ApiException as e:
|
56
58
|
if reraise:
|
57
59
|
raise PolyaxonK8sError("Connection error: %s" % e) from e
|
58
60
|
return None
|
59
61
|
|
60
|
-
async def is_pod_running(
|
61
|
-
|
62
|
+
async def is_pod_running(
|
63
|
+
self, pod_id: str, container_id: str, namespace: str = None
|
64
|
+
) -> bool:
|
65
|
+
event = await self.k8s_api.read_namespaced_pod_status(pod_id, namespace=namespace or self.namespace) # type: ignore[attr-defined]
|
62
66
|
return is_pod_running(event, container_id)
|
63
67
|
|
64
68
|
async def _list_namespace_resource(
|
65
|
-
self, resource_api, reraise=False, **kwargs
|
69
|
+
self, resource_api, reraise: bool = False, namespace: str = None, **kwargs
|
66
70
|
) -> List:
|
67
71
|
try:
|
68
|
-
res = await resource_api(namespace=self.namespace, **kwargs)
|
69
|
-
|
72
|
+
res = await resource_api(namespace=namespace or self.namespace, **kwargs)
|
73
|
+
if isinstance(res, dict):
|
74
|
+
items = res["items"]
|
75
|
+
else:
|
76
|
+
items = res.items
|
77
|
+
return [p for p in items]
|
70
78
|
except ApiException as e:
|
71
79
|
logger.error("K8S error: {}".format(e))
|
72
80
|
if reraise:
|
73
81
|
raise PolyaxonK8sError("Connection error: %s" % e) from e
|
74
82
|
return []
|
75
83
|
|
76
|
-
async def list_pods(
|
84
|
+
async def list_pods(
|
85
|
+
self, reraise: bool = False, namespace: str = None, **kwargs
|
86
|
+
) -> List[client.V1Pod]:
|
77
87
|
return await self._list_namespace_resource(
|
78
88
|
resource_api=self.k8s_api.list_namespaced_pod, # type: ignore[attr-defined]
|
79
89
|
reraise=reraise,
|
90
|
+
namespace=namespace,
|
80
91
|
**kwargs,
|
81
92
|
)
|
82
93
|
|
83
|
-
async def list_jobs(
|
94
|
+
async def list_jobs(
|
95
|
+
self, reraise: bool = False, namespace: str = None, **kwargs
|
96
|
+
) -> List[client.V1Job]:
|
84
97
|
return await self._list_namespace_resource(
|
85
98
|
resource_api=self.k8s_batch_api.list_namespaced_job, # type: ignore[attr-defined]
|
86
99
|
reraise=reraise,
|
100
|
+
namespace=namespace,
|
87
101
|
**kwargs,
|
88
102
|
)
|
89
103
|
|
90
104
|
async def list_custom_objects(
|
91
|
-
self,
|
105
|
+
self,
|
106
|
+
group,
|
107
|
+
version,
|
108
|
+
plural,
|
109
|
+
reraise: bool = False,
|
110
|
+
namespace: str = None,
|
111
|
+
**kwargs
|
92
112
|
) -> List:
|
93
113
|
return await self._list_namespace_resource(
|
94
114
|
resource_api=self.k8s_custom_object_api.list_namespaced_custom_object, # type: ignore[attr-defined]
|
@@ -96,49 +116,64 @@ class AsyncK8sManager(BaseK8sManager):
|
|
96
116
|
group=group,
|
97
117
|
version=version,
|
98
118
|
plural=plural,
|
119
|
+
namespace=namespace,
|
99
120
|
**kwargs,
|
100
121
|
)
|
101
122
|
|
102
123
|
async def list_services(
|
103
|
-
self, reraise: bool = False, **kwargs
|
124
|
+
self, reraise: bool = False, namespace: str = None, **kwargs
|
104
125
|
) -> List[client.V1Service]:
|
105
126
|
return await self._list_namespace_resource(
|
106
127
|
resource_api=self.k8s_api.list_namespaced_service, # type: ignore[attr-defined]
|
107
128
|
reraise=reraise,
|
129
|
+
namespace=namespace,
|
108
130
|
**kwargs,
|
109
131
|
)
|
110
132
|
|
111
133
|
async def list_deployments(
|
112
|
-
self, reraise: bool = False, **kwargs
|
134
|
+
self, reraise: bool = False, namespace: str = None, **kwargs
|
113
135
|
) -> List[client.V1Deployment]:
|
114
136
|
return await self._list_namespace_resource(
|
115
137
|
resource_api=self.k8s_apps_api.list_namespaced_deployment, # type: ignore[attr-defined]
|
116
138
|
reraise=reraise,
|
139
|
+
namespace=namespace,
|
117
140
|
**kwargs,
|
118
141
|
)
|
119
142
|
|
120
143
|
async def create_custom_object(
|
121
|
-
self,
|
144
|
+
self,
|
145
|
+
name: str,
|
146
|
+
group: str,
|
147
|
+
version: str,
|
148
|
+
plural: str,
|
149
|
+
body: Dict,
|
150
|
+
namespace: str = None,
|
122
151
|
) -> Dict:
|
123
152
|
resp = await self.k8s_custom_object_api.create_namespaced_custom_object( # type: ignore[attr-defined]
|
124
153
|
group=group,
|
125
154
|
version=version,
|
126
155
|
plural=plural,
|
127
|
-
namespace=self.namespace,
|
156
|
+
namespace=namespace or self.namespace,
|
128
157
|
body=body,
|
129
158
|
)
|
130
159
|
logger.debug("Custom object `{}` was created".format(name))
|
131
160
|
return resp
|
132
161
|
|
133
162
|
async def update_custom_object(
|
134
|
-
self,
|
163
|
+
self,
|
164
|
+
name: str,
|
165
|
+
group: str,
|
166
|
+
version: str,
|
167
|
+
plural: str,
|
168
|
+
body: Dict,
|
169
|
+
namespace: str = None,
|
135
170
|
) -> Dict:
|
136
171
|
resp = await self.k8s_custom_object_api.patch_namespaced_custom_object( # type: ignore[attr-defined]
|
137
172
|
name=name,
|
138
173
|
group=group,
|
139
174
|
version=version,
|
140
175
|
plural=plural,
|
141
|
-
namespace=self.namespace,
|
176
|
+
namespace=namespace or self.namespace,
|
142
177
|
body=body,
|
143
178
|
_content_type="application/merge-patch+json",
|
144
179
|
)
|
@@ -153,17 +188,28 @@ class AsyncK8sManager(BaseK8sManager):
|
|
153
188
|
plural: str,
|
154
189
|
body: Dict,
|
155
190
|
reraise: bool = False,
|
191
|
+
namespace: str = None,
|
156
192
|
) -> Tuple[Dict, bool]:
|
157
193
|
try:
|
158
194
|
create = await self.create_custom_object(
|
159
|
-
name=name,
|
195
|
+
name=name,
|
196
|
+
group=group,
|
197
|
+
version=version,
|
198
|
+
plural=plural,
|
199
|
+
body=body,
|
200
|
+
namespace=namespace,
|
160
201
|
)
|
161
202
|
return create, True
|
162
203
|
|
163
204
|
except ApiException as e_create:
|
164
205
|
try:
|
165
206
|
update = await self.update_custom_object(
|
166
|
-
name=name,
|
207
|
+
name=name,
|
208
|
+
group=group,
|
209
|
+
version=version,
|
210
|
+
plural=plural,
|
211
|
+
body=body,
|
212
|
+
namespace=namespace,
|
167
213
|
)
|
168
214
|
return update, False
|
169
215
|
except ApiException as e:
|
@@ -176,7 +222,13 @@ class AsyncK8sManager(BaseK8sManager):
|
|
176
222
|
return {}, False
|
177
223
|
|
178
224
|
async def get_custom_object(
|
179
|
-
self,
|
225
|
+
self,
|
226
|
+
name: str,
|
227
|
+
group: str,
|
228
|
+
version: str,
|
229
|
+
plural: str,
|
230
|
+
reraise: bool = False,
|
231
|
+
namespace: str = None,
|
180
232
|
) -> Optional[Dict]:
|
181
233
|
try:
|
182
234
|
return await self.k8s_custom_object_api.get_namespaced_custom_object(
|
@@ -184,7 +236,7 @@ class AsyncK8sManager(BaseK8sManager):
|
|
184
236
|
group=group,
|
185
237
|
version=version,
|
186
238
|
plural=plural,
|
187
|
-
namespace=self.namespace,
|
239
|
+
namespace=namespace or self.namespace,
|
188
240
|
)
|
189
241
|
except ApiException as e:
|
190
242
|
if reraise:
|
@@ -192,7 +244,13 @@ class AsyncK8sManager(BaseK8sManager):
|
|
192
244
|
return None
|
193
245
|
|
194
246
|
async def delete_custom_object(
|
195
|
-
self,
|
247
|
+
self,
|
248
|
+
name: str,
|
249
|
+
group: str,
|
250
|
+
version: str,
|
251
|
+
plural: str,
|
252
|
+
reraise: bool = False,
|
253
|
+
namespace: str = None,
|
196
254
|
):
|
197
255
|
try:
|
198
256
|
await self.k8s_custom_object_api.delete_namespaced_custom_object(
|
@@ -200,7 +258,7 @@ class AsyncK8sManager(BaseK8sManager):
|
|
200
258
|
group=group,
|
201
259
|
version=version,
|
202
260
|
plural=plural,
|
203
|
-
namespace=self.namespace,
|
261
|
+
namespace=namespace or self.namespace,
|
204
262
|
body=client.V1DeleteOptions(),
|
205
263
|
)
|
206
264
|
logger.debug("Custom object `{}` deleted".format(name))
|
polyaxon/_k8s/manager/base.py
CHANGED
@@ -28,6 +28,10 @@ class BaseK8sManager:
|
|
28
28
|
instance
|
29
29
|
)
|
30
30
|
|
31
|
+
@staticmethod
|
32
|
+
def get_core_polyaxon() -> str:
|
33
|
+
return "app.kubernetes.io/part-of=polyaxon-core"
|
34
|
+
|
31
35
|
@classmethod
|
32
36
|
def get_config_auth(cls, k8s_config: Optional[Configuration] = None) -> str:
|
33
37
|
if not k8s_config or not k8s_config.api_key:
|