polyaxon 2.0.6rc8__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. polyaxon/_cli/config.py +1 -1
  2. polyaxon/_cli/run.py +8 -0
  3. polyaxon/_cli/services/clean_artifacts.py +1 -1
  4. polyaxon/_client/client.py +17 -0
  5. polyaxon/_client/run.py +12 -0
  6. polyaxon/_compiler/resolver/agent.py +1 -1
  7. polyaxon/_compiler/resolver/runtime.py +1 -1
  8. polyaxon/_deploy/schemas/service.py +4 -0
  9. polyaxon/_docker/executor.py +10 -4
  10. polyaxon/_env_vars/getters/run.py +3 -0
  11. polyaxon/_env_vars/keys.py +5 -0
  12. polyaxon/_flow/__init__.py +2 -0
  13. polyaxon/_flow/builds/__init__.py +19 -6
  14. polyaxon/_flow/component/base.py +1 -0
  15. polyaxon/_flow/component/component.py +14 -0
  16. polyaxon/_flow/environment/__init__.py +5 -5
  17. polyaxon/_flow/hooks/__init__.py +19 -6
  18. polyaxon/_flow/matrix/tuner.py +18 -6
  19. polyaxon/_flow/operations/operation.py +19 -0
  20. polyaxon/_flow/run/__init__.py +2 -2
  21. polyaxon/_flow/run/kubeflow/paddle_job.py +34 -2
  22. polyaxon/_flow/run/kubeflow/pytorch_job.py +50 -3
  23. polyaxon/_flow/run/kubeflow/scheduling_policy.py +4 -0
  24. polyaxon/_flow/run/kubeflow/tf_job.py +2 -1
  25. polyaxon/_fs/fs.py +5 -0
  26. polyaxon/_k8s/converter/converters/job.py +1 -1
  27. polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
  28. polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
  29. polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
  30. polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
  31. polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
  32. polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
  33. polyaxon/_k8s/executor/base.py +23 -6
  34. polyaxon/_k8s/logging/async_monitor.py +73 -12
  35. polyaxon/_k8s/manager/async_manager.py +81 -23
  36. polyaxon/_k8s/manager/base.py +4 -0
  37. polyaxon/_k8s/manager/manager.py +266 -133
  38. polyaxon/_operations/tuner.py +1 -0
  39. polyaxon/_polyaxonfile/check.py +2 -0
  40. polyaxon/_polyaxonfile/manager/operations.py +3 -0
  41. polyaxon/_polyaxonfile/manager/workflows.py +2 -0
  42. polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
  43. polyaxon/_polyaxonfile/specs/operation.py +1 -0
  44. polyaxon/_polyaxonfile/specs/sections.py +3 -0
  45. polyaxon/_runner/agent/async_agent.py +94 -18
  46. polyaxon/_runner/agent/base_agent.py +25 -7
  47. polyaxon/_runner/agent/client.py +15 -1
  48. polyaxon/_runner/agent/sync_agent.py +83 -18
  49. polyaxon/_runner/executor.py +13 -7
  50. polyaxon/_schemas/agent.py +27 -1
  51. polyaxon/_schemas/client.py +30 -3
  52. polyaxon/_sdk/api/agents_v1_api.py +875 -51
  53. polyaxon/_sdk/api/service_accounts_v1_api.py +12 -12
  54. polyaxon/_sdk/schemas/__init__.py +3 -0
  55. polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
  56. polyaxon/_sidecar/container/__init__.py +1 -1
  57. polyaxon/_sidecar/container/monitors/spec.py +1 -1
  58. polyaxon/pkg.py +1 -1
  59. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/METADATA +6 -6
  60. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/RECORD +64 -63
  61. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/LICENSE +0 -0
  62. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/WHEEL +0 -0
  63. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/entry_points.txt +0 -0
  64. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
1
- from typing import Optional, Union
1
+ from typing import Dict, List, Optional, Union
2
2
  from typing_extensions import Literal
3
3
 
4
- from clipped.compact.pydantic import Field
5
- from clipped.types.ref_or_obj import RefField
4
+ from clipped.compact.pydantic import Field, StrictStr
5
+ from clipped.types.ref_or_obj import BoolOrRef, IntOrRef, RefField
6
6
 
7
7
  from polyaxon._flow.run.base import BaseRun
8
8
  from polyaxon._flow.run.enums import V1RunKind
@@ -12,6 +12,39 @@ from polyaxon._flow.run.kubeflow.scheduling_policy import V1SchedulingPolicy
12
12
  from polyaxon._flow.run.resources import V1RunResources
13
13
  from polyaxon._flow.run.utils import DestinationImageMixin
14
14
  from polyaxon._k8s.k8s_schemas import V1Container
15
+ from polyaxon._schemas.base import BaseSchemaModel
16
+
17
+
18
+ class V1PytorchElasticPolicy(BaseSchemaModel):
19
+ """Elastic policy for Pytorch distributed runs.
20
+
21
+ Args:
22
+ min_replicas: int, optional
23
+ max_replicas: int, optional
24
+ rdvz_backend: str, optional
25
+ rdvz_port: int, optional
26
+ rdvz_host: str, optional
27
+ rdvz_id: str, optional
28
+ rdvz_conf: List[Dict], optional
29
+ standalone: bool, optional
30
+ n_proc_per_node: int, optional
31
+ max_restarts: int, optional
32
+ metrics: List[Dict], optional
33
+ """
34
+
35
+ _IDENTIFIER = "elasticPolicy"
36
+
37
+ min_replicas: Optional[IntOrRef] = Field(alias="minReplicas")
38
+ max_replicas: Optional[IntOrRef] = Field(alias="maxReplicas")
39
+ rdvz_backend: Optional[StrictStr] = Field(alias="rdvzBackend")
40
+ rdvz_port: Optional[IntOrRef] = Field(alias="rdvzPort")
41
+ rdvz_host: Optional[StrictStr] = Field(alias="rdvzHost")
42
+ rdvz_id: Optional[StrictStr] = Field(alias="rdvzId")
43
+ rdvz_conf: Optional[List[Dict]] = Field(alias="rdvzConf")
44
+ standalone: Optional[BoolOrRef]
45
+ n_proc_per_node: Optional[IntOrRef] = Field(alias="nProcPerNode")
46
+ max_restarts: Optional[IntOrRef] = Field(alias="maxRestarts")
47
+ metrics: Optional[List[Dict]] = Field(alias="Metrics")
15
48
 
16
49
 
17
50
  class V1PytorchJob(BaseRun, DestinationImageMixin):
@@ -88,6 +121,18 @@ class V1PytorchJob(BaseRun, DestinationImageMixin):
88
121
  >>> ...
89
122
  ```
90
123
 
124
+ ### elasticPolicy
125
+
126
+ ElasticPolicy encapsulates various policies for elastic distributed training job.
127
+
128
+ ```yaml
129
+ >>> run:
130
+ >>> kind: pytorchjob
131
+ >>> elasticPolicy:
132
+ >>> ...
133
+ >>> ...
134
+ ```
135
+
91
136
  ### master
92
137
 
93
138
  The master replica in the distributed PytorchJob
@@ -122,6 +167,8 @@ class V1PytorchJob(BaseRun, DestinationImageMixin):
122
167
  kind: Literal[_IDENTIFIER] = _IDENTIFIER
123
168
  clean_pod_policy: Optional[V1CleanPodPolicy] = Field(alias="cleanPodPolicy")
124
169
  scheduling_policy: Optional[V1SchedulingPolicy] = Field(alias="schedulingPolicy")
170
+ elastic_policy: Optional[V1PytorchElasticPolicy] = Field(alias="elasticPolicy")
171
+ n_proc_per_node: Optional[IntOrRef] = Field(alias="nProcPerNode")
125
172
  master: Optional[Union[V1KFReplica, RefField]]
126
173
  worker: Optional[Union[V1KFReplica, RefField]]
127
174
 
@@ -12,11 +12,15 @@ class V1SchedulingPolicy(BaseSchemaModel):
12
12
  Args:
13
13
  min_available: int, optional
14
14
  queue: str, optional
15
+ min_resources: int, optional
15
16
  priority_class: str, optional
17
+ schedule_timeout_seconds: int, optional
16
18
  """
17
19
 
18
20
  _IDENTIFIER = "schedulingPolicy"
19
21
 
20
22
  min_available: Optional[IntOrRef] = Field(alias="minAvailable")
21
23
  queue: Optional[StrictStr]
24
+ min_resources: Optional[IntOrRef] = Field(alias="minResources")
22
25
  priority_class: Optional[StrictStr] = Field(alias="priorityClass")
26
+ schedule_timeout_seconds: Optional[IntOrRef] = Field(alias="scheduleTimeoutSeconds")
@@ -172,8 +172,9 @@ class V1TFJob(BaseRun, DestinationImageMixin):
172
172
 
173
173
  kind: Literal[_IDENTIFIER] = _IDENTIFIER
174
174
  clean_pod_policy: Optional[V1CleanPodPolicy] = Field(alias="cleanPodPolicy")
175
- enable_dynamic_worker: Optional[bool] = Field(alias="enableDynamicWorker")
176
175
  scheduling_policy: Optional[V1SchedulingPolicy] = Field(alias="schedulingPolicy")
176
+ enable_dynamic_worker: Optional[bool] = Field(alias="enableDynamicWorker")
177
+ success_policy: Optional[str] = Field(alias="successPolicy")
177
178
  chief: Optional[Union[V1KFReplica, RefField]]
178
179
  ps: Optional[Union[V1KFReplica, RefField]]
179
180
  worker: Optional[Union[V1KFReplica, RefField]]
polyaxon/_fs/fs.py CHANGED
@@ -72,6 +72,11 @@ def get_fs_from_name(connection_name: str, asynchronous: bool = False, **kwargs)
72
72
  )
73
73
 
74
74
 
75
+ def get_sync_default_fs(**kwargs):
76
+ connection = get_artifacts_connection()
77
+ return get_sync_fs_from_connection(connection=connection, **kwargs)
78
+
79
+
75
80
  async def get_default_fs(**kwargs):
76
81
  connection = get_artifacts_connection()
77
82
  return await get_async_fs_from_connection(connection=connection, **kwargs)
@@ -39,7 +39,7 @@ class JobConverter(JobMixin, BaseConverter):
39
39
  default_sa=default_sa,
40
40
  )
41
41
  return get_job_custom_resource(
42
- namespace=self.namespace,
42
+ namespace=compiled_operation.namespace or self.namespace,
43
43
  main_container=replica_spec.main_container,
44
44
  sidecar_containers=replica_spec.sidecar_containers,
45
45
  init_containers=replica_spec.init_containers,
@@ -58,6 +58,7 @@ class PaddleJobConverter(PaddleJobMixin, BaseConverter):
58
58
  termination=compiled_operation.termination,
59
59
  clean_pod_policy=job.clean_pod_policy,
60
60
  scheduling_policy=job.scheduling_policy,
61
+ elastic_policy=job.elastic_policy,
61
62
  collect_logs=plugins.collect_logs,
62
63
  sync_statuses=plugins.sync_statuses,
63
64
  notifications=plugins.notifications,
@@ -58,6 +58,8 @@ class PytorchJobConverter(PytorchJobMixin, BaseConverter):
58
58
  termination=compiled_operation.termination,
59
59
  clean_pod_policy=job.clean_pod_policy,
60
60
  scheduling_policy=job.scheduling_policy,
61
+ elastic_policy=job.elastic_policy,
62
+ n_proc_per_node=job.n_proc_per_node,
61
63
  collect_logs=plugins.collect_logs,
62
64
  sync_statuses=plugins.sync_statuses,
63
65
  notifications=plugins.notifications,
@@ -63,6 +63,7 @@ class TfJobConverter(TFJobMixin, BaseConverter):
63
63
  collect_logs=plugins.collect_logs,
64
64
  clean_pod_policy=job.clean_pod_policy,
65
65
  scheduling_policy=job.scheduling_policy,
66
+ success_policy=job.success_policy,
66
67
  enable_dynamic_worker=job.enable_dynamic_worker,
67
68
  sync_statuses=plugins.sync_statuses,
68
69
  notifications=plugins.notifications,
@@ -1,6 +1,11 @@
1
1
  from typing import Dict, List, Optional
2
2
 
3
- from polyaxon._flow import V1Notification, V1SchedulingPolicy, V1Termination
3
+ from polyaxon._flow import (
4
+ V1Notification,
5
+ V1PaddleElasticPolicy,
6
+ V1SchedulingPolicy,
7
+ V1Termination,
8
+ )
4
9
  from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
5
10
  from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
6
11
  from polyaxon._k8s.custom_resources.setter import (
@@ -25,6 +30,7 @@ def get_paddle_job_custom_resource(
25
30
  notifications: List[V1Notification],
26
31
  clean_pod_policy: Optional[str],
27
32
  scheduling_policy: Optional[V1SchedulingPolicy],
33
+ elastic_policy: Optional[V1PaddleElasticPolicy],
28
34
  labels: Dict[str, str],
29
35
  annotations: Dict[str, str],
30
36
  ) -> Dict:
@@ -58,6 +64,9 @@ def get_paddle_job_custom_resource(
58
64
  template_spec=template_spec, scheduling_policy=scheduling_policy
59
65
  )
60
66
 
67
+ if elastic_policy:
68
+ template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
69
+
61
70
  custom_object = {"paddleJobSpec": template_spec}
62
71
  custom_object = set_termination(
63
72
  custom_object=custom_object, termination=termination
@@ -1,6 +1,11 @@
1
1
  from typing import Dict, List, Optional
2
2
 
3
- from polyaxon._flow import V1Notification, V1SchedulingPolicy, V1Termination
3
+ from polyaxon._flow import (
4
+ V1Notification,
5
+ V1PytorchElasticPolicy,
6
+ V1SchedulingPolicy,
7
+ V1Termination,
8
+ )
4
9
  from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
5
10
  from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
6
11
  from polyaxon._k8s.custom_resources.setter import (
@@ -25,6 +30,8 @@ def get_pytorch_job_custom_resource(
25
30
  notifications: List[V1Notification],
26
31
  clean_pod_policy: Optional[str],
27
32
  scheduling_policy: Optional[V1SchedulingPolicy],
33
+ elastic_policy: Optional[V1PytorchElasticPolicy],
34
+ n_proc_per_node: Optional[int],
28
35
  labels: Dict[str, str],
29
36
  annotations: Dict[str, str],
30
37
  ) -> Dict:
@@ -58,6 +65,12 @@ def get_pytorch_job_custom_resource(
58
65
  template_spec=template_spec, scheduling_policy=scheduling_policy
59
66
  )
60
67
 
68
+ if elastic_policy:
69
+ template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
70
+
71
+ if n_proc_per_node is not None:
72
+ template_spec["nProcPerNode"] = str(n_proc_per_node)
73
+
61
74
  custom_object = {"pytorchJobSpec": template_spec}
62
75
  custom_object = set_termination(
63
76
  custom_object=custom_object, termination=termination
@@ -28,6 +28,7 @@ def get_tf_job_custom_resource(
28
28
  clean_pod_policy: Optional[str],
29
29
  scheduling_policy: Optional[V1SchedulingPolicy],
30
30
  enable_dynamic_worker: bool,
31
+ success_policy: Optional[str],
31
32
  labels: Dict[str, str],
32
33
  annotations: Dict[str, str],
33
34
  ) -> Dict:
@@ -73,6 +74,9 @@ def get_tf_job_custom_resource(
73
74
  if enable_dynamic_worker:
74
75
  template_spec["enableDynamicWorker"] = enable_dynamic_worker
75
76
 
77
+ if success_policy:
78
+ template_spec["successPolicy"] = success_policy
79
+
76
80
  template_spec = {"replicaSpecs": template_spec}
77
81
 
78
82
  template_spec = set_clean_pod_policy(
@@ -5,7 +5,7 @@ from kubernetes.client import Configuration
5
5
 
6
6
  from polyaxon import settings
7
7
  from polyaxon._k8s.converter.converters import CONVERTERS
8
- from polyaxon._k8s.converter.mixins import MIXIN_MAPPING
8
+ from polyaxon._k8s.converter.mixins import MIXIN_MAPPING, BaseMixin
9
9
  from polyaxon._runner.executor import BaseExecutor as _BaseExecutor
10
10
  from polyaxon._runner.kinds import RunnerKind
11
11
  from polyaxon._utils.fqn_utils import get_resource_name
@@ -56,7 +56,9 @@ class BaseExecutor(_BaseExecutor):
56
56
  api = k8s_client.ApiClient()
57
57
  return api.sanitize_for_serialization(resource)
58
58
 
59
- def create(self, run_uuid: str, run_kind: str, resource: Dict) -> Dict:
59
+ def create(
60
+ self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
61
+ ) -> Dict:
60
62
  mixin = self._get_mixin_for_kind(kind=run_kind)
61
63
  resource_name = get_resource_name(run_uuid)
62
64
  return self.manager.create_custom_object(
@@ -65,9 +67,12 @@ class BaseExecutor(_BaseExecutor):
65
67
  version=mixin.API_VERSION,
66
68
  plural=mixin.PLURAL,
67
69
  body=resource,
70
+ namespace=namespace,
68
71
  )
69
72
 
70
- def apply(self, run_uuid: str, run_kind: str, resource: Dict) -> Dict:
73
+ def apply(
74
+ self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
75
+ ) -> Dict:
71
76
  mixin = self._get_mixin_for_kind(kind=run_kind)
72
77
  resource_name = get_resource_name(run_uuid)
73
78
  return self.manager.update_custom_object(
@@ -76,9 +81,10 @@ class BaseExecutor(_BaseExecutor):
76
81
  version=mixin.API_VERSION,
77
82
  plural=mixin.PLURAL,
78
83
  body=resource,
84
+ namespace=namespace,
79
85
  )
80
86
 
81
- def stop(self, run_uuid: str, run_kind: str):
87
+ def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
82
88
  mixin = self._get_mixin_for_kind(kind=run_kind)
83
89
  resource_name = get_resource_name(run_uuid)
84
90
  return self.manager.delete_custom_object(
@@ -86,16 +92,18 @@ class BaseExecutor(_BaseExecutor):
86
92
  group=mixin.GROUP,
87
93
  version=mixin.API_VERSION,
88
94
  plural=mixin.PLURAL,
95
+ namespace=namespace,
89
96
  )
90
97
 
91
- def clean(self, run_uuid: str, run_kind: str):
98
+ def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
92
99
  return self.apply(
93
100
  run_uuid=run_uuid,
94
101
  run_kind=run_kind,
95
102
  resource={"metadata": {"finalizers": None}},
103
+ namespace=namespace,
96
104
  )
97
105
 
98
- def get(self, run_uuid: str, run_kind: str):
106
+ def get(self, run_uuid: str, run_kind: str, namespace: str = None):
99
107
  mixin = self._get_mixin_for_kind(kind=run_kind)
100
108
  resource_name = get_resource_name(run_uuid)
101
109
  return self.manager.get_custom_object(
@@ -103,4 +111,13 @@ class BaseExecutor(_BaseExecutor):
103
111
  group=mixin.GROUP,
104
112
  version=mixin.API_VERSION,
105
113
  plural=mixin.PLURAL,
114
+ namespace=namespace,
115
+ )
116
+
117
+ def list_ops(self, namespace: str = None):
118
+ return self.manager.list_custom_objects(
119
+ group=BaseMixin.GROUP,
120
+ version=BaseMixin.API_VERSION,
121
+ plural=BaseMixin.PLURAL,
122
+ namespace=namespace,
106
123
  )
@@ -86,6 +86,20 @@ async def query_k8s_operation_logs(
86
86
  return logs, new_time
87
87
 
88
88
 
89
+ async def collect_agent_service_logs(
90
+ k8s_manager: AsyncK8sManager, pod: V1Pod
91
+ ) -> List[V1Log]:
92
+ if not pod or not pod.spec.containers:
93
+ return []
94
+ container = pod.spec.containers[0]
95
+ return await handle_container_logs(
96
+ k8s_manager=k8s_manager,
97
+ pod=pod,
98
+ container_name=container.name,
99
+ tail_lines=V1Logs._CHUNK_SIZE * 3,
100
+ )
101
+
102
+
89
103
  async def query_k8s_pod_logs(
90
104
  k8s_manager: AsyncK8sManager,
91
105
  pod: V1Pod,
@@ -107,7 +121,7 @@ async def query_k8s_pod_logs(
107
121
  return logs, last_time
108
122
 
109
123
 
110
- async def get_op_spec(
124
+ async def get_op_pos_and_services(
111
125
  k8s_manager: AsyncK8sManager,
112
126
  run_uuid: str,
113
127
  run_kind: str,
@@ -115,20 +129,67 @@ async def get_op_spec(
115
129
  pods = await k8s_manager.list_pods(
116
130
  label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
117
131
  )
132
+ services = []
133
+ if V1RunKind.has_service(run_kind):
134
+ services = await k8s_manager.list_services(
135
+ label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
136
+ )
137
+
138
+ return pods, services
139
+
140
+
141
+ async def get_op_spec(
142
+ k8s_manager: AsyncK8sManager,
143
+ run_uuid: str,
144
+ run_kind: str,
145
+ ):
146
+ pods, services = await get_op_pos_and_services(
147
+ k8s_manager=k8s_manager,
148
+ run_uuid=run_uuid,
149
+ run_kind=run_kind,
150
+ )
151
+ pods_list = {}
152
+ for pod in pods or []:
153
+ pods_list[
154
+ pod.metadata.name
155
+ ] = k8s_manager.api_client.sanitize_for_serialization(pod)
156
+ services_list = {}
157
+ for service in services or []:
158
+ services_list[
159
+ service.metadata.name
160
+ ] = k8s_manager.api_client.sanitize_for_serialization(service)
161
+ data = {"pods": pods_list, "services": services_list}
162
+ return data, pods, services
163
+
164
+
165
+ async def get_agent_pods_and_services(
166
+ k8s_manager: AsyncK8sManager,
167
+ ):
168
+ pods = await k8s_manager.list_pods(
169
+ label_selector=k8s_manager.get_core_polyaxon(),
170
+ )
171
+ services = await k8s_manager.list_services(
172
+ label_selector=k8s_manager.get_core_polyaxon(),
173
+ )
174
+ return pods, services
175
+
176
+
177
+ async def get_agent_spec(
178
+ k8s_manager: AsyncK8sManager,
179
+ ):
180
+ pods, services = await get_agent_pods_and_services(
181
+ k8s_manager=k8s_manager,
182
+ )
118
183
  pods_list = {}
119
184
  for pod in pods or []:
120
185
  pods_list[
121
186
  pod.metadata.name
122
187
  ] = k8s_manager.api_client.sanitize_for_serialization(pod)
123
188
  data = {"pods": pods_list}
124
- if V1RunKind.has_service(run_kind):
125
- services = await k8s_manager.list_services(
126
- label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
127
- )
128
- services_list = {}
129
- for service in services or []:
130
- services_list[
131
- service.metadata.name
132
- ] = k8s_manager.api_client.sanitize_for_serialization(service)
133
- data["services"] = services_list
134
- return data
189
+ services_list = {}
190
+ for service in services or []:
191
+ services_list[
192
+ service.metadata.name
193
+ ] = k8s_manager.api_client.sanitize_for_serialization(service)
194
+ data["services"] = services_list
195
+ return data, pods, services
@@ -38,7 +38,7 @@ class AsyncK8sManager(BaseK8sManager):
38
38
  if self.api_client:
39
39
  await self.api_client.close()
40
40
 
41
- async def get_version(self, reraise=False):
41
+ async def get_version(self, reraise: bool = False):
42
42
  try:
43
43
  version = await self.k8s_version_api.get_code()
44
44
  return version.to_dict()
@@ -47,48 +47,68 @@ class AsyncK8sManager(BaseK8sManager):
47
47
  if reraise:
48
48
  raise PolyaxonK8sError("Connection error: %s" % e) from e
49
49
 
50
- async def get_pod(self, name, reraise=False) -> Optional[client.V1Pod]:
50
+ async def get_pod(
51
+ self, name, reraise: bool = False, namespace: str = None
52
+ ) -> Optional[client.V1Pod]:
51
53
  try:
52
54
  return await self.k8s_api.read_namespaced_pod( # type: ignore[attr-defined]
53
- name=name, namespace=self.namespace
55
+ name=name, namespace=namespace or self.namespace
54
56
  )
55
57
  except ApiException as e:
56
58
  if reraise:
57
59
  raise PolyaxonK8sError("Connection error: %s" % e) from e
58
60
  return None
59
61
 
60
- async def is_pod_running(self, pod_id: str, container_id: str) -> bool:
61
- event = await self.k8s_api.read_namespaced_pod_status(pod_id, self.namespace) # type: ignore[attr-defined]
62
+ async def is_pod_running(
63
+ self, pod_id: str, container_id: str, namespace: str = None
64
+ ) -> bool:
65
+ event = await self.k8s_api.read_namespaced_pod_status(pod_id, namespace=namespace or self.namespace) # type: ignore[attr-defined]
62
66
  return is_pod_running(event, container_id)
63
67
 
64
68
  async def _list_namespace_resource(
65
- self, resource_api, reraise=False, **kwargs
69
+ self, resource_api, reraise: bool = False, namespace: str = None, **kwargs
66
70
  ) -> List:
67
71
  try:
68
- res = await resource_api(namespace=self.namespace, **kwargs)
69
- return [p for p in res.items]
72
+ res = await resource_api(namespace=namespace or self.namespace, **kwargs)
73
+ if isinstance(res, dict):
74
+ items = res["items"]
75
+ else:
76
+ items = res.items
77
+ return [p for p in items]
70
78
  except ApiException as e:
71
79
  logger.error("K8S error: {}".format(e))
72
80
  if reraise:
73
81
  raise PolyaxonK8sError("Connection error: %s" % e) from e
74
82
  return []
75
83
 
76
- async def list_pods(self, reraise=False, **kwargs) -> List[client.V1Pod]:
84
+ async def list_pods(
85
+ self, reraise: bool = False, namespace: str = None, **kwargs
86
+ ) -> List[client.V1Pod]:
77
87
  return await self._list_namespace_resource(
78
88
  resource_api=self.k8s_api.list_namespaced_pod, # type: ignore[attr-defined]
79
89
  reraise=reraise,
90
+ namespace=namespace,
80
91
  **kwargs,
81
92
  )
82
93
 
83
- async def list_jobs(self, reraise=False, **kwargs) -> List[client.V1Job]:
94
+ async def list_jobs(
95
+ self, reraise: bool = False, namespace: str = None, **kwargs
96
+ ) -> List[client.V1Job]:
84
97
  return await self._list_namespace_resource(
85
98
  resource_api=self.k8s_batch_api.list_namespaced_job, # type: ignore[attr-defined]
86
99
  reraise=reraise,
100
+ namespace=namespace,
87
101
  **kwargs,
88
102
  )
89
103
 
90
104
  async def list_custom_objects(
91
- self, group, version, plural, reraise=False, **kwargs
105
+ self,
106
+ group,
107
+ version,
108
+ plural,
109
+ reraise: bool = False,
110
+ namespace: str = None,
111
+ **kwargs
92
112
  ) -> List:
93
113
  return await self._list_namespace_resource(
94
114
  resource_api=self.k8s_custom_object_api.list_namespaced_custom_object, # type: ignore[attr-defined]
@@ -96,49 +116,64 @@ class AsyncK8sManager(BaseK8sManager):
96
116
  group=group,
97
117
  version=version,
98
118
  plural=plural,
119
+ namespace=namespace,
99
120
  **kwargs,
100
121
  )
101
122
 
102
123
  async def list_services(
103
- self, reraise: bool = False, **kwargs
124
+ self, reraise: bool = False, namespace: str = None, **kwargs
104
125
  ) -> List[client.V1Service]:
105
126
  return await self._list_namespace_resource(
106
127
  resource_api=self.k8s_api.list_namespaced_service, # type: ignore[attr-defined]
107
128
  reraise=reraise,
129
+ namespace=namespace,
108
130
  **kwargs,
109
131
  )
110
132
 
111
133
  async def list_deployments(
112
- self, reraise: bool = False, **kwargs
134
+ self, reraise: bool = False, namespace: str = None, **kwargs
113
135
  ) -> List[client.V1Deployment]:
114
136
  return await self._list_namespace_resource(
115
137
  resource_api=self.k8s_apps_api.list_namespaced_deployment, # type: ignore[attr-defined]
116
138
  reraise=reraise,
139
+ namespace=namespace,
117
140
  **kwargs,
118
141
  )
119
142
 
120
143
  async def create_custom_object(
121
- self, name: str, group: str, version: str, plural: str, body: Dict
144
+ self,
145
+ name: str,
146
+ group: str,
147
+ version: str,
148
+ plural: str,
149
+ body: Dict,
150
+ namespace: str = None,
122
151
  ) -> Dict:
123
152
  resp = await self.k8s_custom_object_api.create_namespaced_custom_object( # type: ignore[attr-defined]
124
153
  group=group,
125
154
  version=version,
126
155
  plural=plural,
127
- namespace=self.namespace,
156
+ namespace=namespace or self.namespace,
128
157
  body=body,
129
158
  )
130
159
  logger.debug("Custom object `{}` was created".format(name))
131
160
  return resp
132
161
 
133
162
  async def update_custom_object(
134
- self, name: str, group: str, version: str, plural: str, body: Dict
163
+ self,
164
+ name: str,
165
+ group: str,
166
+ version: str,
167
+ plural: str,
168
+ body: Dict,
169
+ namespace: str = None,
135
170
  ) -> Dict:
136
171
  resp = await self.k8s_custom_object_api.patch_namespaced_custom_object( # type: ignore[attr-defined]
137
172
  name=name,
138
173
  group=group,
139
174
  version=version,
140
175
  plural=plural,
141
- namespace=self.namespace,
176
+ namespace=namespace or self.namespace,
142
177
  body=body,
143
178
  _content_type="application/merge-patch+json",
144
179
  )
@@ -153,17 +188,28 @@ class AsyncK8sManager(BaseK8sManager):
153
188
  plural: str,
154
189
  body: Dict,
155
190
  reraise: bool = False,
191
+ namespace: str = None,
156
192
  ) -> Tuple[Dict, bool]:
157
193
  try:
158
194
  create = await self.create_custom_object(
159
- name=name, group=group, version=version, plural=plural, body=body
195
+ name=name,
196
+ group=group,
197
+ version=version,
198
+ plural=plural,
199
+ body=body,
200
+ namespace=namespace,
160
201
  )
161
202
  return create, True
162
203
 
163
204
  except ApiException as e_create:
164
205
  try:
165
206
  update = await self.update_custom_object(
166
- name=name, group=group, version=version, plural=plural, body=body
207
+ name=name,
208
+ group=group,
209
+ version=version,
210
+ plural=plural,
211
+ body=body,
212
+ namespace=namespace,
167
213
  )
168
214
  return update, False
169
215
  except ApiException as e:
@@ -176,7 +222,13 @@ class AsyncK8sManager(BaseK8sManager):
176
222
  return {}, False
177
223
 
178
224
  async def get_custom_object(
179
- self, name: str, group: str, version: str, plural: str, reraise: bool = False
225
+ self,
226
+ name: str,
227
+ group: str,
228
+ version: str,
229
+ plural: str,
230
+ reraise: bool = False,
231
+ namespace: str = None,
180
232
  ) -> Optional[Dict]:
181
233
  try:
182
234
  return await self.k8s_custom_object_api.get_namespaced_custom_object(
@@ -184,7 +236,7 @@ class AsyncK8sManager(BaseK8sManager):
184
236
  group=group,
185
237
  version=version,
186
238
  plural=plural,
187
- namespace=self.namespace,
239
+ namespace=namespace or self.namespace,
188
240
  )
189
241
  except ApiException as e:
190
242
  if reraise:
@@ -192,7 +244,13 @@ class AsyncK8sManager(BaseK8sManager):
192
244
  return None
193
245
 
194
246
  async def delete_custom_object(
195
- self, name: str, group: str, version: str, plural: str, reraise: bool = False
247
+ self,
248
+ name: str,
249
+ group: str,
250
+ version: str,
251
+ plural: str,
252
+ reraise: bool = False,
253
+ namespace: str = None,
196
254
  ):
197
255
  try:
198
256
  await self.k8s_custom_object_api.delete_namespaced_custom_object(
@@ -200,7 +258,7 @@ class AsyncK8sManager(BaseK8sManager):
200
258
  group=group,
201
259
  version=version,
202
260
  plural=plural,
203
- namespace=self.namespace,
261
+ namespace=namespace or self.namespace,
204
262
  body=client.V1DeleteOptions(),
205
263
  )
206
264
  logger.debug("Custom object `{}` deleted".format(name))
@@ -28,6 +28,10 @@ class BaseK8sManager:
28
28
  instance
29
29
  )
30
30
 
31
+ @staticmethod
32
+ def get_core_polyaxon() -> str:
33
+ return "app.kubernetes.io/part-of=polyaxon-core"
34
+
31
35
  @classmethod
32
36
  def get_config_auth(cls, k8s_config: Optional[Configuration] = None) -> str:
33
37
  if not k8s_config or not k8s_config.api_key: