polyaxon 2.0.0rc49__py3-none-any.whl → 2.4.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polyaxon/_auxiliaries/cleaner.py +8 -3
- polyaxon/_auxiliaries/init.py +7 -2
- polyaxon/_auxiliaries/notifier.py +8 -2
- polyaxon/_auxiliaries/sidecar.py +30 -2
- polyaxon/_cli/artifacts.py +96 -11
- polyaxon/_cli/components.py +96 -11
- polyaxon/_cli/config.py +118 -22
- polyaxon/_cli/dashboard.py +15 -2
- polyaxon/_cli/init.py +1 -1
- polyaxon/_cli/models.py +96 -11
- polyaxon/_cli/operations.py +267 -90
- polyaxon/_cli/project_versions.py +139 -6
- polyaxon/_cli/projects.py +23 -9
- polyaxon/_cli/run.py +37 -9
- polyaxon/_cli/services/agent.py +2 -2
- polyaxon/_cli/services/clean_artifacts.py +1 -1
- polyaxon/_cli/services/sidecar.py +8 -1
- polyaxon/_client/client.py +17 -0
- polyaxon/_client/mixin.py +39 -0
- polyaxon/_client/project.py +218 -23
- polyaxon/_client/run.py +131 -33
- polyaxon/_compiler/contexts/contexts.py +2 -2
- polyaxon/_compiler/contexts/ray_job.py +4 -2
- polyaxon/_compiler/resolver/agent.py +12 -2
- polyaxon/_compiler/resolver/runtime.py +2 -2
- polyaxon/_contexts/paths.py +4 -7
- polyaxon/_deploy/operators/compose.py +1 -27
- polyaxon/_deploy/schemas/deployment.py +4 -1
- polyaxon/_deploy/schemas/intervals.py +0 -7
- polyaxon/_deploy/schemas/proxy.py +1 -0
- polyaxon/_deploy/schemas/service.py +11 -1
- polyaxon/_docker/converter/base/base.py +8 -0
- polyaxon/_docker/executor.py +10 -4
- polyaxon/_env_vars/getters/owner_entity.py +4 -2
- polyaxon/_env_vars/getters/project.py +4 -2
- polyaxon/_env_vars/getters/run.py +5 -2
- polyaxon/_env_vars/keys.py +7 -1
- polyaxon/_flow/__init__.py +2 -0
- polyaxon/_flow/builds/__init__.py +19 -6
- polyaxon/_flow/component/base.py +1 -0
- polyaxon/_flow/component/component.py +14 -0
- polyaxon/_flow/environment/__init__.py +8 -8
- polyaxon/_flow/hooks/__init__.py +19 -6
- polyaxon/_flow/init/__init__.py +6 -6
- polyaxon/_flow/matrix/iterative.py +0 -1
- polyaxon/_flow/matrix/tuner.py +18 -6
- polyaxon/_flow/operations/operation.py +44 -17
- polyaxon/_flow/plugins/__init__.py +6 -0
- polyaxon/_flow/run/__init__.py +2 -2
- polyaxon/_flow/run/dag.py +2 -2
- polyaxon/_flow/run/dask/dask.py +0 -1
- polyaxon/_flow/run/dask/replica.py +3 -3
- polyaxon/_flow/run/enums.py +5 -0
- polyaxon/_flow/run/job.py +4 -4
- polyaxon/_flow/run/kubeflow/mpi_job.py +1 -2
- polyaxon/_flow/run/kubeflow/mx_job.py +1 -2
- polyaxon/_flow/run/kubeflow/paddle_job.py +35 -4
- polyaxon/_flow/run/kubeflow/pytorch_job.py +51 -5
- polyaxon/_flow/run/kubeflow/replica.py +4 -4
- polyaxon/_flow/run/kubeflow/scheduling_policy.py +12 -0
- polyaxon/_flow/run/kubeflow/tf_job.py +3 -3
- polyaxon/_flow/run/kubeflow/xgboost_job.py +1 -2
- polyaxon/_flow/run/ray/ray.py +2 -3
- polyaxon/_flow/run/ray/replica.py +3 -3
- polyaxon/_flow/run/service.py +4 -4
- polyaxon/_fs/fs.py +7 -2
- polyaxon/_fs/utils.py +3 -2
- polyaxon/_k8s/converter/base/base.py +2 -1
- polyaxon/_k8s/converter/base/main.py +1 -0
- polyaxon/_k8s/converter/base/sidecar.py +16 -1
- polyaxon/_k8s/converter/common/accelerators.py +7 -4
- polyaxon/_k8s/converter/converters/job.py +1 -1
- polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
- polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
- polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
- polyaxon/_k8s/converter/converters/ray_job.py +4 -2
- polyaxon/_k8s/custom_resources/dask_job.py +3 -0
- polyaxon/_k8s/custom_resources/kubeflow/common.py +4 -1
- polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
- polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
- polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
- polyaxon/_k8s/custom_resources/ray_job.py +3 -0
- polyaxon/_k8s/custom_resources/setter.py +1 -1
- polyaxon/_k8s/executor/async_executor.py +2 -0
- polyaxon/_k8s/executor/base.py +23 -6
- polyaxon/_k8s/logging/async_monitor.py +150 -5
- polyaxon/_k8s/manager/async_manager.py +96 -23
- polyaxon/_k8s/manager/base.py +4 -0
- polyaxon/_k8s/manager/manager.py +282 -134
- polyaxon/_local_process/__init__.py +0 -0
- polyaxon/_local_process/agent.py +6 -0
- polyaxon/_local_process/converter/__init__.py +1 -0
- polyaxon/_local_process/converter/base/__init__.py +1 -0
- polyaxon/_local_process/converter/base/base.py +140 -0
- polyaxon/_local_process/converter/base/containers.py +69 -0
- polyaxon/_local_process/converter/base/env_vars.py +253 -0
- polyaxon/_local_process/converter/base/init.py +414 -0
- polyaxon/_local_process/converter/base/main.py +74 -0
- polyaxon/_local_process/converter/base/mounts.py +82 -0
- polyaxon/_local_process/converter/converters/__init__.py +8 -0
- polyaxon/_local_process/converter/converters/job.py +40 -0
- polyaxon/_local_process/converter/converters/service.py +41 -0
- polyaxon/_local_process/converter/mixins.py +38 -0
- polyaxon/_local_process/executor.py +132 -0
- polyaxon/_local_process/process_types.py +39 -0
- polyaxon/_managers/agent.py +2 -0
- polyaxon/_managers/home.py +2 -1
- polyaxon/_operations/tuner.py +1 -0
- polyaxon/_polyaxonfile/check.py +2 -0
- polyaxon/_polyaxonfile/manager/operations.py +3 -0
- polyaxon/_polyaxonfile/manager/workflows.py +2 -0
- polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
- polyaxon/_polyaxonfile/specs/operation.py +1 -0
- polyaxon/_polyaxonfile/specs/sections.py +3 -0
- polyaxon/_pql/manager.py +1 -1
- polyaxon/_runner/agent/async_agent.py +97 -21
- polyaxon/_runner/agent/base_agent.py +27 -9
- polyaxon/_runner/agent/client.py +15 -1
- polyaxon/_runner/agent/sync_agent.py +85 -20
- polyaxon/_runner/converter/converter.py +6 -2
- polyaxon/_runner/executor.py +13 -7
- polyaxon/_schemas/agent.py +27 -1
- polyaxon/_schemas/client.py +30 -3
- polyaxon/_schemas/installation.py +4 -3
- polyaxon/_schemas/lifecycle.py +10 -5
- polyaxon/_schemas/log_handler.py +2 -3
- polyaxon/_schemas/types/artifacts.py +3 -3
- polyaxon/_schemas/types/dockerfile.py +3 -3
- polyaxon/_schemas/types/file.py +3 -3
- polyaxon/_schemas/types/git.py +3 -3
- polyaxon/_schemas/types/tensorboard.py +3 -3
- polyaxon/_sdk/api/agents_v1_api.py +1076 -73
- polyaxon/_sdk/api/organizations_v1_api.py +371 -10
- polyaxon/_sdk/api/project_dashboards_v1_api.py +12 -12
- polyaxon/_sdk/api/project_searches_v1_api.py +12 -12
- polyaxon/_sdk/api/projects_v1_api.py +221 -44
- polyaxon/_sdk/api/runs_v1_api.py +917 -445
- polyaxon/_sdk/api/service_accounts_v1_api.py +16 -16
- polyaxon/_sdk/api/teams_v1_api.py +2827 -375
- polyaxon/_sdk/api/users_v1_api.py +231 -55
- polyaxon/_sdk/async_client/api_client.py +4 -0
- polyaxon/_sdk/schemas/__init__.py +10 -2
- polyaxon/_sdk/schemas/v1_agent.py +2 -1
- polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
- polyaxon/_sdk/schemas/v1_artifact_tree.py +1 -1
- polyaxon/_sdk/schemas/v1_dashboard_spec.py +4 -0
- polyaxon/_sdk/schemas/v1_events_response.py +4 -0
- polyaxon/_sdk/schemas/v1_organization.py +1 -0
- polyaxon/_sdk/schemas/v1_preset.py +8 -0
- polyaxon/_sdk/schemas/v1_project.py +1 -0
- polyaxon/_sdk/schemas/v1_project_settings.py +4 -2
- polyaxon/_sdk/schemas/v1_run.py +2 -2
- polyaxon/_sdk/schemas/v1_run_edge_lineage.py +14 -0
- polyaxon/_sdk/schemas/v1_run_edges_graph.py +9 -0
- polyaxon/_sdk/schemas/v1_section_spec.py +7 -2
- polyaxon/_sdk/schemas/v1_settings_catalog.py +1 -0
- polyaxon/_sdk/schemas/v1_team.py +3 -0
- polyaxon/_sdk/schemas/v1_user.py +1 -2
- polyaxon/_sdk/schemas/v1_user_access.py +17 -0
- polyaxon/_services/values.py +1 -0
- polyaxon/_sidecar/container/__init__.py +39 -18
- polyaxon/_sidecar/container/monitors/__init__.py +1 -0
- polyaxon/_sidecar/container/monitors/logs.py +10 -13
- polyaxon/_sidecar/container/monitors/spec.py +24 -0
- polyaxon/_sidecar/ignore.py +0 -1
- polyaxon/_utils/fqn_utils.py +25 -2
- polyaxon/client.py +1 -1
- polyaxon/pkg.py +1 -1
- polyaxon/schemas.py +8 -1
- polyaxon/settings.py +6 -0
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/METADATA +43 -43
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/RECORD +176 -155
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/WHEEL +1 -1
- polyaxon/_sdk/schemas/v1_project_user_access.py +0 -10
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/LICENSE +0 -0
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/entry_points.txt +0 -0
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/top_level.txt +0 -0
@@ -33,15 +33,26 @@ class SidecarConverter(_BaseConverter):
 
     @staticmethod
     def _get_sidecar_args(
-        container_id: str, sleep_interval: int, sync_interval: int, monitor_logs: bool
+        container_id: str,
+        sleep_interval: int,
+        sync_interval: int,
+        monitor_logs: bool,
+        monitor_spec: bool,
     ) -> List[str]:
         args = [
             "--container-id={}".format(container_id),
             "--sleep-interval={}".format(sleep_interval),
             "--sync-interval={}".format(sync_interval),
         ]
+        # enable monitor logs and spec by default
+        if monitor_logs is None:
+            monitor_logs = True
+        if monitor_spec is None:
+            monitor_spec = True
         if monitor_logs:
             args.append("--monitor-logs")
+        if monitor_spec:
+            args.append("--monitor-spec")
         return args
 
     @classmethod

@@ -87,6 +98,7 @@ class SidecarConverter(_BaseConverter):
         sleep_interval = polyaxon_sidecar.sleep_interval
         sync_interval = polyaxon_sidecar.sync_interval
         monitor_logs = polyaxon_sidecar.monitor_logs
+        monitor_spec = polyaxon_sidecar.monitor_spec
         if plugins and plugins.sidecar:
             if plugins.sidecar.sleep_interval:
                 sleep_interval = plugins.sidecar.sleep_interval

@@ -94,11 +106,14 @@ class SidecarConverter(_BaseConverter):
                 sync_interval = plugins.sidecar.sync_interval
             if plugins.sidecar.monitor_logs:
                 monitor_logs = plugins.sidecar.monitor_logs
+            if plugins.sidecar.monitor_spec:
+                monitor_spec = plugins.sidecar.monitor_spec
         sidecar_args = cls._get_sidecar_args(
             container_id=container_id,
             sleep_interval=sleep_interval,
             sync_interval=sync_interval,
             monitor_logs=monitor_logs,
+            monitor_spec=monitor_spec,
         )
 
         env_from = []
@@ -37,14 +37,17 @@ def requests_gpu(resources: k8s_schemas.V1ResourceRequirements) -> bool:
     if not resources:
         return False
 
+    if not isinstance(resources, k8s_schemas.V1ResourceRequirements):
+        resources = k8s_schemas.V1ResourceRequirements(**resources)
+
     if resources.requests:
-        for key in resources.requests.keys():
-            if "gpu" in key:
+        for key, val in resources.requests.items():
+            if "gpu" in key and val is not None and val > 0:
                 return True
 
     if resources.limits:
-        for key in resources.limits.keys():
-            if "gpu" in key:
+        for key, val in resources.limits.items():
+            if "gpu" in key and val is not None and val > 0:
                 return True
 
     return False
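The practical effect of the `requests_gpu` change: a GPU key whose value is zero or unset no longer counts as a GPU request. A simplified, standalone sketch over plain dicts (the packaged helper also accepts `k8s_schemas.V1ResourceRequirements` objects; names here are illustrative):

```python
# Simplified standalone sketch of the updated GPU-request check.
def requests_gpu(resources: dict) -> bool:
    for section in ("requests", "limits"):
        for key, val in (resources.get(section) or {}).items():
            # only a non-zero value counts as a GPU request now
            if "gpu" in key and val is not None and float(val) > 0:
                return True
    return False

print(requests_gpu({"limits": {"nvidia.com/gpu": 1}}))   # True
print(requests_gpu({"limits": {"nvidia.com/gpu": 0}}))   # False (was True before this change)
print(requests_gpu({"requests": {"cpu": "500m"}}))       # False
```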
@@ -39,7 +39,7 @@ class JobConverter(JobMixin, BaseConverter):
             default_sa=default_sa,
         )
         return get_job_custom_resource(
-            namespace=self.namespace,
+            namespace=compiled_operation.namespace or self.namespace,
             main_container=replica_spec.main_container,
             sidecar_containers=replica_spec.sidecar_containers,
             init_containers=replica_spec.init_containers,
@@ -58,6 +58,7 @@ class PaddleJobConverter(PaddleJobMixin, BaseConverter):
             termination=compiled_operation.termination,
             clean_pod_policy=job.clean_pod_policy,
             scheduling_policy=job.scheduling_policy,
+            elastic_policy=job.elastic_policy,
             collect_logs=plugins.collect_logs,
             sync_statuses=plugins.sync_statuses,
             notifications=plugins.notifications,
@@ -58,6 +58,8 @@ class PytorchJobConverter(PytorchJobMixin, BaseConverter):
             termination=compiled_operation.termination,
             clean_pod_policy=job.clean_pod_policy,
             scheduling_policy=job.scheduling_policy,
+            elastic_policy=job.elastic_policy,
+            n_proc_per_node=job.n_proc_per_node,
             collect_logs=plugins.collect_logs,
             sync_statuses=plugins.sync_statuses,
             notifications=plugins.notifications,
@@ -63,6 +63,7 @@ class TfJobConverter(TFJobMixin, BaseConverter):
             collect_logs=plugins.collect_logs,
             clean_pod_policy=job.clean_pod_policy,
             scheduling_policy=job.scheduling_policy,
+            success_policy=job.success_policy,
             enable_dynamic_worker=job.enable_dynamic_worker,
             sync_statuses=plugins.sync_statuses,
             notifications=plugins.notifications,
@@ -58,7 +58,9 @@ class RayJobConverter(RayJobMixin, BaseConverter):
             config=compiled_operation.plugins, auth=default_auth
         )
         head = _get_replica(job.head)
-        workers =
+        workers = None
+        if job.workers:
+            workers = {n: _get_replica(w) for n, w in job.workers.items()}
         labels = self.get_labels(version=pkg.VERSION, labels={})
 
         return get_ray_job_custom_resource(

@@ -68,7 +70,7 @@ class RayJobConverter(RayJobMixin, BaseConverter):
             workers=workers,
             entrypoint=job.entrypoint,
             metadata=job.metadata,
-            runtime_env=
+            runtime_env=orjson_dumps(job.runtime_env),
             ray_version=job.ray_version,
             termination=compiled_operation.termination,
             collect_logs=plugins.collect_logs,
@@ -46,6 +46,9 @@ def get_dask_replicas_template(
     if liveness_probe and replica.main_container.liveness_probe is None:
         replica.main_container.liveness_probe = liveness_probe
 
+    labels = {**labels, **replica.labels}
+    annotations = {**annotations, **replica.annotations}
+
     metadata, pod_spec = get_pod_spec(
         namespace=namespace,
         main_container=replica.main_container,
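The same two-line merge appears in the Dask template above and again in the Kubeflow and Ray replica templates below: replica-level labels and annotations are layered on top of the operation-level ones, with replica values winning on conflicts. A minimal illustration (keys and values here are made up):

```python
# {**base, **override}: right-hand keys take precedence on conflicts.
labels = {"app.kubernetes.io/managed-by": "polyaxon", "tier": "worker"}
replica_labels = {"tier": "gpu-worker"}

merged = {**labels, **replica_labels}
print(merged)  # {'app.kubernetes.io/managed-by': 'polyaxon', 'tier': 'gpu-worker'}
```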
@@ -16,6 +16,9 @@ def get_kf_replicas_template(
     if not replica:
         return
 
+    labels = {**labels, **replica.labels}
+    annotations = {**annotations, **replica.annotations}
+
     metadata, pod_spec = get_pod_spec(
         namespace=namespace,
         main_container=replica.main_container,

@@ -30,6 +33,6 @@ def get_kf_replicas_template(
 
     template_spec[replica_name] = {
         "replicas": replica.num_replicas,
-        "restartPolicy": pod_spec.restart_policy or "
+        "restartPolicy": pod_spec.restart_policy or "OnFailure",
         "template": get_pod_template_spec(metadata=metadata, pod_spec=pod_spec),
     }
@@ -1,6 +1,11 @@
 from typing import Dict, List, Optional
 
-from polyaxon._flow import V1Notification, V1SchedulingPolicy, V1Termination
+from polyaxon._flow import (
+    V1Notification,
+    V1PaddleElasticPolicy,
+    V1SchedulingPolicy,
+    V1Termination,
+)
 from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
 from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
 from polyaxon._k8s.custom_resources.setter import (

@@ -25,6 +30,7 @@ def get_paddle_job_custom_resource(
     notifications: List[V1Notification],
     clean_pod_policy: Optional[str],
     scheduling_policy: Optional[V1SchedulingPolicy],
+    elastic_policy: Optional[V1PaddleElasticPolicy],
     labels: Dict[str, str],
     annotations: Dict[str, str],
 ) -> Dict:

@@ -58,6 +64,9 @@ def get_paddle_job_custom_resource(
         template_spec=template_spec, scheduling_policy=scheduling_policy
     )
 
+    if elastic_policy:
+        template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
+
     custom_object = {"paddleJobSpec": template_spec}
     custom_object = set_termination(
         custom_object=custom_object, termination=termination
@@ -1,6 +1,11 @@
 from typing import Dict, List, Optional
 
-from polyaxon._flow import V1Notification, V1SchedulingPolicy, V1Termination
+from polyaxon._flow import (
+    V1Notification,
+    V1PytorchElasticPolicy,
+    V1SchedulingPolicy,
+    V1Termination,
+)
 from polyaxon._k8s.custom_resources.kubeflow.common import get_kf_replicas_template
 from polyaxon._k8s.custom_resources.operation import get_operation_custom_object
 from polyaxon._k8s.custom_resources.setter import (

@@ -25,6 +30,8 @@ def get_pytorch_job_custom_resource(
     notifications: List[V1Notification],
     clean_pod_policy: Optional[str],
     scheduling_policy: Optional[V1SchedulingPolicy],
+    elastic_policy: Optional[V1PytorchElasticPolicy],
+    n_proc_per_node: Optional[int],
     labels: Dict[str, str],
     annotations: Dict[str, str],
 ) -> Dict:

@@ -58,6 +65,12 @@ def get_pytorch_job_custom_resource(
         template_spec=template_spec, scheduling_policy=scheduling_policy
     )
 
+    if elastic_policy:
+        template_spec["elasticPolicy"] = elastic_policy.to_light_dict()
+
+    if n_proc_per_node is not None:
+        template_spec["nProcPerNode"] = str(n_proc_per_node)
+
     custom_object = {"pytorchJobSpec": template_spec}
     custom_object = set_termination(
         custom_object=custom_object, termination=termination
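For context, a hedged sketch of how the two new PyTorchJob options land in the generated spec. The field names (`elasticPolicy`, `nProcPerNode`) and the string conversion come from the diff above; the policy payload below is illustrative, not the full `V1PytorchElasticPolicy` schema, and `apply_pytorch_options` is a made-up helper:

```python
# Hedged sketch: mapping the new options onto a PyTorchJob spec fragment.
from typing import Dict, Optional

def apply_pytorch_options(
    template_spec: Dict,
    elastic_policy: Optional[Dict] = None,
    n_proc_per_node: Optional[int] = None,
) -> Dict:
    if elastic_policy:
        template_spec["elasticPolicy"] = elastic_policy
    if n_proc_per_node is not None:
        # stringified, mirroring the change above
        template_spec["nProcPerNode"] = str(n_proc_per_node)
    return template_spec

spec = apply_pytorch_options(
    {"replicaSpecs": {}},
    elastic_policy={"minReplicas": 1, "maxReplicas": 4},  # illustrative fields
    n_proc_per_node=2,
)
print(spec)
# {'replicaSpecs': {}, 'elasticPolicy': {'minReplicas': 1, 'maxReplicas': 4}, 'nProcPerNode': '2'}
```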
@@ -28,6 +28,7 @@ def get_tf_job_custom_resource(
     clean_pod_policy: Optional[str],
     scheduling_policy: Optional[V1SchedulingPolicy],
     enable_dynamic_worker: bool,
+    success_policy: Optional[str],
     labels: Dict[str, str],
     annotations: Dict[str, str],
 ) -> Dict:

@@ -73,6 +74,9 @@ def get_tf_job_custom_resource(
     if enable_dynamic_worker:
         template_spec["enableDynamicWorker"] = enable_dynamic_worker
 
+    if success_policy:
+        template_spec["successPolicy"] = success_policy
+
     template_spec = {"replicaSpecs": template_spec}
 
     template_spec = set_clean_pod_policy(
@@ -26,6 +26,9 @@ def _get_ray_replicas_template(
     if not replica:
         return
 
+    labels = {**labels, **replica.labels}
+    annotations = {**annotations, **replica.annotations}
+
     metadata, pod_spec = get_pod_spec(
         namespace=namespace,
         main_container=replica.main_container,
@@ -46,7 +46,7 @@ def set_notify(custom_object: Dict, notifications: List[V1Notification]) -> Dict
 def set_clean_pod_policy(template_spec: Dict, clean_pod_policy: str) -> Dict:
     if not clean_pod_policy:
         # Sets default clean pod policy
-        clean_pod_policy = "
+        clean_pod_policy = "None"
 
     template_spec["cleanPodPolicy"] = clean_pod_policy.capitalize()
     return template_spec
polyaxon/_k8s/executor/base.py CHANGED

@@ -5,7 +5,7 @@ from kubernetes.client import Configuration
 
 from polyaxon import settings
 from polyaxon._k8s.converter.converters import CONVERTERS
-from polyaxon._k8s.converter.mixins import MIXIN_MAPPING
+from polyaxon._k8s.converter.mixins import MIXIN_MAPPING, BaseMixin
 from polyaxon._runner.executor import BaseExecutor as _BaseExecutor
 from polyaxon._runner.kinds import RunnerKind
 from polyaxon._utils.fqn_utils import get_resource_name

@@ -56,7 +56,9 @@ class BaseExecutor(_BaseExecutor):
         api = k8s_client.ApiClient()
         return api.sanitize_for_serialization(resource)
 
-    def create(self, run_uuid: str, run_kind: str, resource: Dict) -> Dict:
+    def create(
+        self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
+    ) -> Dict:
         mixin = self._get_mixin_for_kind(kind=run_kind)
         resource_name = get_resource_name(run_uuid)
         return self.manager.create_custom_object(

@@ -65,9 +67,12 @@ class BaseExecutor(_BaseExecutor):
             version=mixin.API_VERSION,
             plural=mixin.PLURAL,
             body=resource,
+            namespace=namespace,
         )
 
-    def apply(self, run_uuid: str, run_kind: str, resource: Dict) -> Dict:
+    def apply(
+        self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
+    ) -> Dict:
         mixin = self._get_mixin_for_kind(kind=run_kind)
         resource_name = get_resource_name(run_uuid)
         return self.manager.update_custom_object(

@@ -76,9 +81,10 @@ class BaseExecutor(_BaseExecutor):
             version=mixin.API_VERSION,
             plural=mixin.PLURAL,
             body=resource,
+            namespace=namespace,
         )
 
-    def stop(self, run_uuid: str, run_kind: str):
+    def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
         mixin = self._get_mixin_for_kind(kind=run_kind)
         resource_name = get_resource_name(run_uuid)
         return self.manager.delete_custom_object(

@@ -86,16 +92,18 @@ class BaseExecutor(_BaseExecutor):
             group=mixin.GROUP,
             version=mixin.API_VERSION,
             plural=mixin.PLURAL,
+            namespace=namespace,
         )
 
-    def clean(self, run_uuid: str, run_kind: str):
+    def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
         return self.apply(
             run_uuid=run_uuid,
             run_kind=run_kind,
             resource={"metadata": {"finalizers": None}},
+            namespace=namespace,
         )
 
-    def get(self, run_uuid: str, run_kind: str):
+    def get(self, run_uuid: str, run_kind: str, namespace: str = None):
         mixin = self._get_mixin_for_kind(kind=run_kind)
         resource_name = get_resource_name(run_uuid)
         return self.manager.get_custom_object(

@@ -103,4 +111,13 @@ class BaseExecutor(_BaseExecutor):
             group=mixin.GROUP,
             version=mixin.API_VERSION,
             plural=mixin.PLURAL,
+            namespace=namespace,
+        )
+
+    def list_ops(self, namespace: str = None):
+        return self.manager.list_custom_objects(
+            group=BaseMixin.GROUP,
+            version=BaseMixin.API_VERSION,
+            plural=BaseMixin.PLURAL,
+            namespace=namespace,
         )
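The executor methods above now thread an optional `namespace` down to the custom-object calls, and the earlier `JobConverter` change uses an `or`-fallback (`compiled_operation.namespace or self.namespace`). A minimal standalone sketch of that fallback pattern, with a hypothetical `FakeManager` standing in for the real Kubernetes manager:

```python
# Standalone sketch of the optional-namespace pattern; FakeManager is a
# hypothetical stand-in, not the actual polyaxon K8s manager class.
from typing import Dict, Optional

class FakeManager:
    def __init__(self, default_namespace: str = "polyaxon"):
        self.default_namespace = default_namespace

    def delete_custom_object(self, name: str, namespace: Optional[str] = None) -> Dict:
        # fall back to the configured namespace when none is passed
        ns = namespace or self.default_namespace
        return {"deleted": name, "namespace": ns}

manager = FakeManager()
print(manager.delete_custom_object("plx-operation-123"))                      # default namespace
print(manager.delete_custom_object("plx-operation-123", namespace="team-a"))  # explicit override
```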
@@ -6,6 +6,7 @@ from clipped.utils.tz import now
 from kubernetes_asyncio.client.models import V1Pod
 from kubernetes_asyncio.client.rest import ApiException
 
+from polyaxon._flow import V1RunKind
 from polyaxon._k8s.manager.async_manager import AsyncK8sManager
 from traceml.logging import V1Log, V1Logs
 

@@ -65,7 +66,7 @@ async def query_k8s_operation_logs(
     new_time = now()
     params = {}
     if last_time:
-        since_seconds = (new_time - last_time).total_seconds()
+        since_seconds = (new_time - last_time).total_seconds()
         params["since_seconds"] = int(since_seconds)
     if stream:
         params["tail_lines"] = V1Logs._CHUNK_SIZE

@@ -82,9 +83,28 @@
         **params,
     )
 
+    if logs and last_time:
+        # make sure to filter logs larger than last_time
+        logs = [log for log in logs if log.timestamp > last_time]
+    if logs and logs[-1].timestamp:
+        new_time = logs[-1].timestamp
     return logs, new_time
 
 
+async def collect_agent_service_logs(
+    k8s_manager: AsyncK8sManager, pod: V1Pod
+) -> List[V1Log]:
+    if not pod or not pod.spec.containers:
+        return []
+    container = pod.spec.containers[0]
+    return await handle_container_logs(
+        k8s_manager=k8s_manager,
+        pod=pod,
+        container_name=container.name,
+        tail_lines=V1Logs._CHUNK_SIZE,
+    )
+
+
 async def query_k8s_pod_logs(
     k8s_manager: AsyncK8sManager,
     pod: V1Pod,

@@ -94,13 +114,138 @@
     new_time = now()
     params = {}
     if last_time:
-        since_seconds = (new_time - last_time).total_seconds()
+        since_seconds = (new_time - last_time).total_seconds()
         params["since_seconds"] = int(since_seconds)
     if stream:
         params["tail_lines"] = V1Logs._CHUNK_SIZE
 
     logs = await handle_pod_logs(k8s_manager=k8s_manager, pod=pod, **params)
 
-    if logs:
-
-
+    if logs and last_time:
+        # make sure to filter logs larger than last_time
+        logs = [log for log in logs if log.timestamp > last_time]
+    if logs and logs[-1].timestamp:
+        new_time = logs[-1].timestamp
+    return logs, new_time
+
+
+async def get_op_pods_and_services(
+    k8s_manager: AsyncK8sManager,
+    run_uuid: str,
+    run_kind: str,
+):
+    pods = await k8s_manager.list_pods(
+        label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
+    )
+    services = []
+    if V1RunKind.has_service(run_kind):
+        services = await k8s_manager.list_services(
+            label_selector=k8s_manager.get_managed_by_polyaxon(run_uuid)
+        )
+
+    return pods, services
+
+
+async def get_resource_events(
+    k8s_manager: AsyncK8sManager, resource_type: str, resource_name: str
+):
+    field_selector = (
+        f"involvedObject.kind={resource_type},involvedObject.name={resource_name}"
+    )
+    try:
+        events = await k8s_manager.list_namespaced_events(field_selector=field_selector)
+
+        all_events = []
+        for event in events:
+            event_data = {
+                "reason": event.reason,
+                "message": event.message,
+                "first_timestamp": event.first_timestamp,
+                "last_timestamp": event.last_timestamp,
+                "count": event.count,
+                "type": event.type,
+            }
+            all_events.append(event_data)
+
+        return all_events
+
+    except ApiException as e:
+        print(f"Exception when calling CoreV1Api->list_namespaced_event: {e}")
+        return []
+
+
+async def get_op_spec(
+    k8s_manager: AsyncK8sManager,
+    run_uuid: str,
+    run_kind: str,
+):
+    pods, services = await get_op_pods_and_services(
+        k8s_manager=k8s_manager,
+        run_uuid=run_uuid,
+        run_kind=run_kind,
+    )
+    pods_list = {}
+    for pod in pods or []:
+        pods_list[
+            pod.metadata.name
+        ] = k8s_manager.api_client.sanitize_for_serialization(pod)
+        pods_list[pod.metadata.name]["events"] = await get_resource_events(
+            k8s_manager=k8s_manager,
+            resource_type="Pod",
+            resource_name=pod.metadata.name,
+        )
+    services_list = {}
+    for service in services or []:
+        services_list[
+            service.metadata.name
+        ] = k8s_manager.api_client.sanitize_for_serialization(service)
+        services_list[service.metadata.name]["events"] = await get_resource_events(
+            k8s_manager=k8s_manager,
+            resource_type="Service",
+            resource_name=service.metadata.name,
+        )
+    data = {"pods": pods_list, "services": services_list}
+    return data, pods, services
+
+
+async def get_agent_pods_and_services(
+    k8s_manager: AsyncK8sManager,
+):
+    pods = await k8s_manager.list_pods(
+        label_selector=k8s_manager.get_core_polyaxon(),
+    )
+    services = await k8s_manager.list_services(
+        label_selector=k8s_manager.get_core_polyaxon(),
+    )
+    return pods, services
+
+
+async def get_agent_spec(
+    k8s_manager: AsyncK8sManager,
+):
+    pods, services = await get_agent_pods_and_services(
+        k8s_manager=k8s_manager,
+    )
+    pods_list = {}
+    for pod in pods or []:
+        pods_list[
+            pod.metadata.name
+        ] = k8s_manager.api_client.sanitize_for_serialization(pod)
+        pods_list[pod.metadata.name]["events"] = await get_resource_events(
+            k8s_manager=k8s_manager,
+            resource_type="Pod",
+            resource_name=pod.metadata.name,
+        )
+    data = {"pods": pods_list}
+    services_list = {}
+    for service in services or []:
+        services_list[
+            service.metadata.name
+        ] = k8s_manager.api_client.sanitize_for_serialization(service)
+        services_list[service.metadata.name]["events"] = await get_resource_events(
+            k8s_manager=k8s_manager,
+            resource_type="Service",
+            resource_name=service.metadata.name,
+        )
+    data["services"] = services_list
+    return data, pods, services