polyaxon 2.0.0rc49__py3-none-any.whl → 2.4.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polyaxon/_auxiliaries/cleaner.py +8 -3
- polyaxon/_auxiliaries/init.py +7 -2
- polyaxon/_auxiliaries/notifier.py +8 -2
- polyaxon/_auxiliaries/sidecar.py +30 -2
- polyaxon/_cli/artifacts.py +96 -11
- polyaxon/_cli/components.py +96 -11
- polyaxon/_cli/config.py +118 -22
- polyaxon/_cli/dashboard.py +15 -2
- polyaxon/_cli/init.py +1 -1
- polyaxon/_cli/models.py +96 -11
- polyaxon/_cli/operations.py +267 -90
- polyaxon/_cli/project_versions.py +139 -6
- polyaxon/_cli/projects.py +23 -9
- polyaxon/_cli/run.py +37 -9
- polyaxon/_cli/services/agent.py +2 -2
- polyaxon/_cli/services/clean_artifacts.py +1 -1
- polyaxon/_cli/services/sidecar.py +8 -1
- polyaxon/_client/client.py +17 -0
- polyaxon/_client/mixin.py +39 -0
- polyaxon/_client/project.py +218 -23
- polyaxon/_client/run.py +131 -33
- polyaxon/_compiler/contexts/contexts.py +2 -2
- polyaxon/_compiler/contexts/ray_job.py +4 -2
- polyaxon/_compiler/resolver/agent.py +12 -2
- polyaxon/_compiler/resolver/runtime.py +2 -2
- polyaxon/_contexts/paths.py +4 -7
- polyaxon/_deploy/operators/compose.py +1 -27
- polyaxon/_deploy/schemas/deployment.py +4 -1
- polyaxon/_deploy/schemas/intervals.py +0 -7
- polyaxon/_deploy/schemas/proxy.py +1 -0
- polyaxon/_deploy/schemas/service.py +11 -1
- polyaxon/_docker/converter/base/base.py +8 -0
- polyaxon/_docker/executor.py +10 -4
- polyaxon/_env_vars/getters/owner_entity.py +4 -2
- polyaxon/_env_vars/getters/project.py +4 -2
- polyaxon/_env_vars/getters/run.py +5 -2
- polyaxon/_env_vars/keys.py +7 -1
- polyaxon/_flow/__init__.py +2 -0
- polyaxon/_flow/builds/__init__.py +19 -6
- polyaxon/_flow/component/base.py +1 -0
- polyaxon/_flow/component/component.py +14 -0
- polyaxon/_flow/environment/__init__.py +8 -8
- polyaxon/_flow/hooks/__init__.py +19 -6
- polyaxon/_flow/init/__init__.py +6 -6
- polyaxon/_flow/matrix/iterative.py +0 -1
- polyaxon/_flow/matrix/tuner.py +18 -6
- polyaxon/_flow/operations/operation.py +44 -17
- polyaxon/_flow/plugins/__init__.py +6 -0
- polyaxon/_flow/run/__init__.py +2 -2
- polyaxon/_flow/run/dag.py +2 -2
- polyaxon/_flow/run/dask/dask.py +0 -1
- polyaxon/_flow/run/dask/replica.py +3 -3
- polyaxon/_flow/run/enums.py +5 -0
- polyaxon/_flow/run/job.py +4 -4
- polyaxon/_flow/run/kubeflow/mpi_job.py +1 -2
- polyaxon/_flow/run/kubeflow/mx_job.py +1 -2
- polyaxon/_flow/run/kubeflow/paddle_job.py +35 -4
- polyaxon/_flow/run/kubeflow/pytorch_job.py +51 -5
- polyaxon/_flow/run/kubeflow/replica.py +4 -4
- polyaxon/_flow/run/kubeflow/scheduling_policy.py +12 -0
- polyaxon/_flow/run/kubeflow/tf_job.py +3 -3
- polyaxon/_flow/run/kubeflow/xgboost_job.py +1 -2
- polyaxon/_flow/run/ray/ray.py +2 -3
- polyaxon/_flow/run/ray/replica.py +3 -3
- polyaxon/_flow/run/service.py +4 -4
- polyaxon/_fs/fs.py +7 -2
- polyaxon/_fs/utils.py +3 -2
- polyaxon/_k8s/converter/base/base.py +2 -1
- polyaxon/_k8s/converter/base/main.py +1 -0
- polyaxon/_k8s/converter/base/sidecar.py +16 -1
- polyaxon/_k8s/converter/common/accelerators.py +7 -4
- polyaxon/_k8s/converter/converters/job.py +1 -1
- polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
- polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
- polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
- polyaxon/_k8s/converter/converters/ray_job.py +4 -2
- polyaxon/_k8s/custom_resources/dask_job.py +3 -0
- polyaxon/_k8s/custom_resources/kubeflow/common.py +4 -1
- polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
- polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
- polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
- polyaxon/_k8s/custom_resources/ray_job.py +3 -0
- polyaxon/_k8s/custom_resources/setter.py +1 -1
- polyaxon/_k8s/executor/async_executor.py +2 -0
- polyaxon/_k8s/executor/base.py +23 -6
- polyaxon/_k8s/logging/async_monitor.py +150 -5
- polyaxon/_k8s/manager/async_manager.py +96 -23
- polyaxon/_k8s/manager/base.py +4 -0
- polyaxon/_k8s/manager/manager.py +282 -134
- polyaxon/_local_process/__init__.py +0 -0
- polyaxon/_local_process/agent.py +6 -0
- polyaxon/_local_process/converter/__init__.py +1 -0
- polyaxon/_local_process/converter/base/__init__.py +1 -0
- polyaxon/_local_process/converter/base/base.py +140 -0
- polyaxon/_local_process/converter/base/containers.py +69 -0
- polyaxon/_local_process/converter/base/env_vars.py +253 -0
- polyaxon/_local_process/converter/base/init.py +414 -0
- polyaxon/_local_process/converter/base/main.py +74 -0
- polyaxon/_local_process/converter/base/mounts.py +82 -0
- polyaxon/_local_process/converter/converters/__init__.py +8 -0
- polyaxon/_local_process/converter/converters/job.py +40 -0
- polyaxon/_local_process/converter/converters/service.py +41 -0
- polyaxon/_local_process/converter/mixins.py +38 -0
- polyaxon/_local_process/executor.py +132 -0
- polyaxon/_local_process/process_types.py +39 -0
- polyaxon/_managers/agent.py +2 -0
- polyaxon/_managers/home.py +2 -1
- polyaxon/_operations/tuner.py +1 -0
- polyaxon/_polyaxonfile/check.py +2 -0
- polyaxon/_polyaxonfile/manager/operations.py +3 -0
- polyaxon/_polyaxonfile/manager/workflows.py +2 -0
- polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
- polyaxon/_polyaxonfile/specs/operation.py +1 -0
- polyaxon/_polyaxonfile/specs/sections.py +3 -0
- polyaxon/_pql/manager.py +1 -1
- polyaxon/_runner/agent/async_agent.py +97 -21
- polyaxon/_runner/agent/base_agent.py +27 -9
- polyaxon/_runner/agent/client.py +15 -1
- polyaxon/_runner/agent/sync_agent.py +85 -20
- polyaxon/_runner/converter/converter.py +6 -2
- polyaxon/_runner/executor.py +13 -7
- polyaxon/_schemas/agent.py +27 -1
- polyaxon/_schemas/client.py +30 -3
- polyaxon/_schemas/installation.py +4 -3
- polyaxon/_schemas/lifecycle.py +10 -5
- polyaxon/_schemas/log_handler.py +2 -3
- polyaxon/_schemas/types/artifacts.py +3 -3
- polyaxon/_schemas/types/dockerfile.py +3 -3
- polyaxon/_schemas/types/file.py +3 -3
- polyaxon/_schemas/types/git.py +3 -3
- polyaxon/_schemas/types/tensorboard.py +3 -3
- polyaxon/_sdk/api/agents_v1_api.py +1076 -73
- polyaxon/_sdk/api/organizations_v1_api.py +371 -10
- polyaxon/_sdk/api/project_dashboards_v1_api.py +12 -12
- polyaxon/_sdk/api/project_searches_v1_api.py +12 -12
- polyaxon/_sdk/api/projects_v1_api.py +221 -44
- polyaxon/_sdk/api/runs_v1_api.py +917 -445
- polyaxon/_sdk/api/service_accounts_v1_api.py +16 -16
- polyaxon/_sdk/api/teams_v1_api.py +2827 -375
- polyaxon/_sdk/api/users_v1_api.py +231 -55
- polyaxon/_sdk/async_client/api_client.py +4 -0
- polyaxon/_sdk/schemas/__init__.py +10 -2
- polyaxon/_sdk/schemas/v1_agent.py +2 -1
- polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
- polyaxon/_sdk/schemas/v1_artifact_tree.py +1 -1
- polyaxon/_sdk/schemas/v1_dashboard_spec.py +4 -0
- polyaxon/_sdk/schemas/v1_events_response.py +4 -0
- polyaxon/_sdk/schemas/v1_organization.py +1 -0
- polyaxon/_sdk/schemas/v1_preset.py +8 -0
- polyaxon/_sdk/schemas/v1_project.py +1 -0
- polyaxon/_sdk/schemas/v1_project_settings.py +4 -2
- polyaxon/_sdk/schemas/v1_run.py +2 -2
- polyaxon/_sdk/schemas/v1_run_edge_lineage.py +14 -0
- polyaxon/_sdk/schemas/v1_run_edges_graph.py +9 -0
- polyaxon/_sdk/schemas/v1_section_spec.py +7 -2
- polyaxon/_sdk/schemas/v1_settings_catalog.py +1 -0
- polyaxon/_sdk/schemas/v1_team.py +3 -0
- polyaxon/_sdk/schemas/v1_user.py +1 -2
- polyaxon/_sdk/schemas/v1_user_access.py +17 -0
- polyaxon/_services/values.py +1 -0
- polyaxon/_sidecar/container/__init__.py +39 -18
- polyaxon/_sidecar/container/monitors/__init__.py +1 -0
- polyaxon/_sidecar/container/monitors/logs.py +10 -13
- polyaxon/_sidecar/container/monitors/spec.py +24 -0
- polyaxon/_sidecar/ignore.py +0 -1
- polyaxon/_utils/fqn_utils.py +25 -2
- polyaxon/client.py +1 -1
- polyaxon/pkg.py +1 -1
- polyaxon/schemas.py +8 -1
- polyaxon/settings.py +6 -0
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/METADATA +43 -43
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/RECORD +176 -155
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/WHEEL +1 -1
- polyaxon/_sdk/schemas/v1_project_user_access.py +0 -10
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/LICENSE +0 -0
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/entry_points.txt +0 -0
- {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,132 @@
|
|
1
|
+
import os
|
2
|
+
import signal
|
3
|
+
import subprocess
|
4
|
+
|
5
|
+
from typing import Dict, List
|
6
|
+
|
7
|
+
from polyaxon._deploy.operators.cmd_operator import CmdOperator
|
8
|
+
from polyaxon._deploy.operators.conda import CondaOperator
|
9
|
+
from polyaxon._local_process import process_types
|
10
|
+
from polyaxon._local_process.converter.converters import CONVERTERS
|
11
|
+
from polyaxon._local_process.converter.mixins import MIXIN_MAPPING
|
12
|
+
from polyaxon._runner.executor import BaseExecutor
|
13
|
+
from polyaxon._runner.kinds import RunnerKind
|
14
|
+
from polyaxon._schemas.lifecycle import V1Statuses
|
15
|
+
from polyaxon.exceptions import PolyaxonAgentError
|
16
|
+
from polyaxon.logger import logger
|
17
|
+
|
18
|
+
|
19
|
+
class Executor(BaseExecutor):
    """Executor that runs the tasks of an operation as local subprocesses.

    Every process started for a run is recorded in ``self._ops`` (keyed by run
    uuid) so that it can later be inspected with ``get`` or terminated with
    ``stop``.
    """

    MIXIN_MAPPING = MIXIN_MAPPING
    CONVERTERS = CONVERTERS
    RUNNER_KIND = RunnerKind.PROCESS

    def __init__(self, conda_env: str = None, venv: str = None):
        super().__init__()
        # run_uuid -> list of processes started for that run, in start order.
        self._ops = {}
        self._conda_env = conda_env
        self._venv = venv

    def _get_manager(self):
        """Return a conda operator when a conda env is configured, else a command operator."""
        if self._conda_env:
            return CondaOperator()
        return CmdOperator()

    def _check_conda(self):
        """Verify that conda is usable and that the configured env exists.

        Raises:
            PolyaxonAgentError: if the manager check fails or the configured
                conda env is not listed by ``conda env list``.
        """
        # NOTE(review): `self.manager` is presumably provided by BaseExecutor
        # and backed by `_get_manager` -- confirm against the base class.
        if not self.manager.check():
            message = "Conda is required to run this command."
            logger.error(message)
            # `logger.error` returns None, so it cannot itself be raised.
            raise PolyaxonAgentError(message)

        envs = self.manager.execute(["env", "list", "--json"], is_json=True)
        env_names = [os.path.basename(env) for env in envs["envs"]]
        if self._conda_env not in env_names:
            message = "Conda env `{}` is not installed.".format(self._conda_env)
            logger.error(message)
            raise PolyaxonAgentError(message)

    def _run_in_conda(self, cmd_bash, cmd_args):
        """Start ``cmd_args`` chained after activating the conda env, without waiting for it."""
        cmd_args = ["source activate {}".format(self._conda_env)] + cmd_args
        subprocess.Popen(cmd_bash + [" && ".join(cmd_args)], close_fds=True)

    def _get_op_proc(self, run_uuid: str) -> List[subprocess.Popen]:
        """Return the processes recorded for ``run_uuid`` (``None`` if the run is unknown)."""
        return self._ops.get(run_uuid)

    def create(
        self,
        run_uuid: str,
        run_kind: str,
        resource: List[process_types.V1Container],
        namespace: str = None,
    ) -> Dict:
        """Run every task of ``resource`` sequentially and report the outcome.

        Each task is started through the manager and waited for before the
        next one starts. Execution stops at the first task that did not
        succeed.

        Returns:
            A dict with ``status`` and ``tasks`` (the started processes), plus
            ``message`` when a task did not succeed.
        """
        logger.info(f"[Executor] Starting operation {run_uuid} {run_kind}.")
        self._ops[run_uuid] = []
        for task in resource:
            logger.info(
                f"[Executor] Starting task container {task.name} {task.image} ."
            )
            proc = self.manager.execute(
                task.get_cmd_args(), env=os.environ, output_only=False
            )
            self._ops[run_uuid].append(proc)
            proc.wait()
            task_status = self._get_task_status(proc)
            message = f"Task container {task.name} {task.image} with id {proc.pid} {task_status}"
            if task_status == V1Statuses.SUCCEEDED:
                logger.info(f"[Executor] {message}")
            else:
                logger.warning(f"[Executor] {message}")
                self._clean_temp_execution_path(run_uuid)
                return {
                    "status": V1Statuses.FAILED,
                    "tasks": self._ops[run_uuid],
                    "message": message,
                }
        self._clean_temp_execution_path(run_uuid)
        return {"status": V1Statuses.SUCCEEDED, "tasks": self._ops[run_uuid]}

    def apply(
        self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
    ) -> Dict:
        """Not supported by this executor.

        Raises:
            PolyaxonAgentError: always.
        """
        raise PolyaxonAgentError(
            "Process executor does not support apply method.\n"
            "Run: <kind: {}, uuid: {}>".format(run_kind, run_uuid)
        )

    def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
        """Terminate every process recorded for ``run_uuid`` and wait for each to exit.

        Unknown runs (nothing recorded) are a no-op.
        """
        # `_get_op_proc` yields a list of processes, or None for unknown runs.
        for proc in self._get_op_proc(run_uuid) or []:
            if proc.poll() is None:
                # Kill the process tree rooted at the child if it's the leader of its own
                # process group, otherwise just kill the child
                try:
                    if proc.pid == os.getpgid(proc.pid):
                        os.killpg(proc.pid, signal.SIGTERM)
                    else:
                        proc.terminate()
                except OSError:
                    # The child process may have exited before we attempted to terminate it,
                    # so we ignore OSErrors raised during child process termination
                    _msg = f"Failed to terminate operation {run_kind} {run_uuid} child process PID {proc.pid}"
                    logger.debug(_msg)
            proc.wait()

    def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
        """Delegate to ``apply`` with a finalizers-clearing resource.

        NOTE(review): ``apply`` always raises ``PolyaxonAgentError`` in this
        executor, so this call raises as well -- confirm that is intended.
        """
        return self.apply(
            run_uuid=run_uuid,
            run_kind=run_kind,
            resource={"metadata": {"finalizers": None}},
        )

    def _get_task_status(self, proc) -> V1Statuses:
        """Map a process exit code to a status: running, succeeded (0) or failed."""
        exit_code = proc.poll()
        if exit_code is None:
            return V1Statuses.RUNNING
        if exit_code == 0:
            return V1Statuses.SUCCEEDED
        return V1Statuses.FAILED

    def get(self, run_uuid: str, run_kind: str, namespace: str = None) -> V1Statuses:
        """Return the status of the most recently started process of the run."""
        procs = self._get_op_proc(run_uuid)
        return self._get_task_status(procs[-1])

    def list_ops(self, namespace: str = None):
        """Return an empty list; this executor does not report operations."""
        return []
|
@@ -0,0 +1,39 @@
|
|
1
|
+
from typing import Dict, List, Optional, Tuple, Union
|
2
|
+
|
3
|
+
from clipped.compact.pydantic import Field
|
4
|
+
|
5
|
+
from polyaxon._schemas.base import BaseSchemaModel
|
6
|
+
|
7
|
+
|
8
|
+
class V1EnvVar(BaseSchemaModel):
    """A single environment variable, as a ``(name, value)`` tuple or a mapping."""

    __root__: Union[Tuple[str, str], Dict[str, str]]

    def to_cmd(self):
        """Return the variable as a one-element list ``["NAME=VALUE"]``.

        For the mapping form only the first entry is rendered, since the
        result describes a single variable.

        Raises:
            ValueError: if the mapping form is empty.
        """
        if isinstance(self.__root__, tuple):
            name, value = self.__root__
        else:
            # Dict views are not subscriptable: pull the first (name, value) pair.
            pair = next(iter(self.__root__.items()), None)
            if pair is None:
                raise ValueError("Environment variable mapping is empty.")
            name, value = pair
        return [f"{name}={value}"]
|
17
|
+
|
18
|
+
|
19
|
+
class V1Container(BaseSchemaModel):
    """Description of a task: name, command, args, env vars and working dir."""

    name: Optional[str]
    command: Optional[List[str]]
    args: Optional[List[str]]
    env: Optional[List[V1EnvVar]]
    working_dir: Optional[str] = Field(alias="workingDir")

    def get_cmd_args(self):
        """Build the argument list used to start this task.

        Returns:
            A list starting with ``run --rm``, followed by one ``-e NAME=VALUE``
            pair per env var, an optional ``-w`` working dir, an optional
            ``--entrypoint`` (first command item), the image, the remaining
            command items and finally the args.
        """
        cmd_args = ["run", "--rm"]
        # `env` is optional; treat an unset value as "no variables".
        for env in self.env or []:
            cmd_args += ["-e"] + env.to_cmd()
        if self.working_dir:
            cmd_args += ["-w", self.working_dir]
        if self.command:
            cmd_args += ["--entrypoint", self.command[0]]
        # NOTE(review): `image` is not declared among the fields above;
        # confirm where this attribute is expected to come from.
        cmd_args += [self.image]
        if self.command:
            cmd_args += self.command[1:]
        if self.args:
            cmd_args += self.args
        return cmd_args
|
polyaxon/_managers/agent.py
CHANGED
@@ -13,7 +13,9 @@ class AgentConfigManager(ConfigManager):
|
|
13
13
|
|
14
14
|
VISIBILITY = ConfigManager.Visibility.GLOBAL
|
15
15
|
CONFIG_FILE_NAME = ".agent"
|
16
|
+
ALTERNATE_CONFIG_FILE_NAME = ".sandbox"
|
16
17
|
CONFIG: Type[AgentConfig] = AgentConfig
|
18
|
+
PERSIST_FORMAT = "yaml"
|
17
19
|
|
18
20
|
@classmethod
|
19
21
|
def get_config_or_default(cls) -> AgentConfig:
|
polyaxon/_managers/home.py
CHANGED
@@ -13,6 +13,7 @@ class HomeConfigManager(ConfigManager):
|
|
13
13
|
VISIBILITY = ConfigManager.Visibility.GLOBAL
|
14
14
|
CONFIG_FILE_NAME = ".home"
|
15
15
|
CONFIG: Type[HomeConfig] = HomeConfig
|
16
|
+
PERSIST_FORMAT = "yaml"
|
16
17
|
|
17
18
|
@classmethod
|
18
19
|
def get_config_defaults(cls) -> Dict[str, str]:
|
@@ -23,7 +24,7 @@ class HomeConfigManager(ConfigManager):
|
|
23
24
|
glob_path = cls.get_global_config_path()
|
24
25
|
home_config = cls._CONFIG_READER.read_configs(
|
25
26
|
[
|
26
|
-
ConfigSpec(glob_path, config_type=".
|
27
|
+
ConfigSpec(glob_path, config_type=".yaml", check_if_exists=False),
|
27
28
|
os.environ,
|
28
29
|
{"dummy": "dummy"},
|
29
30
|
]
|
polyaxon/_operations/tuner.py
CHANGED
polyaxon/_polyaxonfile/check.py
CHANGED
@@ -80,6 +80,7 @@ def check_polyaxonfile(
|
|
80
80
|
matrix: Optional[Union[Dict, V1Matrix]] = None,
|
81
81
|
presets: Optional[List[str]] = None,
|
82
82
|
queue: Optional[str] = None,
|
83
|
+
namespace: Optional[str] = None,
|
83
84
|
nocache: Optional[bool] = None,
|
84
85
|
cache: Optional[Union[int, str, bool]] = None,
|
85
86
|
verbose: bool = True,
|
@@ -177,6 +178,7 @@ def check_polyaxonfile(
|
|
177
178
|
matrix=matrix,
|
178
179
|
presets=presets,
|
179
180
|
queue=queue,
|
181
|
+
namespace=namespace,
|
180
182
|
nocache=nocache,
|
181
183
|
cache=cache,
|
182
184
|
approved=approved,
|
@@ -29,6 +29,7 @@ def get_op_specification(
|
|
29
29
|
matrix: Optional[Union[Dict, V1Matrix]] = None,
|
30
30
|
presets: Optional[List[str]] = None,
|
31
31
|
queue: Optional[str] = None,
|
32
|
+
namespace: Optional[str] = None,
|
32
33
|
nocache: Optional[bool] = None,
|
33
34
|
cache: Optional[Union[int, str, bool]] = None,
|
34
35
|
approved: Optional[Union[int, str, bool]] = None,
|
@@ -70,6 +71,8 @@ def get_op_specification(
|
|
70
71
|
# Check only
|
71
72
|
get_queue_info(queue)
|
72
73
|
op_data["queue"] = queue
|
74
|
+
if namespace:
|
75
|
+
op_data["namespace"] = namespace
|
73
76
|
if cache is not None:
|
74
77
|
op_data["cache"] = {"disable": not to_bool(cache)}
|
75
78
|
if nocache:
|
@@ -20,6 +20,7 @@ def get_op_from_schedule(
|
|
20
20
|
op_spec.skip_on_upstream_skip = None
|
21
21
|
op_spec.cache = compiled_operation.cache
|
22
22
|
op_spec.queue = compiled_operation.queue
|
23
|
+
op_spec.namespace = compiled_operation.namespace
|
23
24
|
op_spec.component.inputs = compiled_operation.inputs
|
24
25
|
op_spec.component.outputs = compiled_operation.outputs
|
25
26
|
op_spec.component.run = compiled_operation.run
|
@@ -59,6 +60,7 @@ def get_ops_from_suggestions(
|
|
59
60
|
op_spec.skip_on_upstream_skip = None
|
60
61
|
op_spec.cache = compiled_operation.cache
|
61
62
|
op_spec.queue = compiled_operation.queue
|
63
|
+
op_spec.namespace = compiled_operation.namespace
|
62
64
|
op_spec.params = params
|
63
65
|
op_spec.component.inputs = compiled_operation.inputs
|
64
66
|
op_spec.component.outputs = compiled_operation.outputs
|
@@ -13,6 +13,7 @@ class Sections:
|
|
13
13
|
QUEUE = "queue"
|
14
14
|
CACHE = "cache"
|
15
15
|
PLUGINS = "plugins"
|
16
|
+
NAMESPACE = "namespace"
|
16
17
|
BUILD = "build"
|
17
18
|
HOOKS = "hooks"
|
18
19
|
EVENTS = "events"
|
@@ -55,6 +56,7 @@ class Sections:
|
|
55
56
|
CACHE,
|
56
57
|
QUEUE,
|
57
58
|
PLUGINS,
|
59
|
+
NAMESPACE,
|
58
60
|
BUILD,
|
59
61
|
HOOKS,
|
60
62
|
EVENTS,
|
@@ -89,6 +91,7 @@ class Sections:
|
|
89
91
|
CACHE,
|
90
92
|
CONNECTIONS,
|
91
93
|
PLUGINS,
|
94
|
+
NAMESPACE,
|
92
95
|
TERMINATION,
|
93
96
|
SCHEDULE,
|
94
97
|
DEPENDENCIES,
|
polyaxon/_pql/manager.py
CHANGED
@@ -13,7 +13,7 @@ class PQLManager:
|
|
13
13
|
FIELDS_PROXY = {}
|
14
14
|
FIELDS_TRANS = {}
|
15
15
|
FIELDS_ORDERING = None
|
16
|
-
FIELDS_ORDERING_PROXY = None
|
16
|
+
FIELDS_ORDERING_PROXY = None # Do not set a field on both field and proxy
|
17
17
|
FIELDS_DEFAULT_ORDERING = None
|
18
18
|
FIELDS_DISTINCT = None
|
19
19
|
CHECK_ALIVE = True
|
@@ -14,6 +14,7 @@ from polyaxon._env_vars.getters import get_run_info
|
|
14
14
|
from polyaxon._runner.agent.base_agent import BaseAgent
|
15
15
|
from polyaxon._sdk.schemas.v1_agent import V1Agent
|
16
16
|
from polyaxon._sdk.schemas.v1_agent_state_response import V1AgentStateResponse
|
17
|
+
from polyaxon._utils.fqn_utils import get_run_instance
|
17
18
|
from polyaxon.exceptions import ApiException as SDKApiException
|
18
19
|
from polyaxon.exceptions import PolyaxonAgentError, PolyaxonConverterError
|
19
20
|
from polyaxon.logger import logger
|
@@ -25,14 +26,14 @@ class BaseAsyncAgent(BaseAgent):
|
|
25
26
|
async def _enter(self):
|
26
27
|
if not self.client._is_managed:
|
27
28
|
return self
|
28
|
-
|
29
|
+
logger.warning("Agent is starting.")
|
29
30
|
await self.executor.refresh()
|
30
31
|
try:
|
31
32
|
agent = await self.client.get_info()
|
32
33
|
self._check_status(agent)
|
33
34
|
await self.sync()
|
34
35
|
await self.client.log_agent_running()
|
35
|
-
|
36
|
+
logger.warning("Agent is running.")
|
36
37
|
return self
|
37
38
|
except (ApiException, SDKApiException, HTTPError) as e:
|
38
39
|
message = "Could not start the agent."
|
@@ -78,6 +79,49 @@ class BaseAsyncAgent(BaseAgent):
|
|
78
79
|
),
|
79
80
|
)
|
80
81
|
|
82
|
+
async def reconcile(self):
    """Collect agent data and report the operations currently known to the executor.

    The work is throttled: nothing happens until at least
    ``SLEEP_AGENT_DATA_COLLECT_TIME`` seconds have passed since
    ``_last_reconciled_at``.

    Returns:
        The result of the client's ``reconcile_agent`` call, or ``None`` when
        throttled or when no operation was listed.
    """
    elapsed = (now() - self._last_reconciled_at).total_seconds()
    # Skip while the interval has NOT elapsed yet (the comparison used to be
    # inverted, which made the throttle ineffective).
    if elapsed < self.SLEEP_AGENT_DATA_COLLECT_TIME:
        return

    # Collect data
    await self.collect_agent_data()

    # Update reconcile
    namespaces = [settings.AGENT_CONFIG.namespace]
    namespaces += settings.AGENT_CONFIG.additional_namespaces or []
    ops = []
    for namespace in namespaces:
        _ops = await self.executor.list_ops(namespace=namespace)
        for op in _ops or []:
            metadata = op["metadata"]
            annotations = metadata["annotations"]
            # Each entry is (run instance, kind, name, namespace).
            ops.append(
                (
                    get_run_instance(
                        owner=annotations["operation.polyaxon.com/owner"],
                        project=annotations["operation.polyaxon.com/project"],
                        run_uuid=metadata["labels"]["app.kubernetes.io/instance"],
                    ),
                    annotations["operation.polyaxon.com/kind"],
                    annotations["operation.polyaxon.com/name"],
                    namespace,
                )
            )
    if not ops:
        return None

    logger.info("Reconcile agent.")
    return await self.client.reconcile_agent(
        reconcile={"ops": ops},
    )
|
124
|
+
|
81
125
|
async def start(self):
|
82
126
|
try:
|
83
127
|
async with async_exit_context() as exit_event:
|
@@ -91,7 +135,9 @@ class BaseAsyncAgent(BaseAgent):
|
|
91
135
|
except asyncio.TimeoutError:
|
92
136
|
index += 1
|
93
137
|
await self.refresh_executor()
|
94
|
-
if
|
138
|
+
if self._default_auth:
|
139
|
+
await self.reconcile()
|
140
|
+
else:
|
95
141
|
await self.cron()
|
96
142
|
agent_state = await self.process()
|
97
143
|
if not agent_state:
|
@@ -107,7 +153,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
107
153
|
timeout = get_wait(index, max_interval=self.max_interval)
|
108
154
|
logger.info("Sleeping for {} seconds".format(timeout))
|
109
155
|
except Exception as e:
|
110
|
-
|
156
|
+
logger.warning("Agent failed to start: {}".format(repr(e)))
|
111
157
|
finally:
|
112
158
|
self.end()
|
113
159
|
|
@@ -118,7 +164,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
118
164
|
self.sync_compatible_updates(agent_state.compatible_updates)
|
119
165
|
|
120
166
|
if agent_state:
|
121
|
-
logger.info("
|
167
|
+
logger.info("Checking agent state.")
|
122
168
|
else:
|
123
169
|
logger.info("No state was found.")
|
124
170
|
return V1AgentStateResponse.construct()
|
@@ -185,7 +231,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
185
231
|
)
|
186
232
|
return None
|
187
233
|
|
188
|
-
async def submit_run(self, run_data: Tuple[str, str, str, str]):
|
234
|
+
async def submit_run(self, run_data: Tuple[str, str, str, str, str]):
|
189
235
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
190
236
|
resource = await self.prepare_run_resource(
|
191
237
|
owner_name=run_owner,
|
@@ -197,9 +243,13 @@ class BaseAsyncAgent(BaseAgent):
|
|
197
243
|
if not resource:
|
198
244
|
return
|
199
245
|
|
246
|
+
namespace = None if len(run_data) < 5 else run_data[4]
|
200
247
|
try:
|
201
248
|
await self.executor.create(
|
202
|
-
run_uuid=run_uuid,
|
249
|
+
run_uuid=run_uuid,
|
250
|
+
run_kind=run_data[1],
|
251
|
+
resource=resource,
|
252
|
+
namespace=namespace,
|
203
253
|
)
|
204
254
|
except ApiException as e:
|
205
255
|
if e.status == 409:
|
@@ -222,7 +272,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
222
272
|
)
|
223
273
|
|
224
274
|
async def make_and_create_run(
|
225
|
-
self, run_data: Tuple[str, str, str, str], default_auth: bool = False
|
275
|
+
self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
|
226
276
|
):
|
227
277
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
228
278
|
resource = await self.make_run_resource(
|
@@ -236,9 +286,14 @@ class BaseAsyncAgent(BaseAgent):
|
|
236
286
|
if not resource:
|
237
287
|
return
|
238
288
|
|
289
|
+
namepsace = None if len(run_data) < 5 else run_data[4]
|
290
|
+
|
239
291
|
try:
|
240
292
|
await self.executor.create(
|
241
|
-
run_uuid=run_uuid,
|
293
|
+
run_uuid=run_uuid,
|
294
|
+
run_kind=run_data[1],
|
295
|
+
resource=resource,
|
296
|
+
namespace=namepsace,
|
242
297
|
)
|
243
298
|
except ApiException as e:
|
244
299
|
if e.status == 409:
|
@@ -252,7 +307,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
252
307
|
)
|
253
308
|
)
|
254
309
|
|
255
|
-
async def apply_run(self, run_data: Tuple[str, str, str, str]):
|
310
|
+
async def apply_run(self, run_data: Tuple[str, str, str, str, str]):
|
256
311
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
257
312
|
resource = await self.prepare_run_resource(
|
258
313
|
owner_name=run_owner,
|
@@ -264,9 +319,14 @@ class BaseAsyncAgent(BaseAgent):
|
|
264
319
|
if not resource:
|
265
320
|
return
|
266
321
|
|
322
|
+
namespace = None if len(run_data) < 5 else run_data[4]
|
323
|
+
|
267
324
|
try:
|
268
325
|
await self.executor.apply(
|
269
|
-
run_uuid=run_uuid,
|
326
|
+
run_uuid=run_uuid,
|
327
|
+
run_kind=run_data[1],
|
328
|
+
resource=resource,
|
329
|
+
namespace=namespace,
|
270
330
|
)
|
271
331
|
await self.client.log_run_running(
|
272
332
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
|
@@ -275,12 +335,17 @@ class BaseAsyncAgent(BaseAgent):
|
|
275
335
|
await self.client.log_run_failed(
|
276
336
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
|
277
337
|
)
|
278
|
-
await self.clean_run(
|
338
|
+
await self.clean_run(
|
339
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
340
|
+
)
|
279
341
|
|
280
|
-
async def check_run(self, run_data: Tuple[str, str]):
|
342
|
+
async def check_run(self, run_data: Tuple[str, str, str]):
|
281
343
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
344
|
+
namespace = None if len(run_data) < 3 else run_data[2]
|
282
345
|
try:
|
283
|
-
await self.executor.get(
|
346
|
+
await self.executor.get(
|
347
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
348
|
+
)
|
284
349
|
except ApiException as e:
|
285
350
|
if e.status == 404:
|
286
351
|
logger.info(
|
@@ -290,10 +355,13 @@ class BaseAsyncAgent(BaseAgent):
|
|
290
355
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
|
291
356
|
)
|
292
357
|
|
293
|
-
async def stop_run(self, run_data: Tuple[str, str]):
|
358
|
+
async def stop_run(self, run_data: Tuple[str, str, str]):
|
294
359
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
360
|
+
namespace = None if len(run_data) < 3 else run_data[2]
|
295
361
|
try:
|
296
|
-
await self.executor.stop(
|
362
|
+
await self.executor.stop(
|
363
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
364
|
+
)
|
297
365
|
except ApiException as e:
|
298
366
|
if e.status == 404:
|
299
367
|
logger.info("Run does not exist anymore, it could have been stopped.")
|
@@ -309,16 +377,24 @@ class BaseAsyncAgent(BaseAgent):
|
|
309
377
|
message="Agent failed stopping run.\n",
|
310
378
|
)
|
311
379
|
|
312
|
-
async def delete_run(self, run_data: Tuple[str, str, str, str]):
|
380
|
+
async def delete_run(self, run_data: Tuple[str, str, str, str, str]):
|
313
381
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
314
|
-
|
382
|
+
namespace = None if len(run_data) < 5 else run_data[4]
|
315
383
|
if run_data[3]:
|
316
384
|
await self.make_and_create_run(run_data)
|
385
|
+
else:
|
386
|
+
await self.clean_run(
|
387
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
388
|
+
)
|
317
389
|
|
318
|
-
async def clean_run(self, run_uuid: str, run_kind: str):
|
390
|
+
async def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
|
319
391
|
try:
|
320
|
-
await self.executor.clean(
|
321
|
-
|
392
|
+
await self.executor.clean(
|
393
|
+
run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
|
394
|
+
)
|
395
|
+
await self.executor.stop(
|
396
|
+
run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
|
397
|
+
)
|
322
398
|
except ApiException as e:
|
323
399
|
if e.status == 404:
|
324
400
|
logger.info("Run does not exist.")
|
@@ -24,6 +24,7 @@ class BaseAgent:
|
|
24
24
|
HEALTH_FILE = "/tmp/.healthz"
|
25
25
|
SLEEP_STOP_TIME = 60 * 5
|
26
26
|
SLEEP_ARCHIVED_TIME = 60 * 60
|
27
|
+
SLEEP_AGENT_DATA_COLLECT_TIME = 60 * 30
|
27
28
|
IS_ASYNC = False
|
28
29
|
|
29
30
|
def __init__(
|
@@ -41,6 +42,7 @@ class BaseAgent:
|
|
41
42
|
self._default_auth = bool(agent_uuid)
|
42
43
|
self._executor_refreshed_at = now()
|
43
44
|
self._graceful_shutdown = False
|
45
|
+
self._last_reconciled_at = now()
|
44
46
|
self.client = AgentClient(
|
45
47
|
owner=owner, agent_uuid=agent_uuid, is_async=self.IS_ASYNC
|
46
48
|
)
|
@@ -50,9 +52,25 @@ class BaseAgent:
|
|
50
52
|
def sync(self):
|
51
53
|
raise NotImplementedError
|
52
54
|
|
55
|
+
def reconcile(self):
    """Reconcile the agent state; concrete agents must override this.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError
|
57
|
+
|
53
58
|
def cron(self):
|
54
59
|
return self.client.cron_agent()
|
55
60
|
|
61
|
+
def collect_agent_data(self):
    """Request agent data collection for the client's configured namespace.

    The attempt time is stored in ``_last_reconciled_at`` before the call is
    made. Any failure is logged as a warning and ``None`` is returned.
    """
    logger.info("Collecting agent data.")
    # Stamp first, so a failing attempt still counts as the latest one.
    self._last_reconciled_at = now()
    try:
        collected = self.client.collect_agent_data(
            namespace=settings.CLIENT_CONFIG.namespace
        )
    except Exception as exc:
        logger.warning(
            "Agent failed to collect agent data: {}\n"
            "Retrying ...".format(repr(exc))
        )
        return None
    return collected
|
73
|
+
|
56
74
|
def sync_compatible_updates(self, compatible_updates: Dict):
|
57
75
|
if compatible_updates and settings.AGENT_CONFIG:
|
58
76
|
init = compatible_updates.get("init")
|
@@ -114,14 +132,14 @@ class BaseAgent:
|
|
114
132
|
|
115
133
|
def _check_status(self, agent_state):
|
116
134
|
if agent_state.status == V1Statuses.STOPPED:
|
117
|
-
|
135
|
+
logger.warning(
|
118
136
|
"Agent has been stopped from the platform,"
|
119
137
|
"but the deployment is still running."
|
120
138
|
"Please either set the agent to starting or teardown the agent deployment."
|
121
139
|
)
|
122
140
|
return self.end(sleep=self.SLEEP_STOP_TIME)
|
123
141
|
elif agent_state.live_state < LiveState.LIVE:
|
124
|
-
|
142
|
+
logger.warning(
|
125
143
|
"Agent has been archived from the platform,"
|
126
144
|
"but the deployment is still running."
|
127
145
|
"Please either restore the agent or teardown the agent deployment."
|
@@ -173,25 +191,25 @@ class BaseAgent:
|
|
173
191
|
) -> Optional[Any]:
|
174
192
|
raise NotImplementedError
|
175
193
|
|
176
|
-
def submit_run(self, run_data: Tuple[str, str, str, str]):
|
194
|
+
def submit_run(self, run_data: Tuple[str, str, str, str, str]):
|
177
195
|
raise NotImplementedError
|
178
196
|
|
179
197
|
def make_and_create_run(
|
180
|
-
self, run_data: Tuple[str, str, str, str], default_auth: bool = False
|
198
|
+
self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
|
181
199
|
):
|
182
200
|
raise NotImplementedError
|
183
201
|
|
184
|
-
def apply_run(self, run_data: Tuple[str, str, str, str]):
|
202
|
+
def apply_run(self, run_data: Tuple[str, str, str, str, str]):
|
185
203
|
raise NotImplementedError
|
186
204
|
|
187
|
-
def check_run(self, run_data: Tuple[str, str]):
|
205
|
+
def check_run(self, run_data: Tuple[str, str, str]):
|
188
206
|
raise NotImplementedError
|
189
207
|
|
190
|
-
def stop_run(self, run_data: Tuple[str, str]):
|
208
|
+
def stop_run(self, run_data: Tuple[str, str, str]):
|
191
209
|
raise NotImplementedError
|
192
210
|
|
193
|
-
def delete_run(self, run_data: Tuple[str, str, str, str]):
|
211
|
+
def delete_run(self, run_data: Tuple[str, str, str, str, str]):
|
194
212
|
raise NotImplementedError
|
195
213
|
|
196
|
-
def clean_run(self, run_uuid: str, run_kind: str):
|
214
|
+
def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
|
197
215
|
raise NotImplementedError
|
polyaxon/_runner/agent/client.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import traceback
|
2
2
|
|
3
|
-
from typing import Optional
|
3
|
+
from typing import Dict, Optional
|
4
4
|
|
5
5
|
from polyaxon._schemas.lifecycle import V1StatusCondition, V1Statuses
|
6
6
|
from polyaxon.client import PolyaxonClient, V1Agent, V1AgentStateResponse
|
@@ -65,6 +65,20 @@ class AgentClient:
|
|
65
65
|
def cron_agent(self):
|
66
66
|
return self.client.agents_v1.cron_agent(owner=self.owner, _request_timeout=10)
|
67
67
|
|
68
|
+
def collect_agent_data(self, namespace: str):
    """Call the internal agents API to collect data for this agent in ``namespace``."""
    internal_api = self.client.internal_agents_v1
    return internal_api.collect_agent_data(
        owner=self.owner, uuid=self.agent_uuid, namespace=namespace
    )
|
74
|
+
|
75
|
+
def reconcile_agent(self, reconcile: Dict):
    """Send ``reconcile`` to the agents API, wrapped under the ``reconcile`` body key."""
    agents_api = self.client.agents_v1
    payload = {"reconcile": reconcile}
    return agents_api.reconcile_agent(
        owner=self.owner, uuid=self.agent_uuid, body=payload
    )
|
81
|
+
|
68
82
|
def log_agent_running(self):
|
69
83
|
return self.log_agent_status(status=V1Statuses.RUNNING, reason="AgentLogger")
|
70
84
|
|