polyaxon 2.0.0rc49__py3-none-any.whl → 2.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. polyaxon/_auxiliaries/cleaner.py +8 -3
  2. polyaxon/_auxiliaries/init.py +7 -2
  3. polyaxon/_auxiliaries/notifier.py +8 -2
  4. polyaxon/_auxiliaries/sidecar.py +30 -2
  5. polyaxon/_cli/artifacts.py +96 -11
  6. polyaxon/_cli/components.py +96 -11
  7. polyaxon/_cli/config.py +118 -22
  8. polyaxon/_cli/dashboard.py +15 -2
  9. polyaxon/_cli/init.py +1 -1
  10. polyaxon/_cli/models.py +96 -11
  11. polyaxon/_cli/operations.py +267 -90
  12. polyaxon/_cli/project_versions.py +139 -6
  13. polyaxon/_cli/projects.py +23 -9
  14. polyaxon/_cli/run.py +37 -9
  15. polyaxon/_cli/services/agent.py +2 -2
  16. polyaxon/_cli/services/clean_artifacts.py +1 -1
  17. polyaxon/_cli/services/sidecar.py +8 -1
  18. polyaxon/_client/client.py +17 -0
  19. polyaxon/_client/mixin.py +39 -0
  20. polyaxon/_client/project.py +218 -23
  21. polyaxon/_client/run.py +131 -33
  22. polyaxon/_compiler/contexts/contexts.py +2 -2
  23. polyaxon/_compiler/contexts/ray_job.py +4 -2
  24. polyaxon/_compiler/resolver/agent.py +12 -2
  25. polyaxon/_compiler/resolver/runtime.py +2 -2
  26. polyaxon/_contexts/paths.py +4 -7
  27. polyaxon/_deploy/operators/compose.py +1 -27
  28. polyaxon/_deploy/schemas/deployment.py +4 -1
  29. polyaxon/_deploy/schemas/intervals.py +0 -7
  30. polyaxon/_deploy/schemas/proxy.py +1 -0
  31. polyaxon/_deploy/schemas/service.py +11 -1
  32. polyaxon/_docker/converter/base/base.py +8 -0
  33. polyaxon/_docker/executor.py +10 -4
  34. polyaxon/_env_vars/getters/owner_entity.py +4 -2
  35. polyaxon/_env_vars/getters/project.py +4 -2
  36. polyaxon/_env_vars/getters/run.py +5 -2
  37. polyaxon/_env_vars/keys.py +7 -1
  38. polyaxon/_flow/__init__.py +2 -0
  39. polyaxon/_flow/builds/__init__.py +19 -6
  40. polyaxon/_flow/component/base.py +1 -0
  41. polyaxon/_flow/component/component.py +14 -0
  42. polyaxon/_flow/environment/__init__.py +8 -8
  43. polyaxon/_flow/hooks/__init__.py +19 -6
  44. polyaxon/_flow/init/__init__.py +6 -6
  45. polyaxon/_flow/matrix/iterative.py +0 -1
  46. polyaxon/_flow/matrix/tuner.py +18 -6
  47. polyaxon/_flow/operations/operation.py +44 -17
  48. polyaxon/_flow/plugins/__init__.py +6 -0
  49. polyaxon/_flow/run/__init__.py +2 -2
  50. polyaxon/_flow/run/dag.py +2 -2
  51. polyaxon/_flow/run/dask/dask.py +0 -1
  52. polyaxon/_flow/run/dask/replica.py +3 -3
  53. polyaxon/_flow/run/enums.py +5 -0
  54. polyaxon/_flow/run/job.py +4 -4
  55. polyaxon/_flow/run/kubeflow/mpi_job.py +1 -2
  56. polyaxon/_flow/run/kubeflow/mx_job.py +1 -2
  57. polyaxon/_flow/run/kubeflow/paddle_job.py +35 -4
  58. polyaxon/_flow/run/kubeflow/pytorch_job.py +51 -5
  59. polyaxon/_flow/run/kubeflow/replica.py +4 -4
  60. polyaxon/_flow/run/kubeflow/scheduling_policy.py +12 -0
  61. polyaxon/_flow/run/kubeflow/tf_job.py +3 -3
  62. polyaxon/_flow/run/kubeflow/xgboost_job.py +1 -2
  63. polyaxon/_flow/run/ray/ray.py +2 -3
  64. polyaxon/_flow/run/ray/replica.py +3 -3
  65. polyaxon/_flow/run/service.py +4 -4
  66. polyaxon/_fs/fs.py +7 -2
  67. polyaxon/_fs/utils.py +3 -2
  68. polyaxon/_k8s/converter/base/base.py +2 -1
  69. polyaxon/_k8s/converter/base/main.py +1 -0
  70. polyaxon/_k8s/converter/base/sidecar.py +16 -1
  71. polyaxon/_k8s/converter/common/accelerators.py +7 -4
  72. polyaxon/_k8s/converter/converters/job.py +1 -1
  73. polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
  74. polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
  75. polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
  76. polyaxon/_k8s/converter/converters/ray_job.py +4 -2
  77. polyaxon/_k8s/custom_resources/dask_job.py +3 -0
  78. polyaxon/_k8s/custom_resources/kubeflow/common.py +4 -1
  79. polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
  80. polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
  81. polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
  82. polyaxon/_k8s/custom_resources/ray_job.py +3 -0
  83. polyaxon/_k8s/custom_resources/setter.py +1 -1
  84. polyaxon/_k8s/executor/async_executor.py +2 -0
  85. polyaxon/_k8s/executor/base.py +23 -6
  86. polyaxon/_k8s/logging/async_monitor.py +150 -5
  87. polyaxon/_k8s/manager/async_manager.py +96 -23
  88. polyaxon/_k8s/manager/base.py +4 -0
  89. polyaxon/_k8s/manager/manager.py +282 -134
  90. polyaxon/_local_process/__init__.py +0 -0
  91. polyaxon/_local_process/agent.py +6 -0
  92. polyaxon/_local_process/converter/__init__.py +1 -0
  93. polyaxon/_local_process/converter/base/__init__.py +1 -0
  94. polyaxon/_local_process/converter/base/base.py +140 -0
  95. polyaxon/_local_process/converter/base/containers.py +69 -0
  96. polyaxon/_local_process/converter/base/env_vars.py +253 -0
  97. polyaxon/_local_process/converter/base/init.py +414 -0
  98. polyaxon/_local_process/converter/base/main.py +74 -0
  99. polyaxon/_local_process/converter/base/mounts.py +82 -0
  100. polyaxon/_local_process/converter/converters/__init__.py +8 -0
  101. polyaxon/_local_process/converter/converters/job.py +40 -0
  102. polyaxon/_local_process/converter/converters/service.py +41 -0
  103. polyaxon/_local_process/converter/mixins.py +38 -0
  104. polyaxon/_local_process/executor.py +132 -0
  105. polyaxon/_local_process/process_types.py +39 -0
  106. polyaxon/_managers/agent.py +2 -0
  107. polyaxon/_managers/home.py +2 -1
  108. polyaxon/_operations/tuner.py +1 -0
  109. polyaxon/_polyaxonfile/check.py +2 -0
  110. polyaxon/_polyaxonfile/manager/operations.py +3 -0
  111. polyaxon/_polyaxonfile/manager/workflows.py +2 -0
  112. polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
  113. polyaxon/_polyaxonfile/specs/operation.py +1 -0
  114. polyaxon/_polyaxonfile/specs/sections.py +3 -0
  115. polyaxon/_pql/manager.py +1 -1
  116. polyaxon/_runner/agent/async_agent.py +97 -21
  117. polyaxon/_runner/agent/base_agent.py +27 -9
  118. polyaxon/_runner/agent/client.py +15 -1
  119. polyaxon/_runner/agent/sync_agent.py +85 -20
  120. polyaxon/_runner/converter/converter.py +6 -2
  121. polyaxon/_runner/executor.py +13 -7
  122. polyaxon/_schemas/agent.py +27 -1
  123. polyaxon/_schemas/client.py +30 -3
  124. polyaxon/_schemas/installation.py +4 -3
  125. polyaxon/_schemas/lifecycle.py +10 -5
  126. polyaxon/_schemas/log_handler.py +2 -3
  127. polyaxon/_schemas/types/artifacts.py +3 -3
  128. polyaxon/_schemas/types/dockerfile.py +3 -3
  129. polyaxon/_schemas/types/file.py +3 -3
  130. polyaxon/_schemas/types/git.py +3 -3
  131. polyaxon/_schemas/types/tensorboard.py +3 -3
  132. polyaxon/_sdk/api/agents_v1_api.py +1076 -73
  133. polyaxon/_sdk/api/organizations_v1_api.py +371 -10
  134. polyaxon/_sdk/api/project_dashboards_v1_api.py +12 -12
  135. polyaxon/_sdk/api/project_searches_v1_api.py +12 -12
  136. polyaxon/_sdk/api/projects_v1_api.py +221 -44
  137. polyaxon/_sdk/api/runs_v1_api.py +917 -445
  138. polyaxon/_sdk/api/service_accounts_v1_api.py +16 -16
  139. polyaxon/_sdk/api/teams_v1_api.py +2827 -375
  140. polyaxon/_sdk/api/users_v1_api.py +231 -55
  141. polyaxon/_sdk/async_client/api_client.py +4 -0
  142. polyaxon/_sdk/schemas/__init__.py +10 -2
  143. polyaxon/_sdk/schemas/v1_agent.py +2 -1
  144. polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
  145. polyaxon/_sdk/schemas/v1_artifact_tree.py +1 -1
  146. polyaxon/_sdk/schemas/v1_dashboard_spec.py +4 -0
  147. polyaxon/_sdk/schemas/v1_events_response.py +4 -0
  148. polyaxon/_sdk/schemas/v1_organization.py +1 -0
  149. polyaxon/_sdk/schemas/v1_preset.py +8 -0
  150. polyaxon/_sdk/schemas/v1_project.py +1 -0
  151. polyaxon/_sdk/schemas/v1_project_settings.py +4 -2
  152. polyaxon/_sdk/schemas/v1_run.py +2 -2
  153. polyaxon/_sdk/schemas/v1_run_edge_lineage.py +14 -0
  154. polyaxon/_sdk/schemas/v1_run_edges_graph.py +9 -0
  155. polyaxon/_sdk/schemas/v1_section_spec.py +7 -2
  156. polyaxon/_sdk/schemas/v1_settings_catalog.py +1 -0
  157. polyaxon/_sdk/schemas/v1_team.py +3 -0
  158. polyaxon/_sdk/schemas/v1_user.py +1 -2
  159. polyaxon/_sdk/schemas/v1_user_access.py +17 -0
  160. polyaxon/_services/values.py +1 -0
  161. polyaxon/_sidecar/container/__init__.py +39 -18
  162. polyaxon/_sidecar/container/monitors/__init__.py +1 -0
  163. polyaxon/_sidecar/container/monitors/logs.py +10 -13
  164. polyaxon/_sidecar/container/monitors/spec.py +24 -0
  165. polyaxon/_sidecar/ignore.py +0 -1
  166. polyaxon/_utils/fqn_utils.py +25 -2
  167. polyaxon/client.py +1 -1
  168. polyaxon/pkg.py +1 -1
  169. polyaxon/schemas.py +8 -1
  170. polyaxon/settings.py +6 -0
  171. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/METADATA +43 -43
  172. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/RECORD +176 -155
  173. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/WHEEL +1 -1
  174. polyaxon/_sdk/schemas/v1_project_user_access.py +0 -10
  175. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/LICENSE +0 -0
  176. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/entry_points.txt +0 -0
  177. {polyaxon-2.0.0rc49.dist-info → polyaxon-2.4.0rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,132 @@
1
+ import os
2
+ import signal
3
+ import subprocess
4
+
5
+ from typing import Dict, List
6
+
7
+ from polyaxon._deploy.operators.cmd_operator import CmdOperator
8
+ from polyaxon._deploy.operators.conda import CondaOperator
9
+ from polyaxon._local_process import process_types
10
+ from polyaxon._local_process.converter.converters import CONVERTERS
11
+ from polyaxon._local_process.converter.mixins import MIXIN_MAPPING
12
+ from polyaxon._runner.executor import BaseExecutor
13
+ from polyaxon._runner.kinds import RunnerKind
14
+ from polyaxon._schemas.lifecycle import V1Statuses
15
+ from polyaxon.exceptions import PolyaxonAgentError
16
+ from polyaxon.logger import logger
17
+
18
+
19
class Executor(BaseExecutor):
    """Executor that runs the task containers of an operation as local processes.

    Processes started for a run are tracked in ``self._ops`` keyed by run uuid,
    so that ``stop`` and ``get`` can find them again.
    """

    MIXIN_MAPPING = MIXIN_MAPPING
    CONVERTERS = CONVERTERS
    RUNNER_KIND = RunnerKind.PROCESS

    def __init__(self, conda_env: str = None, venv: str = None):
        """Create the executor.

        Args:
            conda_env: optional conda environment name; when set, the conda
                operator is used as the manager.
            venv: optional virtualenv reference; stored but not read by any
                method of this class.
        """
        super().__init__()
        # run uuid -> list of processes started for that run, in start order.
        self._ops = {}
        self._conda_env = conda_env
        self._venv = venv

    def _get_manager(self):
        """Return the command operator matching the configured environment."""
        if self._conda_env:
            return CondaOperator()
        return CmdOperator()

    def _check_conda(self):
        """Validate that conda is available and the configured env exists.

        Raises:
            PolyaxonAgentError: if conda is missing or the env is not installed.
        """
        # `logger.error` returns None, so `raise logger.error(...)` can only
        # fail with a TypeError; raise a real, descriptive exception instead.
        if not self.manager.check():
            raise PolyaxonAgentError("Conda is required to run this command.")

        envs = self.manager.execute(["env", "list", "--json"], is_json=True)
        env_names = {os.path.basename(env) for env in envs["envs"]}
        if self._conda_env not in env_names:
            raise PolyaxonAgentError(
                "Conda env `{}` is not installed.".format(self._conda_env)
            )

    def _run_in_conda(self, cmd_bash, cmd_args):
        """Start ``cmd_args`` chained after activating the conda env.

        Args:
            cmd_bash: argv prefix of the shell invocation; the joined command
                string is appended to it as the last argument.
            cmd_args: shell commands, joined with ``&&`` after the activation.

        Returns:
            The started ``subprocess.Popen`` handle, so callers can wait on it
            or terminate it (the handle used to be dropped).
        """
        # NOTE(review): the env name is interpolated into a shell command line;
        # it must come from trusted configuration.
        activated = ["source activate {}".format(self._conda_env)] + cmd_args
        return subprocess.Popen(cmd_bash + [" && ".join(activated)], close_fds=True)

    def _get_op_proc(self, run_uuid: str) -> List[subprocess.Popen]:
        """Return the processes tracked for ``run_uuid``, or None if unknown."""
        return self._ops.get(run_uuid)

    def create(
        self,
        run_uuid: str,
        run_kind: str,
        resource: List[process_types.V1Container],
        namespace: str = None,
    ) -> Dict:
        """Run every task container of the operation, one after the other.

        Each task is started through the manager and waited for before the
        next one starts. The first task that does not succeed ends the run.

        Args:
            run_uuid: uuid of the run, used as tracking key.
            run_kind: kind of the run, only used for logging.
            resource: the task containers to execute, in order.
            namespace: accepted for interface compatibility; unused here.

        Returns:
            A dict with ``status`` and ``tasks`` (the started processes), plus
            ``message`` when a task did not succeed.
        """
        logger.info(f"[Executor] Starting operation {run_uuid} {run_kind}.")
        self._ops[run_uuid] = []
        for task in resource:
            logger.info(
                f"[Executor] Starting task container {task.name} {task.image} ."
            )
            proc = self.manager.execute(
                task.get_cmd_args(), env=os.environ, output_only=False
            )
            self._ops[run_uuid].append(proc)
            proc.wait()
            task_status = self._get_task_status(proc)
            message = f"Task container {task.name} {task.image} with id {proc.pid} {task_status}"
            if task_status == V1Statuses.SUCCEEDED:
                logger.info(f"[Executor] {message}")
            else:
                logger.warning(f"[Executor] {message}")
                self._clean_temp_execution_path(run_uuid)
                return {
                    "status": V1Statuses.FAILED,
                    "tasks": self._ops[run_uuid],
                    "message": message,
                }
        self._clean_temp_execution_path(run_uuid)
        return {"status": V1Statuses.SUCCEEDED, "tasks": self._ops[run_uuid]}

    def apply(
        self, run_uuid: str, run_kind: str, resource: Dict, namespace: str = None
    ) -> Dict:
        """Unsupported for local processes.

        Raises:
            PolyaxonAgentError: always.
        """
        raise PolyaxonAgentError(
            "Local process executor does not support apply method.\n"
            "Run: <kind: {}, uuid: {}>".format(run_kind, run_uuid)
        )

    def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
        """Terminate every process of the run that is still alive.

        A run that is not tracked (or has no process) is a no-op.
        """
        # `_get_op_proc` returns the list of processes of the run (or None),
        # so each process has to be handled individually.
        for proc in self._get_op_proc(run_uuid) or []:
            if proc.poll() is not None:
                continue
            # Kill the process tree rooted at the child if it's the leader of its
            # own process group, otherwise just kill the child.
            try:
                if proc.pid == os.getpgid(proc.pid):
                    os.killpg(proc.pid, signal.SIGTERM)
                else:
                    proc.terminate()
            except OSError:
                # The child process may have exited before we attempted to
                # terminate it, so OSErrors raised here are only logged.
                _msg = f"Failed to terminate operation {run_kind} {run_uuid} child process PID {proc.pid}"
                logger.debug(_msg)
            proc.wait()

    def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
        """Delegate to ``apply`` with a payload that clears the finalizers.

        NOTE(review): ``apply`` always raises for this executor, so this call
        currently always raises ``PolyaxonAgentError`` — confirm the intended
        clean-up behaviour for local processes.
        """
        return self.apply(
            run_uuid=run_uuid,
            run_kind=run_kind,
            resource={"metadata": {"finalizers": None}},
        )

    def _get_task_status(self, proc) -> V1Statuses:
        """Map the exit code of ``proc`` to a run status."""
        exit_code = proc.poll()
        if exit_code is None:
            return V1Statuses.RUNNING
        if exit_code == 0:
            return V1Statuses.SUCCEEDED
        return V1Statuses.FAILED

    def get(self, run_uuid: str, run_kind: str, namespace: str = None) -> V1Statuses:
        """Return the status of the most recently started process of the run.

        Raises:
            PolyaxonAgentError: if no process is tracked for the run.
        """
        procs = self._get_op_proc(run_uuid)
        if not procs:
            # Previously an unknown run failed with an opaque TypeError/IndexError.
            raise PolyaxonAgentError(
                "No local process is tracked for the run.\n"
                "Run: <kind: {}, uuid: {}>".format(run_kind, run_uuid)
            )
        return self._get_task_status(procs[-1])

    def list_ops(self, namespace: str = None):
        """Return the operations known to this executor; always an empty list."""
        return []
@@ -0,0 +1,39 @@
1
+ from typing import Dict, List, Optional, Tuple, Union
2
+
3
+ from clipped.compact.pydantic import Field
4
+
5
+ from polyaxon._schemas.base import BaseSchemaModel
6
+
7
+
8
class V1EnvVar(BaseSchemaModel):
    """Environment variable given either as a ``(name, value)`` tuple or as a mapping."""

    __root__: Union[Tuple[str, str], Dict[str, str]]

    def to_cmd(self):
        """Render the variable as a one-element list ``["NAME=value"]``.

        Raises:
            ValueError: if the mapping form is empty.
        """
        if isinstance(self.__root__, tuple):
            name, value = self.__root__[0], self.__root__[1]
        else:
            # `dict.items()` is a view and cannot be indexed; materialise it
            # before picking the entry.
            # NOTE(review): only the first entry is rendered — presumably each
            # mapping carries a single variable; confirm against the converter.
            entries = list(self.__root__.items())
            if not entries:
                raise ValueError("Environment variable mapping is empty.")
            name, value = entries[0]
        return [f"{name}={value}"]
17
+
18
+
19
class V1Container(BaseSchemaModel):
    """Description of a task container to be executed by the local process runner."""

    name: Optional[str]
    # Read by `get_cmd_args` but previously not declared on the model.
    image: Optional[str] = None
    command: Optional[List[str]]
    args: Optional[List[str]]
    env: Optional[List[V1EnvVar]]
    working_dir: Optional[str] = Field(alias="workingDir")

    def get_cmd_args(self):
        """Build the argument list used to start this container.

        The list starts with ``run --rm``, followed by one ``-e NAME=value``
        pair per environment variable, an optional ``-w`` working directory,
        an optional ``--entrypoint`` (first item of ``command``), the image,
        the remaining ``command`` items and finally ``args``.

        NOTE(review): this layout matches a ``docker run`` invocation —
        confirm it is what the local process manager expects.
        """
        cmd_args = ["run", "--rm"]
        # `env` is optional; iterating None would raise a TypeError.
        for env in self.env or []:
            cmd_args += ["-e"] + env.to_cmd()
        if self.working_dir:
            cmd_args += ["-w", self.working_dir]
        if self.command:
            cmd_args += ["--entrypoint", self.command[0]]
        if self.image:
            cmd_args += [self.image]
        if self.command:
            cmd_args += self.command[1:]
        if self.args:
            cmd_args += self.args
        return cmd_args
@@ -13,7 +13,9 @@ class AgentConfigManager(ConfigManager):
13
13
 
14
14
  VISIBILITY = ConfigManager.Visibility.GLOBAL
15
15
  CONFIG_FILE_NAME = ".agent"
16
+ ALTERNATE_CONFIG_FILE_NAME = ".sandbox"
16
17
  CONFIG: Type[AgentConfig] = AgentConfig
18
+ PERSIST_FORMAT = "yaml"
17
19
 
18
20
  @classmethod
19
21
  def get_config_or_default(cls) -> AgentConfig:
@@ -13,6 +13,7 @@ class HomeConfigManager(ConfigManager):
13
13
  VISIBILITY = ConfigManager.Visibility.GLOBAL
14
14
  CONFIG_FILE_NAME = ".home"
15
15
  CONFIG: Type[HomeConfig] = HomeConfig
16
+ PERSIST_FORMAT = "yaml"
16
17
 
17
18
  @classmethod
18
19
  def get_config_defaults(cls) -> Dict[str, str]:
@@ -23,7 +24,7 @@ class HomeConfigManager(ConfigManager):
23
24
  glob_path = cls.get_global_config_path()
24
25
  home_config = cls._CONFIG_READER.read_configs(
25
26
  [
26
- ConfigSpec(glob_path, config_type=".json", check_if_exists=False),
27
+ ConfigSpec(glob_path, config_type=".yaml", check_if_exists=False),
27
28
  os.environ,
28
29
  {"dummy": "dummy"},
29
30
  ]
@@ -31,6 +31,7 @@ def get_tuner(
31
31
 
32
32
  return V1Operation(
33
33
  queue=tuner.queue,
34
+ namespace=tuner.namespace,
34
35
  joins=[join],
35
36
  params=params,
36
37
  hub_ref=tuner.hub_ref,
@@ -80,6 +80,7 @@ def check_polyaxonfile(
80
80
  matrix: Optional[Union[Dict, V1Matrix]] = None,
81
81
  presets: Optional[List[str]] = None,
82
82
  queue: Optional[str] = None,
83
+ namespace: Optional[str] = None,
83
84
  nocache: Optional[bool] = None,
84
85
  cache: Optional[Union[int, str, bool]] = None,
85
86
  verbose: bool = True,
@@ -177,6 +178,7 @@ def check_polyaxonfile(
177
178
  matrix=matrix,
178
179
  presets=presets,
179
180
  queue=queue,
181
+ namespace=namespace,
180
182
  nocache=nocache,
181
183
  cache=cache,
182
184
  approved=approved,
@@ -29,6 +29,7 @@ def get_op_specification(
29
29
  matrix: Optional[Union[Dict, V1Matrix]] = None,
30
30
  presets: Optional[List[str]] = None,
31
31
  queue: Optional[str] = None,
32
+ namespace: Optional[str] = None,
32
33
  nocache: Optional[bool] = None,
33
34
  cache: Optional[Union[int, str, bool]] = None,
34
35
  approved: Optional[Union[int, str, bool]] = None,
@@ -70,6 +71,8 @@ def get_op_specification(
70
71
  # Check only
71
72
  get_queue_info(queue)
72
73
  op_data["queue"] = queue
74
+ if namespace:
75
+ op_data["namespace"] = namespace
73
76
  if cache is not None:
74
77
  op_data["cache"] = {"disable": not to_bool(cache)}
75
78
  if nocache:
@@ -20,6 +20,7 @@ def get_op_from_schedule(
20
20
  op_spec.skip_on_upstream_skip = None
21
21
  op_spec.cache = compiled_operation.cache
22
22
  op_spec.queue = compiled_operation.queue
23
+ op_spec.namespace = compiled_operation.namespace
23
24
  op_spec.component.inputs = compiled_operation.inputs
24
25
  op_spec.component.outputs = compiled_operation.outputs
25
26
  op_spec.component.run = compiled_operation.run
@@ -59,6 +60,7 @@ def get_ops_from_suggestions(
59
60
  op_spec.skip_on_upstream_skip = None
60
61
  op_spec.cache = compiled_operation.cache
61
62
  op_spec.queue = compiled_operation.queue
63
+ op_spec.namespace = compiled_operation.namespace
62
64
  op_spec.params = params
63
65
  op_spec.component.inputs = compiled_operation.inputs
64
66
  op_spec.component.outputs = compiled_operation.outputs
@@ -333,6 +333,7 @@ class CompiledOperationSpecification(BaseSpecification):
333
333
  "is_approved",
334
334
  "presets",
335
335
  "queue",
336
+ "namespace",
336
337
  "cache",
337
338
  "build",
338
339
  "hooks",
@@ -92,6 +92,7 @@ class OperationSpecification(BaseSpecification):
92
92
  "is_approved",
93
93
  "presets",
94
94
  "queue",
95
+ "namespace",
95
96
  "cache",
96
97
  "build",
97
98
  "hooks",
@@ -13,6 +13,7 @@ class Sections:
13
13
  QUEUE = "queue"
14
14
  CACHE = "cache"
15
15
  PLUGINS = "plugins"
16
+ NAMESPACE = "namespace"
16
17
  BUILD = "build"
17
18
  HOOKS = "hooks"
18
19
  EVENTS = "events"
@@ -55,6 +56,7 @@ class Sections:
55
56
  CACHE,
56
57
  QUEUE,
57
58
  PLUGINS,
59
+ NAMESPACE,
58
60
  BUILD,
59
61
  HOOKS,
60
62
  EVENTS,
@@ -89,6 +91,7 @@ class Sections:
89
91
  CACHE,
90
92
  CONNECTIONS,
91
93
  PLUGINS,
94
+ NAMESPACE,
92
95
  TERMINATION,
93
96
  SCHEDULE,
94
97
  DEPENDENCIES,
polyaxon/_pql/manager.py CHANGED
@@ -13,7 +13,7 @@ class PQLManager:
13
13
  FIELDS_PROXY = {}
14
14
  FIELDS_TRANS = {}
15
15
  FIELDS_ORDERING = None
16
- FIELDS_ORDERING_PROXY = None
16
+ FIELDS_ORDERING_PROXY = None # Do not set a field on both field and proxy
17
17
  FIELDS_DEFAULT_ORDERING = None
18
18
  FIELDS_DISTINCT = None
19
19
  CHECK_ALIVE = True
@@ -14,6 +14,7 @@ from polyaxon._env_vars.getters import get_run_info
14
14
  from polyaxon._runner.agent.base_agent import BaseAgent
15
15
  from polyaxon._sdk.schemas.v1_agent import V1Agent
16
16
  from polyaxon._sdk.schemas.v1_agent_state_response import V1AgentStateResponse
17
+ from polyaxon._utils.fqn_utils import get_run_instance
17
18
  from polyaxon.exceptions import ApiException as SDKApiException
18
19
  from polyaxon.exceptions import PolyaxonAgentError, PolyaxonConverterError
19
20
  from polyaxon.logger import logger
@@ -25,14 +26,14 @@ class BaseAsyncAgent(BaseAgent):
25
26
  async def _enter(self):
26
27
  if not self.client._is_managed:
27
28
  return self
28
- print("Agent is starting.")
29
+ logger.warning("Agent is starting.")
29
30
  await self.executor.refresh()
30
31
  try:
31
32
  agent = await self.client.get_info()
32
33
  self._check_status(agent)
33
34
  await self.sync()
34
35
  await self.client.log_agent_running()
35
- print("Agent is running.")
36
+ logger.warning("Agent is running.")
36
37
  return self
37
38
  except (ApiException, SDKApiException, HTTPError) as e:
38
39
  message = "Could not start the agent."
@@ -78,6 +79,49 @@ class BaseAsyncAgent(BaseAgent):
78
79
  ),
79
80
  )
80
81
 
82
async def reconcile(self):
    """Collect agent data and report the operations found by the executor.

    The call is throttled: nothing happens until at least
    ``SLEEP_AGENT_DATA_COLLECT_TIME`` seconds have passed since
    ``_last_reconciled_at``.

    Returns:
        The result of ``client.reconcile_agent`` when at least one operation
        was listed, otherwise None.
    """
    # The original comparison was inverted (`>`): it reconciled on every loop
    # iteration and stopped for good once the interval had elapsed.
    elapsed = (now() - self._last_reconciled_at).total_seconds()
    if elapsed < self.SLEEP_AGENT_DATA_COLLECT_TIME:
        return

    # Collect data
    await self.collect_agent_data()

    # Update reconcile
    namespaces = [settings.AGENT_CONFIG.namespace]
    namespaces += settings.AGENT_CONFIG.additional_namespaces or []
    ops = []
    for namespace in namespaces:
        _ops = await self.executor.list_ops(namespace=namespace)
        for op in _ops or []:
            metadata = op["metadata"]
            annotations = metadata["annotations"]
            ops.append(
                (
                    get_run_instance(
                        owner=annotations["operation.polyaxon.com/owner"],
                        project=annotations["operation.polyaxon.com/project"],
                        run_uuid=metadata["labels"]["app.kubernetes.io/instance"],
                    ),
                    annotations["operation.polyaxon.com/kind"],
                    annotations["operation.polyaxon.com/name"],
                    namespace,
                )
            )
    if not ops:
        return None

    logger.info("Reconcile agent.")
    return await self.client.reconcile_agent(
        reconcile={"ops": ops},
    )
124
+
81
125
  async def start(self):
82
126
  try:
83
127
  async with async_exit_context() as exit_event:
@@ -91,7 +135,9 @@ class BaseAsyncAgent(BaseAgent):
91
135
  except asyncio.TimeoutError:
92
136
  index += 1
93
137
  await self.refresh_executor()
94
- if not self._default_auth:
138
+ if self._default_auth:
139
+ await self.reconcile()
140
+ else:
95
141
  await self.cron()
96
142
  agent_state = await self.process()
97
143
  if not agent_state:
@@ -107,7 +153,7 @@ class BaseAsyncAgent(BaseAgent):
107
153
  timeout = get_wait(index, max_interval=self.max_interval)
108
154
  logger.info("Sleeping for {} seconds".format(timeout))
109
155
  except Exception as e:
110
- print(e)
156
+ logger.warning("Agent failed to start: {}".format(repr(e)))
111
157
  finally:
112
158
  self.end()
113
159
 
@@ -118,7 +164,7 @@ class BaseAsyncAgent(BaseAgent):
118
164
  self.sync_compatible_updates(agent_state.compatible_updates)
119
165
 
120
166
  if agent_state:
121
- logger.info("Starting runs submission process.")
167
+ logger.info("Checking agent state.")
122
168
  else:
123
169
  logger.info("No state was found.")
124
170
  return V1AgentStateResponse.construct()
@@ -185,7 +231,7 @@ class BaseAsyncAgent(BaseAgent):
185
231
  )
186
232
  return None
187
233
 
188
- async def submit_run(self, run_data: Tuple[str, str, str, str]):
234
+ async def submit_run(self, run_data: Tuple[str, str, str, str, str]):
189
235
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
190
236
  resource = await self.prepare_run_resource(
191
237
  owner_name=run_owner,
@@ -197,9 +243,13 @@ class BaseAsyncAgent(BaseAgent):
197
243
  if not resource:
198
244
  return
199
245
 
246
+ namespace = None if len(run_data) < 5 else run_data[4]
200
247
  try:
201
248
  await self.executor.create(
202
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
249
+ run_uuid=run_uuid,
250
+ run_kind=run_data[1],
251
+ resource=resource,
252
+ namespace=namespace,
203
253
  )
204
254
  except ApiException as e:
205
255
  if e.status == 409:
@@ -222,7 +272,7 @@ class BaseAsyncAgent(BaseAgent):
222
272
  )
223
273
 
224
274
  async def make_and_create_run(
225
- self, run_data: Tuple[str, str, str, str], default_auth: bool = False
275
+ self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
226
276
  ):
227
277
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
228
278
  resource = await self.make_run_resource(
@@ -236,9 +286,14 @@ class BaseAsyncAgent(BaseAgent):
236
286
  if not resource:
237
287
  return
238
288
 
289
+ namepsace = None if len(run_data) < 5 else run_data[4]
290
+
239
291
  try:
240
292
  await self.executor.create(
241
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
293
+ run_uuid=run_uuid,
294
+ run_kind=run_data[1],
295
+ resource=resource,
296
+ namespace=namepsace,
242
297
  )
243
298
  except ApiException as e:
244
299
  if e.status == 409:
@@ -252,7 +307,7 @@ class BaseAsyncAgent(BaseAgent):
252
307
  )
253
308
  )
254
309
 
255
- async def apply_run(self, run_data: Tuple[str, str, str, str]):
310
+ async def apply_run(self, run_data: Tuple[str, str, str, str, str]):
256
311
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
257
312
  resource = await self.prepare_run_resource(
258
313
  owner_name=run_owner,
@@ -264,9 +319,14 @@ class BaseAsyncAgent(BaseAgent):
264
319
  if not resource:
265
320
  return
266
321
 
322
+ namespace = None if len(run_data) < 5 else run_data[4]
323
+
267
324
  try:
268
325
  await self.executor.apply(
269
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
326
+ run_uuid=run_uuid,
327
+ run_kind=run_data[1],
328
+ resource=resource,
329
+ namespace=namespace,
270
330
  )
271
331
  await self.client.log_run_running(
272
332
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
@@ -275,12 +335,17 @@ class BaseAsyncAgent(BaseAgent):
275
335
  await self.client.log_run_failed(
276
336
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
277
337
  )
278
- await self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
338
+ await self.clean_run(
339
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
340
+ )
279
341
 
280
- async def check_run(self, run_data: Tuple[str, str]):
342
+ async def check_run(self, run_data: Tuple[str, str, str]):
281
343
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
344
+ namespace = None if len(run_data) < 3 else run_data[2]
282
345
  try:
283
- await self.executor.get(run_uuid=run_uuid, run_kind=run_data[1])
346
+ await self.executor.get(
347
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
348
+ )
284
349
  except ApiException as e:
285
350
  if e.status == 404:
286
351
  logger.info(
@@ -290,10 +355,13 @@ class BaseAsyncAgent(BaseAgent):
290
355
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
291
356
  )
292
357
 
293
- async def stop_run(self, run_data: Tuple[str, str]):
358
+ async def stop_run(self, run_data: Tuple[str, str, str]):
294
359
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
360
+ namespace = None if len(run_data) < 3 else run_data[2]
295
361
  try:
296
- await self.executor.stop(run_uuid=run_uuid, run_kind=run_data[1])
362
+ await self.executor.stop(
363
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
364
+ )
297
365
  except ApiException as e:
298
366
  if e.status == 404:
299
367
  logger.info("Run does not exist anymore, it could have been stopped.")
@@ -309,16 +377,24 @@ class BaseAsyncAgent(BaseAgent):
309
377
  message="Agent failed stopping run.\n",
310
378
  )
311
379
 
312
- async def delete_run(self, run_data: Tuple[str, str, str, str]):
380
+ async def delete_run(self, run_data: Tuple[str, str, str, str, str]):
313
381
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
314
- await self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
382
+ namespace = None if len(run_data) < 5 else run_data[4]
315
383
  if run_data[3]:
316
384
  await self.make_and_create_run(run_data)
385
+ else:
386
+ await self.clean_run(
387
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
388
+ )
317
389
 
318
- async def clean_run(self, run_uuid: str, run_kind: str):
390
+ async def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
319
391
  try:
320
- await self.executor.clean(run_uuid=run_uuid, run_kind=run_kind)
321
- await self.executor.stop(run_uuid=run_uuid, run_kind=run_kind)
392
+ await self.executor.clean(
393
+ run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
394
+ )
395
+ await self.executor.stop(
396
+ run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
397
+ )
322
398
  except ApiException as e:
323
399
  if e.status == 404:
324
400
  logger.info("Run does not exist.")
@@ -24,6 +24,7 @@ class BaseAgent:
24
24
  HEALTH_FILE = "/tmp/.healthz"
25
25
  SLEEP_STOP_TIME = 60 * 5
26
26
  SLEEP_ARCHIVED_TIME = 60 * 60
27
+ SLEEP_AGENT_DATA_COLLECT_TIME = 60 * 30
27
28
  IS_ASYNC = False
28
29
 
29
30
  def __init__(
@@ -41,6 +42,7 @@ class BaseAgent:
41
42
  self._default_auth = bool(agent_uuid)
42
43
  self._executor_refreshed_at = now()
43
44
  self._graceful_shutdown = False
45
+ self._last_reconciled_at = now()
44
46
  self.client = AgentClient(
45
47
  owner=owner, agent_uuid=agent_uuid, is_async=self.IS_ASYNC
46
48
  )
@@ -50,9 +52,25 @@ class BaseAgent:
50
52
  def sync(self):
51
53
  raise NotImplementedError
52
54
 
55
def reconcile(self):
    """Reconcile hook; not implemented on the base agent, subclasses must override it."""
    raise NotImplementedError
57
+
53
58
  def cron(self):
54
59
  return self.client.cron_agent()
55
60
 
61
def collect_agent_data(self):
    """Ask the client to collect agent data for the configured client namespace.

    The reconcile timestamp is refreshed before the request is made. Any
    exception raised by the request is logged as a warning and swallowed, in
    which case None is returned.
    """
    failure_template = "Agent failed to collect agent data: {}\nRetrying ..."
    logger.info("Collecting agent data.")
    self._last_reconciled_at = now()
    try:
        collected = self.client.collect_agent_data(
            namespace=settings.CLIENT_CONFIG.namespace
        )
    except Exception as e:
        logger.warning(failure_template.format(repr(e)))
        return None
    return collected
73
+
56
74
  def sync_compatible_updates(self, compatible_updates: Dict):
57
75
  if compatible_updates and settings.AGENT_CONFIG:
58
76
  init = compatible_updates.get("init")
@@ -114,14 +132,14 @@ class BaseAgent:
114
132
 
115
133
  def _check_status(self, agent_state):
116
134
  if agent_state.status == V1Statuses.STOPPED:
117
- print(
135
+ logger.warning(
118
136
  "Agent has been stopped from the platform,"
119
137
  "but the deployment is still running."
120
138
  "Please either set the agent to starting or teardown the agent deployment."
121
139
  )
122
140
  return self.end(sleep=self.SLEEP_STOP_TIME)
123
141
  elif agent_state.live_state < LiveState.LIVE:
124
- print(
142
+ logger.warning(
125
143
  "Agent has been archived from the platform,"
126
144
  "but the deployment is still running."
127
145
  "Please either restore the agent or teardown the agent deployment."
@@ -173,25 +191,25 @@ class BaseAgent:
173
191
  ) -> Optional[Any]:
174
192
  raise NotImplementedError
175
193
 
176
- def submit_run(self, run_data: Tuple[str, str, str, str]):
194
+ def submit_run(self, run_data: Tuple[str, str, str, str, str]):
177
195
  raise NotImplementedError
178
196
 
179
197
  def make_and_create_run(
180
- self, run_data: Tuple[str, str, str, str], default_auth: bool = False
198
+ self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
181
199
  ):
182
200
  raise NotImplementedError
183
201
 
184
- def apply_run(self, run_data: Tuple[str, str, str, str]):
202
+ def apply_run(self, run_data: Tuple[str, str, str, str, str]):
185
203
  raise NotImplementedError
186
204
 
187
- def check_run(self, run_data: Tuple[str, str]):
205
+ def check_run(self, run_data: Tuple[str, str, str]):
188
206
  raise NotImplementedError
189
207
 
190
- def stop_run(self, run_data: Tuple[str, str]):
208
+ def stop_run(self, run_data: Tuple[str, str, str]):
191
209
  raise NotImplementedError
192
210
 
193
- def delete_run(self, run_data: Tuple[str, str, str, str]):
211
+ def delete_run(self, run_data: Tuple[str, str, str, str, str]):
194
212
  raise NotImplementedError
195
213
 
196
- def clean_run(self, run_uuid: str, run_kind: str):
214
+ def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
197
215
  raise NotImplementedError
@@ -1,6 +1,6 @@
1
1
  import traceback
2
2
 
3
- from typing import Optional
3
+ from typing import Dict, Optional
4
4
 
5
5
  from polyaxon._schemas.lifecycle import V1StatusCondition, V1Statuses
6
6
  from polyaxon.client import PolyaxonClient, V1Agent, V1AgentStateResponse
@@ -65,6 +65,20 @@ class AgentClient:
65
65
  def cron_agent(self):
66
66
  return self.client.agents_v1.cron_agent(owner=self.owner, _request_timeout=10)
67
67
 
68
def collect_agent_data(self, namespace: str):
    """Forward a data-collection request for this agent to the internal agents API.

    Args:
        namespace: namespace passed through to the API call.

    Returns:
        Whatever the underlying API call returns.
    """
    internal_api = self.client.internal_agents_v1
    request = {
        "owner": self.owner,
        "uuid": self.agent_uuid,
        "namespace": namespace,
    }
    return internal_api.collect_agent_data(**request)
74
+
75
def reconcile_agent(self, reconcile: Dict):
    """Send the reconcile payload for this agent to the agents API.

    Args:
        reconcile: payload to send; it is wrapped under the ``"reconcile"``
            key of the request body.

    Returns:
        Whatever the underlying API call returns.
    """
    payload = {"reconcile": reconcile}
    agents_api = self.client.agents_v1
    return agents_api.reconcile_agent(
        owner=self.owner, uuid=self.agent_uuid, body=payload
    )
81
+
68
82
  def log_agent_running(self):
69
83
  return self.log_agent_status(status=V1Statuses.RUNNING, reason="AgentLogger")
70
84