polyaxon-2.0.6rc8-py3-none-any.whl → polyaxon-2.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. polyaxon/_cli/config.py +1 -1
  2. polyaxon/_cli/run.py +8 -0
  3. polyaxon/_cli/services/clean_artifacts.py +1 -1
  4. polyaxon/_client/client.py +17 -0
  5. polyaxon/_client/run.py +12 -0
  6. polyaxon/_compiler/resolver/agent.py +1 -1
  7. polyaxon/_compiler/resolver/runtime.py +1 -1
  8. polyaxon/_deploy/schemas/service.py +4 -0
  9. polyaxon/_docker/executor.py +10 -4
  10. polyaxon/_env_vars/getters/run.py +3 -0
  11. polyaxon/_env_vars/keys.py +5 -0
  12. polyaxon/_flow/__init__.py +2 -0
  13. polyaxon/_flow/builds/__init__.py +19 -6
  14. polyaxon/_flow/component/base.py +1 -0
  15. polyaxon/_flow/component/component.py +14 -0
  16. polyaxon/_flow/environment/__init__.py +5 -5
  17. polyaxon/_flow/hooks/__init__.py +19 -6
  18. polyaxon/_flow/matrix/tuner.py +18 -6
  19. polyaxon/_flow/operations/operation.py +19 -0
  20. polyaxon/_flow/run/__init__.py +2 -2
  21. polyaxon/_flow/run/kubeflow/paddle_job.py +34 -2
  22. polyaxon/_flow/run/kubeflow/pytorch_job.py +50 -3
  23. polyaxon/_flow/run/kubeflow/scheduling_policy.py +4 -0
  24. polyaxon/_flow/run/kubeflow/tf_job.py +2 -1
  25. polyaxon/_fs/fs.py +5 -0
  26. polyaxon/_k8s/converter/converters/job.py +1 -1
  27. polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
  28. polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
  29. polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
  30. polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
  31. polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
  32. polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
  33. polyaxon/_k8s/executor/base.py +23 -6
  34. polyaxon/_k8s/logging/async_monitor.py +73 -12
  35. polyaxon/_k8s/manager/async_manager.py +81 -23
  36. polyaxon/_k8s/manager/base.py +4 -0
  37. polyaxon/_k8s/manager/manager.py +266 -133
  38. polyaxon/_operations/tuner.py +1 -0
  39. polyaxon/_polyaxonfile/check.py +2 -0
  40. polyaxon/_polyaxonfile/manager/operations.py +3 -0
  41. polyaxon/_polyaxonfile/manager/workflows.py +2 -0
  42. polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
  43. polyaxon/_polyaxonfile/specs/operation.py +1 -0
  44. polyaxon/_polyaxonfile/specs/sections.py +3 -0
  45. polyaxon/_runner/agent/async_agent.py +94 -18
  46. polyaxon/_runner/agent/base_agent.py +25 -7
  47. polyaxon/_runner/agent/client.py +15 -1
  48. polyaxon/_runner/agent/sync_agent.py +83 -18
  49. polyaxon/_runner/executor.py +13 -7
  50. polyaxon/_schemas/agent.py +27 -1
  51. polyaxon/_schemas/client.py +30 -3
  52. polyaxon/_sdk/api/agents_v1_api.py +875 -51
  53. polyaxon/_sdk/api/service_accounts_v1_api.py +12 -12
  54. polyaxon/_sdk/schemas/__init__.py +3 -0
  55. polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
  56. polyaxon/_sidecar/container/__init__.py +1 -1
  57. polyaxon/_sidecar/container/monitors/spec.py +1 -1
  58. polyaxon/pkg.py +1 -1
  59. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/METADATA +6 -6
  60. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/RECORD +64 -63
  61. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/LICENSE +0 -0
  62. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/WHEEL +0 -0
  63. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/entry_points.txt +0 -0
  64. {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/top_level.txt +0 -0
@@ -31,6 +31,7 @@ def get_tuner(
31
31
 
32
32
  return V1Operation(
33
33
  queue=tuner.queue,
34
+ namespace=tuner.namespace,
34
35
  joins=[join],
35
36
  params=params,
36
37
  hub_ref=tuner.hub_ref,
@@ -80,6 +80,7 @@ def check_polyaxonfile(
80
80
  matrix: Optional[Union[Dict, V1Matrix]] = None,
81
81
  presets: Optional[List[str]] = None,
82
82
  queue: Optional[str] = None,
83
+ namespace: Optional[str] = None,
83
84
  nocache: Optional[bool] = None,
84
85
  cache: Optional[Union[int, str, bool]] = None,
85
86
  verbose: bool = True,
@@ -177,6 +178,7 @@ def check_polyaxonfile(
177
178
  matrix=matrix,
178
179
  presets=presets,
179
180
  queue=queue,
181
+ namespace=namespace,
180
182
  nocache=nocache,
181
183
  cache=cache,
182
184
  approved=approved,
@@ -29,6 +29,7 @@ def get_op_specification(
29
29
  matrix: Optional[Union[Dict, V1Matrix]] = None,
30
30
  presets: Optional[List[str]] = None,
31
31
  queue: Optional[str] = None,
32
+ namespace: Optional[str] = None,
32
33
  nocache: Optional[bool] = None,
33
34
  cache: Optional[Union[int, str, bool]] = None,
34
35
  approved: Optional[Union[int, str, bool]] = None,
@@ -70,6 +71,8 @@ def get_op_specification(
70
71
  # Check only
71
72
  get_queue_info(queue)
72
73
  op_data["queue"] = queue
74
+ if namespace:
75
+ op_data["namespace"] = namespace
73
76
  if cache is not None:
74
77
  op_data["cache"] = {"disable": not to_bool(cache)}
75
78
  if nocache:
@@ -20,6 +20,7 @@ def get_op_from_schedule(
20
20
  op_spec.skip_on_upstream_skip = None
21
21
  op_spec.cache = compiled_operation.cache
22
22
  op_spec.queue = compiled_operation.queue
23
+ op_spec.namespace = compiled_operation.namespace
23
24
  op_spec.component.inputs = compiled_operation.inputs
24
25
  op_spec.component.outputs = compiled_operation.outputs
25
26
  op_spec.component.run = compiled_operation.run
@@ -59,6 +60,7 @@ def get_ops_from_suggestions(
59
60
  op_spec.skip_on_upstream_skip = None
60
61
  op_spec.cache = compiled_operation.cache
61
62
  op_spec.queue = compiled_operation.queue
63
+ op_spec.namespace = compiled_operation.namespace
62
64
  op_spec.params = params
63
65
  op_spec.component.inputs = compiled_operation.inputs
64
66
  op_spec.component.outputs = compiled_operation.outputs
@@ -333,6 +333,7 @@ class CompiledOperationSpecification(BaseSpecification):
333
333
  "is_approved",
334
334
  "presets",
335
335
  "queue",
336
+ "namespace",
336
337
  "cache",
337
338
  "build",
338
339
  "hooks",
@@ -92,6 +92,7 @@ class OperationSpecification(BaseSpecification):
92
92
  "is_approved",
93
93
  "presets",
94
94
  "queue",
95
+ "namespace",
95
96
  "cache",
96
97
  "build",
97
98
  "hooks",
@@ -13,6 +13,7 @@ class Sections:
13
13
  QUEUE = "queue"
14
14
  CACHE = "cache"
15
15
  PLUGINS = "plugins"
16
+ NAMESPACE = "namespace"
16
17
  BUILD = "build"
17
18
  HOOKS = "hooks"
18
19
  EVENTS = "events"
@@ -55,6 +56,7 @@ class Sections:
55
56
  CACHE,
56
57
  QUEUE,
57
58
  PLUGINS,
59
+ NAMESPACE,
58
60
  BUILD,
59
61
  HOOKS,
60
62
  EVENTS,
@@ -89,6 +91,7 @@ class Sections:
89
91
  CACHE,
90
92
  CONNECTIONS,
91
93
  PLUGINS,
94
+ NAMESPACE,
92
95
  TERMINATION,
93
96
  SCHEDULE,
94
97
  DEPENDENCIES,
@@ -14,6 +14,7 @@ from polyaxon._env_vars.getters import get_run_info
14
14
  from polyaxon._runner.agent.base_agent import BaseAgent
15
15
  from polyaxon._sdk.schemas.v1_agent import V1Agent
16
16
  from polyaxon._sdk.schemas.v1_agent_state_response import V1AgentStateResponse
17
+ from polyaxon._utils.fqn_utils import get_run_instance
17
18
  from polyaxon.exceptions import ApiException as SDKApiException
18
19
  from polyaxon.exceptions import PolyaxonAgentError, PolyaxonConverterError
19
20
  from polyaxon.logger import logger
@@ -78,6 +79,49 @@ class BaseAsyncAgent(BaseAgent):
78
79
  ),
79
80
  )
80
81
 
82
+ async def reconcile(self):
83
+ if (
84
+ now() - self._last_reconciled_at
85
+ ).total_seconds() > self.SLEEP_AGENT_DATA_COLLECT_TIME:
86
+ return
87
+
88
+ # Collect data
89
+ await self.collect_agent_data()
90
+
91
+ # Update reconcile
92
+ namespaces = [settings.AGENT_CONFIG.namespace]
93
+ namespaces += settings.AGENT_CONFIG.additional_namespaces or []
94
+ ops = []
95
+ for namespace in namespaces:
96
+ _ops = await self.executor.list_ops(namespace=namespace)
97
+ if _ops:
98
+ ops += [
99
+ (
100
+ get_run_instance(
101
+ owner=op["metadata"]["annotations"][
102
+ "operation.polyaxon.com/owner"
103
+ ],
104
+ project=op["metadata"]["annotations"][
105
+ "operation.polyaxon.com/project"
106
+ ],
107
+ run_uuid=op["metadata"]["labels"][
108
+ "app.kubernetes.io/instance"
109
+ ],
110
+ ),
111
+ op["metadata"]["annotations"]["operation.polyaxon.com/kind"],
112
+ op["metadata"]["annotations"]["operation.polyaxon.com/name"],
113
+ namespace,
114
+ )
115
+ for op in _ops
116
+ ]
117
+ if not ops:
118
+ return None
119
+
120
+ logger.info("Reconcile agent.")
121
+ return await self.client.reconcile_agent(
122
+ reconcile={"ops": ops},
123
+ )
124
+
81
125
  async def start(self):
82
126
  try:
83
127
  async with async_exit_context() as exit_event:
@@ -91,7 +135,9 @@ class BaseAsyncAgent(BaseAgent):
91
135
  except asyncio.TimeoutError:
92
136
  index += 1
93
137
  await self.refresh_executor()
94
- if not self._default_auth:
138
+ if self._default_auth:
139
+ await self.reconcile()
140
+ else:
95
141
  await self.cron()
96
142
  agent_state = await self.process()
97
143
  if not agent_state:
@@ -118,7 +164,7 @@ class BaseAsyncAgent(BaseAgent):
118
164
  self.sync_compatible_updates(agent_state.compatible_updates)
119
165
 
120
166
  if agent_state:
121
- logger.info("Starting runs submission process.")
167
+ logger.info("Checking agent state.")
122
168
  else:
123
169
  logger.info("No state was found.")
124
170
  return V1AgentStateResponse.construct()
@@ -185,7 +231,7 @@ class BaseAsyncAgent(BaseAgent):
185
231
  )
186
232
  return None
187
233
 
188
- async def submit_run(self, run_data: Tuple[str, str, str, str]):
234
+ async def submit_run(self, run_data: Tuple[str, str, str, str, str]):
189
235
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
190
236
  resource = await self.prepare_run_resource(
191
237
  owner_name=run_owner,
@@ -197,9 +243,13 @@ class BaseAsyncAgent(BaseAgent):
197
243
  if not resource:
198
244
  return
199
245
 
246
+ namespace = None if len(run_data) < 5 else run_data[4]
200
247
  try:
201
248
  await self.executor.create(
202
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
249
+ run_uuid=run_uuid,
250
+ run_kind=run_data[1],
251
+ resource=resource,
252
+ namespace=namespace,
203
253
  )
204
254
  except ApiException as e:
205
255
  if e.status == 409:
@@ -222,7 +272,7 @@ class BaseAsyncAgent(BaseAgent):
222
272
  )
223
273
 
224
274
  async def make_and_create_run(
225
- self, run_data: Tuple[str, str, str, str], default_auth: bool = False
275
+ self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
226
276
  ):
227
277
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
228
278
  resource = await self.make_run_resource(
@@ -236,9 +286,14 @@ class BaseAsyncAgent(BaseAgent):
236
286
  if not resource:
237
287
  return
238
288
 
289
+ namepsace = None if len(run_data) < 5 else run_data[4]
290
+
239
291
  try:
240
292
  await self.executor.create(
241
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
293
+ run_uuid=run_uuid,
294
+ run_kind=run_data[1],
295
+ resource=resource,
296
+ namespace=namepsace,
242
297
  )
243
298
  except ApiException as e:
244
299
  if e.status == 409:
@@ -252,7 +307,7 @@ class BaseAsyncAgent(BaseAgent):
252
307
  )
253
308
  )
254
309
 
255
- async def apply_run(self, run_data: Tuple[str, str, str, str]):
310
+ async def apply_run(self, run_data: Tuple[str, str, str, str, str]):
256
311
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
257
312
  resource = await self.prepare_run_resource(
258
313
  owner_name=run_owner,
@@ -264,9 +319,14 @@ class BaseAsyncAgent(BaseAgent):
264
319
  if not resource:
265
320
  return
266
321
 
322
+ namespace = None if len(run_data) < 5 else run_data[4]
323
+
267
324
  try:
268
325
  await self.executor.apply(
269
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
326
+ run_uuid=run_uuid,
327
+ run_kind=run_data[1],
328
+ resource=resource,
329
+ namespace=namespace,
270
330
  )
271
331
  await self.client.log_run_running(
272
332
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
@@ -275,12 +335,17 @@ class BaseAsyncAgent(BaseAgent):
275
335
  await self.client.log_run_failed(
276
336
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
277
337
  )
278
- await self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
338
+ await self.clean_run(
339
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
340
+ )
279
341
 
280
- async def check_run(self, run_data: Tuple[str, str]):
342
+ async def check_run(self, run_data: Tuple[str, str, str]):
281
343
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
344
+ namespace = None if len(run_data) < 3 else run_data[2]
282
345
  try:
283
- await self.executor.get(run_uuid=run_uuid, run_kind=run_data[1])
346
+ await self.executor.get(
347
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
348
+ )
284
349
  except ApiException as e:
285
350
  if e.status == 404:
286
351
  logger.info(
@@ -290,10 +355,13 @@ class BaseAsyncAgent(BaseAgent):
290
355
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
291
356
  )
292
357
 
293
- async def stop_run(self, run_data: Tuple[str, str]):
358
+ async def stop_run(self, run_data: Tuple[str, str, str]):
294
359
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
360
+ namespace = None if len(run_data) < 3 else run_data[2]
295
361
  try:
296
- await self.executor.stop(run_uuid=run_uuid, run_kind=run_data[1])
362
+ await self.executor.stop(
363
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
364
+ )
297
365
  except ApiException as e:
298
366
  if e.status == 404:
299
367
  logger.info("Run does not exist anymore, it could have been stopped.")
@@ -309,16 +377,24 @@ class BaseAsyncAgent(BaseAgent):
309
377
  message="Agent failed stopping run.\n",
310
378
  )
311
379
 
312
- async def delete_run(self, run_data: Tuple[str, str, str, str]):
380
+ async def delete_run(self, run_data: Tuple[str, str, str, str, str]):
313
381
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
314
- await self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
382
+ namespace = None if len(run_data) < 5 else run_data[4]
315
383
  if run_data[3]:
316
384
  await self.make_and_create_run(run_data)
385
+ else:
386
+ await self.clean_run(
387
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
388
+ )
317
389
 
318
- async def clean_run(self, run_uuid: str, run_kind: str):
390
+ async def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
319
391
  try:
320
- await self.executor.clean(run_uuid=run_uuid, run_kind=run_kind)
321
- await self.executor.stop(run_uuid=run_uuid, run_kind=run_kind)
392
+ await self.executor.clean(
393
+ run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
394
+ )
395
+ await self.executor.stop(
396
+ run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
397
+ )
322
398
  except ApiException as e:
323
399
  if e.status == 404:
324
400
  logger.info("Run does not exist.")
@@ -24,6 +24,7 @@ class BaseAgent:
24
24
  HEALTH_FILE = "/tmp/.healthz"
25
25
  SLEEP_STOP_TIME = 60 * 5
26
26
  SLEEP_ARCHIVED_TIME = 60 * 60
27
+ SLEEP_AGENT_DATA_COLLECT_TIME = 60 * 30
27
28
  IS_ASYNC = False
28
29
 
29
30
  def __init__(
@@ -41,6 +42,7 @@ class BaseAgent:
41
42
  self._default_auth = bool(agent_uuid)
42
43
  self._executor_refreshed_at = now()
43
44
  self._graceful_shutdown = False
45
+ self._last_reconciled_at = now()
44
46
  self.client = AgentClient(
45
47
  owner=owner, agent_uuid=agent_uuid, is_async=self.IS_ASYNC
46
48
  )
@@ -50,9 +52,25 @@ class BaseAgent:
50
52
  def sync(self):
51
53
  raise NotImplementedError
52
54
 
55
+ def reconcile(self):
56
+ raise NotImplementedError
57
+
53
58
  def cron(self):
54
59
  return self.client.cron_agent()
55
60
 
61
+ def collect_agent_data(self):
62
+ logger.info("Collecting agent data.")
63
+ self._last_reconciled_at = now()
64
+ try:
65
+ return self.client.collect_agent_data(
66
+ namespace=settings.CLIENT_CONFIG.namespace
67
+ )
68
+ except Exception as e:
69
+ logger.warning(
70
+ "Agent failed to collect agent data: {}\n"
71
+ "Retrying ...".format(repr(e))
72
+ )
73
+
56
74
  def sync_compatible_updates(self, compatible_updates: Dict):
57
75
  if compatible_updates and settings.AGENT_CONFIG:
58
76
  init = compatible_updates.get("init")
@@ -173,25 +191,25 @@ class BaseAgent:
173
191
  ) -> Optional[Any]:
174
192
  raise NotImplementedError
175
193
 
176
- def submit_run(self, run_data: Tuple[str, str, str, str]):
194
+ def submit_run(self, run_data: Tuple[str, str, str, str, str]):
177
195
  raise NotImplementedError
178
196
 
179
197
  def make_and_create_run(
180
- self, run_data: Tuple[str, str, str, str], default_auth: bool = False
198
+ self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
181
199
  ):
182
200
  raise NotImplementedError
183
201
 
184
- def apply_run(self, run_data: Tuple[str, str, str, str]):
202
+ def apply_run(self, run_data: Tuple[str, str, str, str, str]):
185
203
  raise NotImplementedError
186
204
 
187
- def check_run(self, run_data: Tuple[str, str]):
205
+ def check_run(self, run_data: Tuple[str, str, str]):
188
206
  raise NotImplementedError
189
207
 
190
- def stop_run(self, run_data: Tuple[str, str]):
208
+ def stop_run(self, run_data: Tuple[str, str, str]):
191
209
  raise NotImplementedError
192
210
 
193
- def delete_run(self, run_data: Tuple[str, str, str, str]):
211
+ def delete_run(self, run_data: Tuple[str, str, str, str, str]):
194
212
  raise NotImplementedError
195
213
 
196
- def clean_run(self, run_uuid: str, run_kind: str):
214
+ def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
197
215
  raise NotImplementedError
@@ -1,6 +1,6 @@
1
1
  import traceback
2
2
 
3
- from typing import Optional
3
+ from typing import Dict, Optional
4
4
 
5
5
  from polyaxon._schemas.lifecycle import V1StatusCondition, V1Statuses
6
6
  from polyaxon.client import PolyaxonClient, V1Agent, V1AgentStateResponse
@@ -65,6 +65,20 @@ class AgentClient:
65
65
  def cron_agent(self):
66
66
  return self.client.agents_v1.cron_agent(owner=self.owner, _request_timeout=10)
67
67
 
68
+ def collect_agent_data(self, namespace: str):
69
+ return self.client.internal_agents_v1.collect_agent_data(
70
+ owner=self.owner,
71
+ uuid=self.agent_uuid,
72
+ namespace=namespace,
73
+ )
74
+
75
+ def reconcile_agent(self, reconcile: Dict):
76
+ return self.client.agents_v1.reconcile_agent(
77
+ owner=self.owner,
78
+ uuid=self.agent_uuid,
79
+ body={"reconcile": reconcile},
80
+ )
81
+
68
82
  def log_agent_running(self):
69
83
  return self.log_agent_status(status=V1Statuses.RUNNING, reason="AgentLogger")
70
84
 
@@ -13,6 +13,7 @@ from urllib3.exceptions import HTTPError
13
13
  from polyaxon import pkg, settings
14
14
  from polyaxon._env_vars.getters import get_run_info
15
15
  from polyaxon._runner.agent.base_agent import BaseAgent
16
+ from polyaxon._utils.fqn_utils import get_run_instance
16
17
  from polyaxon.client import V1Agent, V1AgentStateResponse
17
18
  from polyaxon.exceptions import ApiException as SDKApiException
18
19
  from polyaxon.exceptions import PolyaxonAgentError, PolyaxonConverterError
@@ -76,6 +77,49 @@ class BaseSyncAgent(BaseAgent):
76
77
  ),
77
78
  )
78
79
 
80
+ def reconcile(self):
81
+ if (
82
+ now() - self._last_reconciled_at
83
+ ).total_seconds() > self.SLEEP_AGENT_DATA_COLLECT_TIME:
84
+ return
85
+
86
+ # Collect data
87
+ self.collect_agent_data()
88
+
89
+ # Update reconcile
90
+ namespaces = [settings.AGENT_CONFIG.namespace]
91
+ namespaces += settings.AGENT_CONFIG.additional_namespaces or []
92
+ ops = []
93
+ for namespace in namespaces:
94
+ _ops = self.executor.list_ops(namespace=namespace)
95
+ if _ops:
96
+ ops += [
97
+ (
98
+ get_run_instance(
99
+ owner=op["metadata"]["annotations"][
100
+ "operation.polyaxon.com/owner"
101
+ ],
102
+ project=op["metadata"]["annotations"][
103
+ "operation.polyaxon.com/project"
104
+ ],
105
+ run_uuid=op["metadata"]["labels"][
106
+ "app.kubernetes.io/instance"
107
+ ],
108
+ ),
109
+ op["metadata"]["annotations"]["operation.polyaxon.com/kind"],
110
+ op["metadata"]["annotations"]["operation.polyaxon.com/name"],
111
+ namespace,
112
+ )
113
+ for op in _ops
114
+ ]
115
+ if not ops:
116
+ return None
117
+
118
+ logger.info("Reconcile agent.")
119
+ return self.client.reconcile_agent(
120
+ reconcile={"ops": ops},
121
+ )
122
+
79
123
  def start(self):
80
124
  try:
81
125
  with sync_exit_context() as exit_event:
@@ -88,7 +132,9 @@ class BaseSyncAgent(BaseAgent):
88
132
  while not exit_event.wait(timeout=timeout):
89
133
  index += 1
90
134
  self.refresh_executor()
91
- if not self._default_auth:
135
+ if self._default_auth:
136
+ self.reconcile()
137
+ else:
92
138
  self.cron()
93
139
  agent_state = self.process(pool)
94
140
  if not agent_state:
@@ -113,7 +159,7 @@ class BaseSyncAgent(BaseAgent):
113
159
  self.sync_compatible_updates(agent_state.compatible_updates)
114
160
 
115
161
  if agent_state:
116
- logger.info("Starting runs submission process.")
162
+ logger.info("Checking agent state.")
117
163
  else:
118
164
  logger.info("No state was found.")
119
165
  return V1AgentStateResponse.construct()
@@ -180,7 +226,7 @@ class BaseSyncAgent(BaseAgent):
180
226
  )
181
227
  return None
182
228
 
183
- def submit_run(self, run_data: Tuple[str, str, str, str]):
229
+ def submit_run(self, run_data: Tuple[str, str, str, str, str]):
184
230
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
185
231
  resource = self.prepare_run_resource(
186
232
  owner_name=run_owner,
@@ -194,7 +240,10 @@ class BaseSyncAgent(BaseAgent):
194
240
 
195
241
  try:
196
242
  self.executor.create(
197
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
243
+ run_uuid=run_uuid,
244
+ run_kind=run_data[1],
245
+ resource=resource,
246
+ namespace=run_data[4],
198
247
  )
199
248
  except ApiException as e:
200
249
  if e.status == 409:
@@ -217,7 +266,7 @@ class BaseSyncAgent(BaseAgent):
217
266
  )
218
267
 
219
268
  def make_and_create_run(
220
- self, run_data: Tuple[str, str, str, str], default_auth: bool = False
269
+ self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
221
270
  ):
222
271
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
223
272
  resource = self.make_run_resource(
@@ -233,7 +282,10 @@ class BaseSyncAgent(BaseAgent):
233
282
 
234
283
  try:
235
284
  self.executor.create(
236
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
285
+ run_uuid=run_uuid,
286
+ run_kind=run_data[1],
287
+ resource=resource,
288
+ namespace=run_data[4],
237
289
  )
238
290
  except ApiException as e:
239
291
  if e.status == 409:
@@ -247,7 +299,7 @@ class BaseSyncAgent(BaseAgent):
247
299
  )
248
300
  )
249
301
 
250
- def apply_run(self, run_data: Tuple[str, str, str, str]):
302
+ def apply_run(self, run_data: Tuple[str, str, str, str, str]):
251
303
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
252
304
  resource = self.prepare_run_resource(
253
305
  owner_name=run_owner,
@@ -261,7 +313,10 @@ class BaseSyncAgent(BaseAgent):
261
313
 
262
314
  try:
263
315
  self.executor.apply(
264
- run_uuid=run_uuid, run_kind=run_data[1], resource=resource
316
+ run_uuid=run_uuid,
317
+ run_kind=run_data[1],
318
+ resource=resource,
319
+ namespace=run_data[4],
265
320
  )
266
321
  self.client.log_run_running(
267
322
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
@@ -270,12 +325,16 @@ class BaseSyncAgent(BaseAgent):
270
325
  self.client.log_run_failed(
271
326
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
272
327
  )
273
- self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
328
+ self.clean_run(
329
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[4]
330
+ )
274
331
 
275
- def check_run(self, run_data: Tuple[str, str]):
332
+ def check_run(self, run_data: Tuple[str, str, str]):
276
333
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
277
334
  try:
278
- self.executor.get(run_uuid=run_uuid, run_kind=run_data[1])
335
+ self.executor.get(
336
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[2]
337
+ )
279
338
  except ApiException as e:
280
339
  if e.status == 404:
281
340
  logger.info(
@@ -285,10 +344,12 @@ class BaseSyncAgent(BaseAgent):
285
344
  run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
286
345
  )
287
346
 
288
- def stop_run(self, run_data: Tuple[str, str]):
347
+ def stop_run(self, run_data: Tuple[str, str, str]):
289
348
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
290
349
  try:
291
- self.executor.stop(run_uuid=run_uuid, run_kind=run_data[1])
350
+ self.executor.stop(
351
+ run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[2]
352
+ )
292
353
  except ApiException as e:
293
354
  if e.status == 404:
294
355
  logger.info("Run does not exist anymore, it could have been stopped.")
@@ -304,16 +365,20 @@ class BaseSyncAgent(BaseAgent):
304
365
  message="Agent failed stopping run.\n",
305
366
  )
306
367
 
307
- def delete_run(self, run_data: Tuple[str, str, str, str]):
368
+ def delete_run(self, run_data: Tuple[str, str, str, str, str]):
308
369
  run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
309
- self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
370
+ self.clean_run(run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[4])
310
371
  if run_data[3]:
311
372
  self.make_and_create_run(run_data)
312
373
 
313
- def clean_run(self, run_uuid: str, run_kind: str):
374
+ def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
314
375
  try:
315
- self.executor.clean(run_uuid=run_uuid, run_kind=run_kind)
316
- self.executor.stop(run_uuid=run_uuid, run_kind=run_kind)
376
+ self.executor.clean(
377
+ run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
378
+ )
379
+ self.executor.stop(
380
+ run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
381
+ )
317
382
  except ApiException as e:
318
383
  if e.status == 404:
319
384
  logger.info("Run does not exist.")
@@ -50,19 +50,21 @@ class BaseExecutor:
50
50
  self._manager = None
51
51
  return self.manager
52
52
 
53
- def get(self, run_uuid: str, run_kind: str):
53
+ def get(self, run_uuid: str, run_kind: str, namespace: str = None):
54
54
  raise NotImplementedError
55
55
 
56
- def create(self, run_uuid: str, run_kind: str, resource: Any):
56
+ def create(
57
+ self, run_uuid: str, run_kind: str, resource: Any, namespace: str = None
58
+ ):
57
59
  raise NotImplementedError
58
60
 
59
- def apply(self, run_uuid: str, run_kind: str, resource: Any):
61
+ def apply(self, run_uuid: str, run_kind: str, resource: Any, namespace: str = None):
60
62
  raise NotImplementedError
61
63
 
62
- def stop(self, run_uuid: str, run_kind: str):
64
+ def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
63
65
  raise NotImplementedError
64
66
 
65
- def clean(self, run_uuid: str, run_kind: str):
67
+ def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
66
68
  raise NotImplementedError
67
69
 
68
70
  def _clean_temp_execution_path(self, run_uuid: str):
@@ -154,7 +156,7 @@ class BaseExecutor:
154
156
  run_name=run_name,
155
157
  run_uuid=run_uuid,
156
158
  run_path=run_uuid,
157
- namespace=agent_env.namespace,
159
+ namespace=compiled_operation.namespace or agent_env.namespace,
158
160
  compiled_operation=compiled_operation,
159
161
  polyaxon_init=agent_env.polyaxon_init,
160
162
  polyaxon_sidecar=agent_env.polyaxon_sidecar,
@@ -191,7 +193,7 @@ class BaseExecutor:
191
193
  params=operation.params,
192
194
  )
193
195
  return cls.get_resource(
194
- namespace=resolver_obj.namespace,
196
+ namespace=compiled_operation.namespace or resolver_obj.namespace,
195
197
  owner_name=resolver_obj.owner_name,
196
198
  project_name=resolver_obj.project_name,
197
199
  run_name=resolver_obj.run_name,
@@ -223,4 +225,8 @@ class BaseExecutor:
223
225
  run_uuid=response.uuid,
224
226
  run_kind=response.kind,
225
227
  resource=resource,
228
+ namespace=response.namespace,
226
229
  )
230
+
231
+ def list_ops(self, namespace: str = None):
232
+ raise NotImplementedError