polyaxon 2.0.6rc8__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polyaxon/_cli/config.py +1 -1
- polyaxon/_cli/run.py +8 -0
- polyaxon/_cli/services/clean_artifacts.py +1 -1
- polyaxon/_client/client.py +17 -0
- polyaxon/_client/run.py +12 -0
- polyaxon/_compiler/resolver/agent.py +1 -1
- polyaxon/_compiler/resolver/runtime.py +1 -1
- polyaxon/_deploy/schemas/service.py +4 -0
- polyaxon/_docker/executor.py +10 -4
- polyaxon/_env_vars/getters/run.py +3 -0
- polyaxon/_env_vars/keys.py +5 -0
- polyaxon/_flow/__init__.py +2 -0
- polyaxon/_flow/builds/__init__.py +19 -6
- polyaxon/_flow/component/base.py +1 -0
- polyaxon/_flow/component/component.py +14 -0
- polyaxon/_flow/environment/__init__.py +5 -5
- polyaxon/_flow/hooks/__init__.py +19 -6
- polyaxon/_flow/matrix/tuner.py +18 -6
- polyaxon/_flow/operations/operation.py +19 -0
- polyaxon/_flow/run/__init__.py +2 -2
- polyaxon/_flow/run/kubeflow/paddle_job.py +34 -2
- polyaxon/_flow/run/kubeflow/pytorch_job.py +50 -3
- polyaxon/_flow/run/kubeflow/scheduling_policy.py +4 -0
- polyaxon/_flow/run/kubeflow/tf_job.py +2 -1
- polyaxon/_fs/fs.py +5 -0
- polyaxon/_k8s/converter/converters/job.py +1 -1
- polyaxon/_k8s/converter/converters/kubeflow/paddle_job.py +1 -0
- polyaxon/_k8s/converter/converters/kubeflow/pytroch_job.py +2 -0
- polyaxon/_k8s/converter/converters/kubeflow/tf_job.py +1 -0
- polyaxon/_k8s/custom_resources/kubeflow/paddle_job.py +10 -1
- polyaxon/_k8s/custom_resources/kubeflow/pytorch_job.py +14 -1
- polyaxon/_k8s/custom_resources/kubeflow/tf_job.py +4 -0
- polyaxon/_k8s/executor/base.py +23 -6
- polyaxon/_k8s/logging/async_monitor.py +73 -12
- polyaxon/_k8s/manager/async_manager.py +81 -23
- polyaxon/_k8s/manager/base.py +4 -0
- polyaxon/_k8s/manager/manager.py +266 -133
- polyaxon/_operations/tuner.py +1 -0
- polyaxon/_polyaxonfile/check.py +2 -0
- polyaxon/_polyaxonfile/manager/operations.py +3 -0
- polyaxon/_polyaxonfile/manager/workflows.py +2 -0
- polyaxon/_polyaxonfile/specs/compiled_operation.py +1 -0
- polyaxon/_polyaxonfile/specs/operation.py +1 -0
- polyaxon/_polyaxonfile/specs/sections.py +3 -0
- polyaxon/_runner/agent/async_agent.py +94 -18
- polyaxon/_runner/agent/base_agent.py +25 -7
- polyaxon/_runner/agent/client.py +15 -1
- polyaxon/_runner/agent/sync_agent.py +83 -18
- polyaxon/_runner/executor.py +13 -7
- polyaxon/_schemas/agent.py +27 -1
- polyaxon/_schemas/client.py +30 -3
- polyaxon/_sdk/api/agents_v1_api.py +875 -51
- polyaxon/_sdk/api/service_accounts_v1_api.py +12 -12
- polyaxon/_sdk/schemas/__init__.py +3 -0
- polyaxon/_sdk/schemas/v1_agent_reconcile_body_request.py +14 -0
- polyaxon/_sidecar/container/__init__.py +1 -1
- polyaxon/_sidecar/container/monitors/spec.py +1 -1
- polyaxon/pkg.py +1 -1
- {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/METADATA +6 -6
- {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/RECORD +64 -63
- {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/LICENSE +0 -0
- {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/WHEEL +0 -0
- {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/entry_points.txt +0 -0
- {polyaxon-2.0.6rc8.dist-info → polyaxon-2.1.0.dist-info}/top_level.txt +0 -0
polyaxon/_operations/tuner.py
CHANGED
polyaxon/_polyaxonfile/check.py
CHANGED
@@ -80,6 +80,7 @@ def check_polyaxonfile(
|
|
80
80
|
matrix: Optional[Union[Dict, V1Matrix]] = None,
|
81
81
|
presets: Optional[List[str]] = None,
|
82
82
|
queue: Optional[str] = None,
|
83
|
+
namespace: Optional[str] = None,
|
83
84
|
nocache: Optional[bool] = None,
|
84
85
|
cache: Optional[Union[int, str, bool]] = None,
|
85
86
|
verbose: bool = True,
|
@@ -177,6 +178,7 @@ def check_polyaxonfile(
|
|
177
178
|
matrix=matrix,
|
178
179
|
presets=presets,
|
179
180
|
queue=queue,
|
181
|
+
namespace=namespace,
|
180
182
|
nocache=nocache,
|
181
183
|
cache=cache,
|
182
184
|
approved=approved,
|
@@ -29,6 +29,7 @@ def get_op_specification(
|
|
29
29
|
matrix: Optional[Union[Dict, V1Matrix]] = None,
|
30
30
|
presets: Optional[List[str]] = None,
|
31
31
|
queue: Optional[str] = None,
|
32
|
+
namespace: Optional[str] = None,
|
32
33
|
nocache: Optional[bool] = None,
|
33
34
|
cache: Optional[Union[int, str, bool]] = None,
|
34
35
|
approved: Optional[Union[int, str, bool]] = None,
|
@@ -70,6 +71,8 @@ def get_op_specification(
|
|
70
71
|
# Check only
|
71
72
|
get_queue_info(queue)
|
72
73
|
op_data["queue"] = queue
|
74
|
+
if namespace:
|
75
|
+
op_data["namespace"] = namespace
|
73
76
|
if cache is not None:
|
74
77
|
op_data["cache"] = {"disable": not to_bool(cache)}
|
75
78
|
if nocache:
|
@@ -20,6 +20,7 @@ def get_op_from_schedule(
|
|
20
20
|
op_spec.skip_on_upstream_skip = None
|
21
21
|
op_spec.cache = compiled_operation.cache
|
22
22
|
op_spec.queue = compiled_operation.queue
|
23
|
+
op_spec.namespace = compiled_operation.namespace
|
23
24
|
op_spec.component.inputs = compiled_operation.inputs
|
24
25
|
op_spec.component.outputs = compiled_operation.outputs
|
25
26
|
op_spec.component.run = compiled_operation.run
|
@@ -59,6 +60,7 @@ def get_ops_from_suggestions(
|
|
59
60
|
op_spec.skip_on_upstream_skip = None
|
60
61
|
op_spec.cache = compiled_operation.cache
|
61
62
|
op_spec.queue = compiled_operation.queue
|
63
|
+
op_spec.namespace = compiled_operation.namespace
|
62
64
|
op_spec.params = params
|
63
65
|
op_spec.component.inputs = compiled_operation.inputs
|
64
66
|
op_spec.component.outputs = compiled_operation.outputs
|
@@ -13,6 +13,7 @@ class Sections:
|
|
13
13
|
QUEUE = "queue"
|
14
14
|
CACHE = "cache"
|
15
15
|
PLUGINS = "plugins"
|
16
|
+
NAMESPACE = "namespace"
|
16
17
|
BUILD = "build"
|
17
18
|
HOOKS = "hooks"
|
18
19
|
EVENTS = "events"
|
@@ -55,6 +56,7 @@ class Sections:
|
|
55
56
|
CACHE,
|
56
57
|
QUEUE,
|
57
58
|
PLUGINS,
|
59
|
+
NAMESPACE,
|
58
60
|
BUILD,
|
59
61
|
HOOKS,
|
60
62
|
EVENTS,
|
@@ -89,6 +91,7 @@ class Sections:
|
|
89
91
|
CACHE,
|
90
92
|
CONNECTIONS,
|
91
93
|
PLUGINS,
|
94
|
+
NAMESPACE,
|
92
95
|
TERMINATION,
|
93
96
|
SCHEDULE,
|
94
97
|
DEPENDENCIES,
|
@@ -14,6 +14,7 @@ from polyaxon._env_vars.getters import get_run_info
|
|
14
14
|
from polyaxon._runner.agent.base_agent import BaseAgent
|
15
15
|
from polyaxon._sdk.schemas.v1_agent import V1Agent
|
16
16
|
from polyaxon._sdk.schemas.v1_agent_state_response import V1AgentStateResponse
|
17
|
+
from polyaxon._utils.fqn_utils import get_run_instance
|
17
18
|
from polyaxon.exceptions import ApiException as SDKApiException
|
18
19
|
from polyaxon.exceptions import PolyaxonAgentError, PolyaxonConverterError
|
19
20
|
from polyaxon.logger import logger
|
@@ -78,6 +79,49 @@ class BaseAsyncAgent(BaseAgent):
|
|
78
79
|
),
|
79
80
|
)
|
80
81
|
|
82
|
+
async def reconcile(self):
|
83
|
+
if (
|
84
|
+
now() - self._last_reconciled_at
|
85
|
+
).total_seconds() > self.SLEEP_AGENT_DATA_COLLECT_TIME:
|
86
|
+
return
|
87
|
+
|
88
|
+
# Collect data
|
89
|
+
await self.collect_agent_data()
|
90
|
+
|
91
|
+
# Update reconcile
|
92
|
+
namespaces = [settings.AGENT_CONFIG.namespace]
|
93
|
+
namespaces += settings.AGENT_CONFIG.additional_namespaces or []
|
94
|
+
ops = []
|
95
|
+
for namespace in namespaces:
|
96
|
+
_ops = await self.executor.list_ops(namespace=namespace)
|
97
|
+
if _ops:
|
98
|
+
ops += [
|
99
|
+
(
|
100
|
+
get_run_instance(
|
101
|
+
owner=op["metadata"]["annotations"][
|
102
|
+
"operation.polyaxon.com/owner"
|
103
|
+
],
|
104
|
+
project=op["metadata"]["annotations"][
|
105
|
+
"operation.polyaxon.com/project"
|
106
|
+
],
|
107
|
+
run_uuid=op["metadata"]["labels"][
|
108
|
+
"app.kubernetes.io/instance"
|
109
|
+
],
|
110
|
+
),
|
111
|
+
op["metadata"]["annotations"]["operation.polyaxon.com/kind"],
|
112
|
+
op["metadata"]["annotations"]["operation.polyaxon.com/name"],
|
113
|
+
namespace,
|
114
|
+
)
|
115
|
+
for op in _ops
|
116
|
+
]
|
117
|
+
if not ops:
|
118
|
+
return None
|
119
|
+
|
120
|
+
logger.info("Reconcile agent.")
|
121
|
+
return await self.client.reconcile_agent(
|
122
|
+
reconcile={"ops": ops},
|
123
|
+
)
|
124
|
+
|
81
125
|
async def start(self):
|
82
126
|
try:
|
83
127
|
async with async_exit_context() as exit_event:
|
@@ -91,7 +135,9 @@ class BaseAsyncAgent(BaseAgent):
|
|
91
135
|
except asyncio.TimeoutError:
|
92
136
|
index += 1
|
93
137
|
await self.refresh_executor()
|
94
|
-
if
|
138
|
+
if self._default_auth:
|
139
|
+
await self.reconcile()
|
140
|
+
else:
|
95
141
|
await self.cron()
|
96
142
|
agent_state = await self.process()
|
97
143
|
if not agent_state:
|
@@ -118,7 +164,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
118
164
|
self.sync_compatible_updates(agent_state.compatible_updates)
|
119
165
|
|
120
166
|
if agent_state:
|
121
|
-
logger.info("
|
167
|
+
logger.info("Checking agent state.")
|
122
168
|
else:
|
123
169
|
logger.info("No state was found.")
|
124
170
|
return V1AgentStateResponse.construct()
|
@@ -185,7 +231,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
185
231
|
)
|
186
232
|
return None
|
187
233
|
|
188
|
-
async def submit_run(self, run_data: Tuple[str, str, str, str]):
|
234
|
+
async def submit_run(self, run_data: Tuple[str, str, str, str, str]):
|
189
235
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
190
236
|
resource = await self.prepare_run_resource(
|
191
237
|
owner_name=run_owner,
|
@@ -197,9 +243,13 @@ class BaseAsyncAgent(BaseAgent):
|
|
197
243
|
if not resource:
|
198
244
|
return
|
199
245
|
|
246
|
+
namespace = None if len(run_data) < 5 else run_data[4]
|
200
247
|
try:
|
201
248
|
await self.executor.create(
|
202
|
-
run_uuid=run_uuid,
|
249
|
+
run_uuid=run_uuid,
|
250
|
+
run_kind=run_data[1],
|
251
|
+
resource=resource,
|
252
|
+
namespace=namespace,
|
203
253
|
)
|
204
254
|
except ApiException as e:
|
205
255
|
if e.status == 409:
|
@@ -222,7 +272,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
222
272
|
)
|
223
273
|
|
224
274
|
async def make_and_create_run(
|
225
|
-
self, run_data: Tuple[str, str, str, str], default_auth: bool = False
|
275
|
+
self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
|
226
276
|
):
|
227
277
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
228
278
|
resource = await self.make_run_resource(
|
@@ -236,9 +286,14 @@ class BaseAsyncAgent(BaseAgent):
|
|
236
286
|
if not resource:
|
237
287
|
return
|
238
288
|
|
289
|
+
namepsace = None if len(run_data) < 5 else run_data[4]
|
290
|
+
|
239
291
|
try:
|
240
292
|
await self.executor.create(
|
241
|
-
run_uuid=run_uuid,
|
293
|
+
run_uuid=run_uuid,
|
294
|
+
run_kind=run_data[1],
|
295
|
+
resource=resource,
|
296
|
+
namespace=namepsace,
|
242
297
|
)
|
243
298
|
except ApiException as e:
|
244
299
|
if e.status == 409:
|
@@ -252,7 +307,7 @@ class BaseAsyncAgent(BaseAgent):
|
|
252
307
|
)
|
253
308
|
)
|
254
309
|
|
255
|
-
async def apply_run(self, run_data: Tuple[str, str, str, str]):
|
310
|
+
async def apply_run(self, run_data: Tuple[str, str, str, str, str]):
|
256
311
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
257
312
|
resource = await self.prepare_run_resource(
|
258
313
|
owner_name=run_owner,
|
@@ -264,9 +319,14 @@ class BaseAsyncAgent(BaseAgent):
|
|
264
319
|
if not resource:
|
265
320
|
return
|
266
321
|
|
322
|
+
namespace = None if len(run_data) < 5 else run_data[4]
|
323
|
+
|
267
324
|
try:
|
268
325
|
await self.executor.apply(
|
269
|
-
run_uuid=run_uuid,
|
326
|
+
run_uuid=run_uuid,
|
327
|
+
run_kind=run_data[1],
|
328
|
+
resource=resource,
|
329
|
+
namespace=namespace,
|
270
330
|
)
|
271
331
|
await self.client.log_run_running(
|
272
332
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
|
@@ -275,12 +335,17 @@ class BaseAsyncAgent(BaseAgent):
|
|
275
335
|
await self.client.log_run_failed(
|
276
336
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
|
277
337
|
)
|
278
|
-
await self.clean_run(
|
338
|
+
await self.clean_run(
|
339
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
340
|
+
)
|
279
341
|
|
280
|
-
async def check_run(self, run_data: Tuple[str, str]):
|
342
|
+
async def check_run(self, run_data: Tuple[str, str, str]):
|
281
343
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
344
|
+
namespace = None if len(run_data) < 3 else run_data[2]
|
282
345
|
try:
|
283
|
-
await self.executor.get(
|
346
|
+
await self.executor.get(
|
347
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
348
|
+
)
|
284
349
|
except ApiException as e:
|
285
350
|
if e.status == 404:
|
286
351
|
logger.info(
|
@@ -290,10 +355,13 @@ class BaseAsyncAgent(BaseAgent):
|
|
290
355
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
|
291
356
|
)
|
292
357
|
|
293
|
-
async def stop_run(self, run_data: Tuple[str, str]):
|
358
|
+
async def stop_run(self, run_data: Tuple[str, str, str]):
|
294
359
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
360
|
+
namespace = None if len(run_data) < 3 else run_data[2]
|
295
361
|
try:
|
296
|
-
await self.executor.stop(
|
362
|
+
await self.executor.stop(
|
363
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
364
|
+
)
|
297
365
|
except ApiException as e:
|
298
366
|
if e.status == 404:
|
299
367
|
logger.info("Run does not exist anymore, it could have been stopped.")
|
@@ -309,16 +377,24 @@ class BaseAsyncAgent(BaseAgent):
|
|
309
377
|
message="Agent failed stopping run.\n",
|
310
378
|
)
|
311
379
|
|
312
|
-
async def delete_run(self, run_data: Tuple[str, str, str, str]):
|
380
|
+
async def delete_run(self, run_data: Tuple[str, str, str, str, str]):
|
313
381
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
314
|
-
|
382
|
+
namespace = None if len(run_data) < 5 else run_data[4]
|
315
383
|
if run_data[3]:
|
316
384
|
await self.make_and_create_run(run_data)
|
385
|
+
else:
|
386
|
+
await self.clean_run(
|
387
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=namespace
|
388
|
+
)
|
317
389
|
|
318
|
-
async def clean_run(self, run_uuid: str, run_kind: str):
|
390
|
+
async def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
|
319
391
|
try:
|
320
|
-
await self.executor.clean(
|
321
|
-
|
392
|
+
await self.executor.clean(
|
393
|
+
run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
|
394
|
+
)
|
395
|
+
await self.executor.stop(
|
396
|
+
run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
|
397
|
+
)
|
322
398
|
except ApiException as e:
|
323
399
|
if e.status == 404:
|
324
400
|
logger.info("Run does not exist.")
|
@@ -24,6 +24,7 @@ class BaseAgent:
|
|
24
24
|
HEALTH_FILE = "/tmp/.healthz"
|
25
25
|
SLEEP_STOP_TIME = 60 * 5
|
26
26
|
SLEEP_ARCHIVED_TIME = 60 * 60
|
27
|
+
SLEEP_AGENT_DATA_COLLECT_TIME = 60 * 30
|
27
28
|
IS_ASYNC = False
|
28
29
|
|
29
30
|
def __init__(
|
@@ -41,6 +42,7 @@ class BaseAgent:
|
|
41
42
|
self._default_auth = bool(agent_uuid)
|
42
43
|
self._executor_refreshed_at = now()
|
43
44
|
self._graceful_shutdown = False
|
45
|
+
self._last_reconciled_at = now()
|
44
46
|
self.client = AgentClient(
|
45
47
|
owner=owner, agent_uuid=agent_uuid, is_async=self.IS_ASYNC
|
46
48
|
)
|
@@ -50,9 +52,25 @@ class BaseAgent:
|
|
50
52
|
def sync(self):
|
51
53
|
raise NotImplementedError
|
52
54
|
|
55
|
+
def reconcile(self):
|
56
|
+
raise NotImplementedError
|
57
|
+
|
53
58
|
def cron(self):
|
54
59
|
return self.client.cron_agent()
|
55
60
|
|
61
|
+
def collect_agent_data(self):
|
62
|
+
logger.info("Collecting agent data.")
|
63
|
+
self._last_reconciled_at = now()
|
64
|
+
try:
|
65
|
+
return self.client.collect_agent_data(
|
66
|
+
namespace=settings.CLIENT_CONFIG.namespace
|
67
|
+
)
|
68
|
+
except Exception as e:
|
69
|
+
logger.warning(
|
70
|
+
"Agent failed to collect agent data: {}\n"
|
71
|
+
"Retrying ...".format(repr(e))
|
72
|
+
)
|
73
|
+
|
56
74
|
def sync_compatible_updates(self, compatible_updates: Dict):
|
57
75
|
if compatible_updates and settings.AGENT_CONFIG:
|
58
76
|
init = compatible_updates.get("init")
|
@@ -173,25 +191,25 @@ class BaseAgent:
|
|
173
191
|
) -> Optional[Any]:
|
174
192
|
raise NotImplementedError
|
175
193
|
|
176
|
-
def submit_run(self, run_data: Tuple[str, str, str, str]):
|
194
|
+
def submit_run(self, run_data: Tuple[str, str, str, str, str]):
|
177
195
|
raise NotImplementedError
|
178
196
|
|
179
197
|
def make_and_create_run(
|
180
|
-
self, run_data: Tuple[str, str, str, str], default_auth: bool = False
|
198
|
+
self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
|
181
199
|
):
|
182
200
|
raise NotImplementedError
|
183
201
|
|
184
|
-
def apply_run(self, run_data: Tuple[str, str, str, str]):
|
202
|
+
def apply_run(self, run_data: Tuple[str, str, str, str, str]):
|
185
203
|
raise NotImplementedError
|
186
204
|
|
187
|
-
def check_run(self, run_data: Tuple[str, str]):
|
205
|
+
def check_run(self, run_data: Tuple[str, str, str]):
|
188
206
|
raise NotImplementedError
|
189
207
|
|
190
|
-
def stop_run(self, run_data: Tuple[str, str]):
|
208
|
+
def stop_run(self, run_data: Tuple[str, str, str]):
|
191
209
|
raise NotImplementedError
|
192
210
|
|
193
|
-
def delete_run(self, run_data: Tuple[str, str, str, str]):
|
211
|
+
def delete_run(self, run_data: Tuple[str, str, str, str, str]):
|
194
212
|
raise NotImplementedError
|
195
213
|
|
196
|
-
def clean_run(self, run_uuid: str, run_kind: str):
|
214
|
+
def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
|
197
215
|
raise NotImplementedError
|
polyaxon/_runner/agent/client.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
import traceback
|
2
2
|
|
3
|
-
from typing import Optional
|
3
|
+
from typing import Dict, Optional
|
4
4
|
|
5
5
|
from polyaxon._schemas.lifecycle import V1StatusCondition, V1Statuses
|
6
6
|
from polyaxon.client import PolyaxonClient, V1Agent, V1AgentStateResponse
|
@@ -65,6 +65,20 @@ class AgentClient:
|
|
65
65
|
def cron_agent(self):
|
66
66
|
return self.client.agents_v1.cron_agent(owner=self.owner, _request_timeout=10)
|
67
67
|
|
68
|
+
def collect_agent_data(self, namespace: str):
|
69
|
+
return self.client.internal_agents_v1.collect_agent_data(
|
70
|
+
owner=self.owner,
|
71
|
+
uuid=self.agent_uuid,
|
72
|
+
namespace=namespace,
|
73
|
+
)
|
74
|
+
|
75
|
+
def reconcile_agent(self, reconcile: Dict):
|
76
|
+
return self.client.agents_v1.reconcile_agent(
|
77
|
+
owner=self.owner,
|
78
|
+
uuid=self.agent_uuid,
|
79
|
+
body={"reconcile": reconcile},
|
80
|
+
)
|
81
|
+
|
68
82
|
def log_agent_running(self):
|
69
83
|
return self.log_agent_status(status=V1Statuses.RUNNING, reason="AgentLogger")
|
70
84
|
|
@@ -13,6 +13,7 @@ from urllib3.exceptions import HTTPError
|
|
13
13
|
from polyaxon import pkg, settings
|
14
14
|
from polyaxon._env_vars.getters import get_run_info
|
15
15
|
from polyaxon._runner.agent.base_agent import BaseAgent
|
16
|
+
from polyaxon._utils.fqn_utils import get_run_instance
|
16
17
|
from polyaxon.client import V1Agent, V1AgentStateResponse
|
17
18
|
from polyaxon.exceptions import ApiException as SDKApiException
|
18
19
|
from polyaxon.exceptions import PolyaxonAgentError, PolyaxonConverterError
|
@@ -76,6 +77,49 @@ class BaseSyncAgent(BaseAgent):
|
|
76
77
|
),
|
77
78
|
)
|
78
79
|
|
80
|
+
def reconcile(self):
|
81
|
+
if (
|
82
|
+
now() - self._last_reconciled_at
|
83
|
+
).total_seconds() > self.SLEEP_AGENT_DATA_COLLECT_TIME:
|
84
|
+
return
|
85
|
+
|
86
|
+
# Collect data
|
87
|
+
self.collect_agent_data()
|
88
|
+
|
89
|
+
# Update reconcile
|
90
|
+
namespaces = [settings.AGENT_CONFIG.namespace]
|
91
|
+
namespaces += settings.AGENT_CONFIG.additional_namespaces or []
|
92
|
+
ops = []
|
93
|
+
for namespace in namespaces:
|
94
|
+
_ops = self.executor.list_ops(namespace=namespace)
|
95
|
+
if _ops:
|
96
|
+
ops += [
|
97
|
+
(
|
98
|
+
get_run_instance(
|
99
|
+
owner=op["metadata"]["annotations"][
|
100
|
+
"operation.polyaxon.com/owner"
|
101
|
+
],
|
102
|
+
project=op["metadata"]["annotations"][
|
103
|
+
"operation.polyaxon.com/project"
|
104
|
+
],
|
105
|
+
run_uuid=op["metadata"]["labels"][
|
106
|
+
"app.kubernetes.io/instance"
|
107
|
+
],
|
108
|
+
),
|
109
|
+
op["metadata"]["annotations"]["operation.polyaxon.com/kind"],
|
110
|
+
op["metadata"]["annotations"]["operation.polyaxon.com/name"],
|
111
|
+
namespace,
|
112
|
+
)
|
113
|
+
for op in _ops
|
114
|
+
]
|
115
|
+
if not ops:
|
116
|
+
return None
|
117
|
+
|
118
|
+
logger.info("Reconcile agent.")
|
119
|
+
return self.client.reconcile_agent(
|
120
|
+
reconcile={"ops": ops},
|
121
|
+
)
|
122
|
+
|
79
123
|
def start(self):
|
80
124
|
try:
|
81
125
|
with sync_exit_context() as exit_event:
|
@@ -88,7 +132,9 @@ class BaseSyncAgent(BaseAgent):
|
|
88
132
|
while not exit_event.wait(timeout=timeout):
|
89
133
|
index += 1
|
90
134
|
self.refresh_executor()
|
91
|
-
if
|
135
|
+
if self._default_auth:
|
136
|
+
self.reconcile()
|
137
|
+
else:
|
92
138
|
self.cron()
|
93
139
|
agent_state = self.process(pool)
|
94
140
|
if not agent_state:
|
@@ -113,7 +159,7 @@ class BaseSyncAgent(BaseAgent):
|
|
113
159
|
self.sync_compatible_updates(agent_state.compatible_updates)
|
114
160
|
|
115
161
|
if agent_state:
|
116
|
-
logger.info("
|
162
|
+
logger.info("Checking agent state.")
|
117
163
|
else:
|
118
164
|
logger.info("No state was found.")
|
119
165
|
return V1AgentStateResponse.construct()
|
@@ -180,7 +226,7 @@ class BaseSyncAgent(BaseAgent):
|
|
180
226
|
)
|
181
227
|
return None
|
182
228
|
|
183
|
-
def submit_run(self, run_data: Tuple[str, str, str, str]):
|
229
|
+
def submit_run(self, run_data: Tuple[str, str, str, str, str]):
|
184
230
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
185
231
|
resource = self.prepare_run_resource(
|
186
232
|
owner_name=run_owner,
|
@@ -194,7 +240,10 @@ class BaseSyncAgent(BaseAgent):
|
|
194
240
|
|
195
241
|
try:
|
196
242
|
self.executor.create(
|
197
|
-
run_uuid=run_uuid,
|
243
|
+
run_uuid=run_uuid,
|
244
|
+
run_kind=run_data[1],
|
245
|
+
resource=resource,
|
246
|
+
namespace=run_data[4],
|
198
247
|
)
|
199
248
|
except ApiException as e:
|
200
249
|
if e.status == 409:
|
@@ -217,7 +266,7 @@ class BaseSyncAgent(BaseAgent):
|
|
217
266
|
)
|
218
267
|
|
219
268
|
def make_and_create_run(
|
220
|
-
self, run_data: Tuple[str, str, str, str], default_auth: bool = False
|
269
|
+
self, run_data: Tuple[str, str, str, str, str], default_auth: bool = False
|
221
270
|
):
|
222
271
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
223
272
|
resource = self.make_run_resource(
|
@@ -233,7 +282,10 @@ class BaseSyncAgent(BaseAgent):
|
|
233
282
|
|
234
283
|
try:
|
235
284
|
self.executor.create(
|
236
|
-
run_uuid=run_uuid,
|
285
|
+
run_uuid=run_uuid,
|
286
|
+
run_kind=run_data[1],
|
287
|
+
resource=resource,
|
288
|
+
namespace=run_data[4],
|
237
289
|
)
|
238
290
|
except ApiException as e:
|
239
291
|
if e.status == 409:
|
@@ -247,7 +299,7 @@ class BaseSyncAgent(BaseAgent):
|
|
247
299
|
)
|
248
300
|
)
|
249
301
|
|
250
|
-
def apply_run(self, run_data: Tuple[str, str, str, str]):
|
302
|
+
def apply_run(self, run_data: Tuple[str, str, str, str, str]):
|
251
303
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
252
304
|
resource = self.prepare_run_resource(
|
253
305
|
owner_name=run_owner,
|
@@ -261,7 +313,10 @@ class BaseSyncAgent(BaseAgent):
|
|
261
313
|
|
262
314
|
try:
|
263
315
|
self.executor.apply(
|
264
|
-
run_uuid=run_uuid,
|
316
|
+
run_uuid=run_uuid,
|
317
|
+
run_kind=run_data[1],
|
318
|
+
resource=resource,
|
319
|
+
namespace=run_data[4],
|
265
320
|
)
|
266
321
|
self.client.log_run_running(
|
267
322
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
|
@@ -270,12 +325,16 @@ class BaseSyncAgent(BaseAgent):
|
|
270
325
|
self.client.log_run_failed(
|
271
326
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid, exc=e
|
272
327
|
)
|
273
|
-
self.clean_run(
|
328
|
+
self.clean_run(
|
329
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[4]
|
330
|
+
)
|
274
331
|
|
275
|
-
def check_run(self, run_data: Tuple[str, str]):
|
332
|
+
def check_run(self, run_data: Tuple[str, str, str]):
|
276
333
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
277
334
|
try:
|
278
|
-
self.executor.get(
|
335
|
+
self.executor.get(
|
336
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[2]
|
337
|
+
)
|
279
338
|
except ApiException as e:
|
280
339
|
if e.status == 404:
|
281
340
|
logger.info(
|
@@ -285,10 +344,12 @@ class BaseSyncAgent(BaseAgent):
|
|
285
344
|
run_owner=run_owner, run_project=run_project, run_uuid=run_uuid
|
286
345
|
)
|
287
346
|
|
288
|
-
def stop_run(self, run_data: Tuple[str, str]):
|
347
|
+
def stop_run(self, run_data: Tuple[str, str, str]):
|
289
348
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
290
349
|
try:
|
291
|
-
self.executor.stop(
|
350
|
+
self.executor.stop(
|
351
|
+
run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[2]
|
352
|
+
)
|
292
353
|
except ApiException as e:
|
293
354
|
if e.status == 404:
|
294
355
|
logger.info("Run does not exist anymore, it could have been stopped.")
|
@@ -304,16 +365,20 @@ class BaseSyncAgent(BaseAgent):
|
|
304
365
|
message="Agent failed stopping run.\n",
|
305
366
|
)
|
306
367
|
|
307
|
-
def delete_run(self, run_data: Tuple[str, str, str, str]):
|
368
|
+
def delete_run(self, run_data: Tuple[str, str, str, str, str]):
|
308
369
|
run_owner, run_project, run_uuid = get_run_info(run_instance=run_data[0])
|
309
|
-
self.clean_run(run_uuid=run_uuid, run_kind=run_data[1])
|
370
|
+
self.clean_run(run_uuid=run_uuid, run_kind=run_data[1], namespace=run_data[4])
|
310
371
|
if run_data[3]:
|
311
372
|
self.make_and_create_run(run_data)
|
312
373
|
|
313
|
-
def clean_run(self, run_uuid: str, run_kind: str):
|
374
|
+
def clean_run(self, run_uuid: str, run_kind: str, namespace: str = None):
|
314
375
|
try:
|
315
|
-
self.executor.clean(
|
316
|
-
|
376
|
+
self.executor.clean(
|
377
|
+
run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
|
378
|
+
)
|
379
|
+
self.executor.stop(
|
380
|
+
run_uuid=run_uuid, run_kind=run_kind, namespace=namespace
|
381
|
+
)
|
317
382
|
except ApiException as e:
|
318
383
|
if e.status == 404:
|
319
384
|
logger.info("Run does not exist.")
|
polyaxon/_runner/executor.py
CHANGED
@@ -50,19 +50,21 @@ class BaseExecutor:
|
|
50
50
|
self._manager = None
|
51
51
|
return self.manager
|
52
52
|
|
53
|
-
def get(self, run_uuid: str, run_kind: str):
|
53
|
+
def get(self, run_uuid: str, run_kind: str, namespace: str = None):
|
54
54
|
raise NotImplementedError
|
55
55
|
|
56
|
-
def create(
|
56
|
+
def create(
|
57
|
+
self, run_uuid: str, run_kind: str, resource: Any, namespace: str = None
|
58
|
+
):
|
57
59
|
raise NotImplementedError
|
58
60
|
|
59
|
-
def apply(self, run_uuid: str, run_kind: str, resource: Any):
|
61
|
+
def apply(self, run_uuid: str, run_kind: str, resource: Any, namespace: str = None):
|
60
62
|
raise NotImplementedError
|
61
63
|
|
62
|
-
def stop(self, run_uuid: str, run_kind: str):
|
64
|
+
def stop(self, run_uuid: str, run_kind: str, namespace: str = None):
|
63
65
|
raise NotImplementedError
|
64
66
|
|
65
|
-
def clean(self, run_uuid: str, run_kind: str):
|
67
|
+
def clean(self, run_uuid: str, run_kind: str, namespace: str = None):
|
66
68
|
raise NotImplementedError
|
67
69
|
|
68
70
|
def _clean_temp_execution_path(self, run_uuid: str):
|
@@ -154,7 +156,7 @@ class BaseExecutor:
|
|
154
156
|
run_name=run_name,
|
155
157
|
run_uuid=run_uuid,
|
156
158
|
run_path=run_uuid,
|
157
|
-
namespace=agent_env.namespace,
|
159
|
+
namespace=compiled_operation.namespace or agent_env.namespace,
|
158
160
|
compiled_operation=compiled_operation,
|
159
161
|
polyaxon_init=agent_env.polyaxon_init,
|
160
162
|
polyaxon_sidecar=agent_env.polyaxon_sidecar,
|
@@ -191,7 +193,7 @@ class BaseExecutor:
|
|
191
193
|
params=operation.params,
|
192
194
|
)
|
193
195
|
return cls.get_resource(
|
194
|
-
namespace=resolver_obj.namespace,
|
196
|
+
namespace=compiled_operation.namespace or resolver_obj.namespace,
|
195
197
|
owner_name=resolver_obj.owner_name,
|
196
198
|
project_name=resolver_obj.project_name,
|
197
199
|
run_name=resolver_obj.run_name,
|
@@ -223,4 +225,8 @@ class BaseExecutor:
|
|
223
225
|
run_uuid=response.uuid,
|
224
226
|
run_kind=response.kind,
|
225
227
|
resource=resource,
|
228
|
+
namespace=response.namespace,
|
226
229
|
)
|
230
|
+
|
231
|
+
def list_ops(self, namespace: str = None):
|
232
|
+
raise NotImplementedError
|