torchx-nightly 2023.10.21__py3-none-any.whl → 2025.12.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchx-nightly might be problematic.
- torchx/__init__.py +2 -0
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/apps/serve/serve.py +2 -0
- torchx/apps/utils/booth_main.py +2 -0
- torchx/apps/utils/copy_main.py +2 -0
- torchx/apps/utils/process_monitor.py +2 -0
- torchx/cli/__init__.py +2 -0
- torchx/cli/argparse_util.py +38 -3
- torchx/cli/cmd_base.py +2 -0
- torchx/cli/cmd_cancel.py +2 -0
- torchx/cli/cmd_configure.py +2 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_describe.py +2 -0
- torchx/cli/cmd_list.py +8 -4
- torchx/cli/cmd_log.py +6 -24
- torchx/cli/cmd_run.py +269 -45
- torchx/cli/cmd_runopts.py +2 -0
- torchx/cli/cmd_status.py +12 -1
- torchx/cli/cmd_tracker.py +3 -1
- torchx/cli/colors.py +2 -0
- torchx/cli/main.py +4 -0
- torchx/components/__init__.py +3 -8
- torchx/components/component_test_base.py +2 -0
- torchx/components/dist.py +18 -7
- torchx/components/integration_tests/component_provider.py +4 -2
- torchx/components/integration_tests/integ_tests.py +2 -0
- torchx/components/serve.py +2 -0
- torchx/components/structured_arg.py +7 -6
- torchx/components/utils.py +15 -4
- torchx/distributed/__init__.py +2 -4
- torchx/examples/apps/datapreproc/datapreproc.py +2 -0
- torchx/examples/apps/lightning/data.py +5 -3
- torchx/examples/apps/lightning/model.py +7 -6
- torchx/examples/apps/lightning/profiler.py +7 -4
- torchx/examples/apps/lightning/train.py +11 -2
- torchx/examples/torchx_out_of_sync_training.py +11 -0
- torchx/notebook.py +2 -0
- torchx/runner/__init__.py +2 -0
- torchx/runner/api.py +167 -60
- torchx/runner/config.py +43 -10
- torchx/runner/events/__init__.py +57 -13
- torchx/runner/events/api.py +14 -3
- torchx/runner/events/handlers.py +2 -0
- torchx/runtime/tracking/__init__.py +2 -0
- torchx/runtime/tracking/api.py +2 -0
- torchx/schedulers/__init__.py +16 -15
- torchx/schedulers/api.py +70 -14
- torchx/schedulers/aws_batch_scheduler.py +79 -5
- torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
- torchx/schedulers/devices.py +17 -4
- torchx/schedulers/docker_scheduler.py +43 -11
- torchx/schedulers/ids.py +29 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +10 -8
- torchx/schedulers/kubernetes_scheduler.py +383 -38
- torchx/schedulers/local_scheduler.py +100 -27
- torchx/schedulers/lsf_scheduler.py +5 -4
- torchx/schedulers/slurm_scheduler.py +336 -20
- torchx/schedulers/streams.py +2 -0
- torchx/specs/__init__.py +89 -12
- torchx/specs/api.py +431 -32
- torchx/specs/builders.py +176 -38
- torchx/specs/file_linter.py +143 -57
- torchx/specs/finder.py +68 -28
- torchx/specs/named_resources_aws.py +254 -22
- torchx/specs/named_resources_generic.py +2 -0
- torchx/specs/overlays.py +106 -0
- torchx/specs/test/components/__init__.py +2 -0
- torchx/specs/test/components/a/__init__.py +2 -0
- torchx/specs/test/components/a/b/__init__.py +2 -0
- torchx/specs/test/components/a/b/c.py +2 -0
- torchx/specs/test/components/c/__init__.py +2 -0
- torchx/specs/test/components/c/d.py +2 -0
- torchx/tracker/__init__.py +12 -6
- torchx/tracker/api.py +15 -18
- torchx/tracker/backend/fsspec.py +2 -0
- torchx/util/cuda.py +2 -0
- torchx/util/datetime.py +2 -0
- torchx/util/entrypoints.py +39 -15
- torchx/util/io.py +2 -0
- torchx/util/log_tee_helpers.py +210 -0
- torchx/util/modules.py +65 -0
- torchx/util/session.py +42 -0
- torchx/util/shlex.py +2 -0
- torchx/util/strings.py +3 -1
- torchx/util/types.py +90 -29
- torchx/version.py +4 -2
- torchx/workspace/__init__.py +2 -0
- torchx/workspace/api.py +136 -6
- torchx/workspace/dir_workspace.py +2 -0
- torchx/workspace/docker_workspace.py +30 -2
- torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
- torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
- torchx/pipelines/kfp/__init__.py +0 -28
- torchx/pipelines/kfp/adapter.py +0 -271
- torchx/pipelines/kfp/version.py +0 -17
- torchx/schedulers/gcp_batch_scheduler.py +0 -487
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -453
- torchx_nightly-2023.10.21.dist-info/METADATA +0 -174
- torchx_nightly-2023.10.21.dist-info/RECORD +0 -118
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
torchx/runner/api.py
CHANGED
@@ -1,10 +1,11 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# pyre-strict
+
 import json
 import logging
 import os
@@ -12,7 +13,19 @@ import time
 import warnings
 from datetime import datetime
 from types import TracebackType
-from typing import
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+    TypeVar,
+    Union,
+)

 from torchx.runner.events import log_event
 from torchx.schedulers import get_scheduler_factories, SchedulerFactory
@@ -29,6 +42,7 @@ from torchx.specs import (
     parse_app_handle,
     runopts,
     UnknownAppException,
+    Workspace,
 )
 from torchx.specs.finder import get_component
 from torchx.tracker.api import (
@@ -37,9 +51,13 @@ from torchx.tracker.api import (
     ENV_TORCHX_TRACKERS,
     tracker_config_env_var_name,
 )
+from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID

 from torchx.util.types import none_throws
-from torchx.workspace
+from torchx.workspace import WorkspaceMixin
+
+if TYPE_CHECKING:
+    from typing_extensions import Self

 from .config import get_config, get_configs

@@ -47,6 +65,8 @@ logger: logging.Logger = logging.getLogger(__name__)


 NONE: str = "<NONE>"
+S = TypeVar("S")
+T = TypeVar("T")


 def get_configured_trackers() -> Dict[str, Optional[str]]:
@@ -96,15 +116,26 @@ class Runner:
         """
         self._name: str = name
         self._scheduler_factories = scheduler_factories
-        self._scheduler_params: Dict[str,
-
+        self._scheduler_params: Dict[str, Any] = {
+            **(self._get_scheduler_params_from_env()),
+            **(scheduler_params or {}),
+        }
+        # pyre-fixme[24]: SchedulerOpts is a generic, and we don't have access to the corresponding type
         self._scheduler_instances: Dict[str, Scheduler] = {}
         self._apps: Dict[AppHandle, AppDef] = {}

         # component_name -> map of component_fn_param_name -> user-specified default val encoded as str
         self._component_defaults: Dict[str, Dict[str, str]] = component_defaults or {}

-    def
+    def _get_scheduler_params_from_env(self) -> Dict[str, str]:
+        scheduler_params = {}
+        for key, value in os.environ.items():
+            key = key.lower()
+            if key.startswith("torchx_"):
+                scheduler_params[key.removeprefix("torchx_")] = value
+        return scheduler_params
+
+    def __enter__(self) -> "Self":
         return self

     def __exit__(
@@ -131,16 +162,16 @@ class Runner:
         It is ok to call this method multiple times on the same runner object.
         """

-        for
+        for scheduler in self._scheduler_instances.values():
             scheduler.close()

     def run_component(
         self,
         component: str,
-        component_args:
+        component_args: Union[list[str], dict[str, Any]],
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppHandle:
         """
@@ -175,23 +206,32 @@ class Runner:
             ComponentNotFoundException: if the ``component_path`` is failed to resolve.
         """

-
-
-
-
-
-
-
-
-
+        with log_event("run_component") as ctx:
+            dryrun_info = self.dryrun_component(
+                component,
+                component_args,
+                scheduler,
+                cfg=cfg,
+                workspace=workspace,
+                parent_run_id=parent_run_id,
+            )
+            handle = self.schedule(dryrun_info)
+            app = none_throws(dryrun_info._app)
+
+            ctx._torchx_event.workspace = str(workspace)
+            ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
+            ctx._torchx_event.app_image = app.roles[0].image
+            ctx._torchx_event.app_id = parse_app_handle(handle)[2]
+            ctx._torchx_event.app_metadata = app.metadata
+            return handle

     def dryrun_component(
         self,
         component: str,
-        component_args:
+        component_args: Union[list[str], dict[str, Any]],
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppDryRunInfo:
         """
@@ -199,10 +239,13 @@ class Runner:
         component, but just returns what "would" have run.
         """
         component_def = get_component(component)
+        args_from_cli = component_args if isinstance(component_args, list) else []
+        args_from_json = component_args if isinstance(component_args, dict) else {}
         app = materialize_appdef(
             component_def.fn,
-
+            args_from_cli,
             self._component_defaults.get(component, None),
+            args_from_json,
         )
         return self.dryrun(
             app,
@@ -217,7 +260,7 @@ class Runner:
         app: AppDef,
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppHandle:
         """
@@ -230,10 +273,25 @@ class Runner:
            An application handle that is used to call other action APIs on the app.
         """

-
-
-
-
+        with log_event(api="run") as ctx:
+            dryrun_info = self.dryrun(
+                app,
+                scheduler,
+                cfg=cfg,
+                workspace=workspace,
+                parent_run_id=parent_run_id,
+            )
+            handle = self.schedule(dryrun_info)
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.workspace = str(workspace)
+            event.app_id = parse_app_handle(handle)[2]
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
+            return handle

     def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:
         """
@@ -266,21 +324,22 @@ class Runner:

         """
         scheduler = none_throws(dryrun_info._scheduler)
-        app_image = none_throws(dryrun_info._app).roles[0].image
         cfg = dryrun_info._cfg
-        with log_event(
-            "schedule",
-            scheduler,
-            app_image=app_image,
-            runcfg=json.dumps(cfg) if cfg else None,
-        ) as ctx:
+        with log_event("schedule") as ctx:
             sched = self._scheduler(scheduler)
             app_id = sched.schedule(dryrun_info)
             app_handle = make_app_handle(scheduler, self._name, app_id)
+
             app = none_throws(dryrun_info._app)
             self._apps[app_handle] = app
-
-            ctx._torchx_event
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app_id
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return app_handle

     def name(self) -> str:
@@ -291,7 +350,7 @@ class Runner:
         app: AppDef,
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppDryRunInfo:
         """
@@ -343,6 +402,7 @@ class Runner:
             role.env[ENV_TORCHX_JOB_ID] = make_app_handle(
                 scheduler, self._name, macros.app_id
             )
+            role.env[TORCHX_INTERNAL_SESSION_ID] = get_session_id_or_create_new()

             if parent_run_id:
                 role.env[ENV_TORCHX_PARENT_RUN_ID] = parent_run_id
@@ -355,33 +415,49 @@ class Runner:
                 role.env[tracker_config_env_var_name(name)] = config

         cfg = cfg or dict()
-        with log_event(
+        with log_event(
+            "dryrun",
+            scheduler,
+            runcfg=json.dumps(cfg) if cfg else None,
+            workspace=str(workspace),
+        ) as ctx:
             sched = self._scheduler(scheduler)
             resolved_cfg = sched.run_opts().resolve(cfg)
-            if workspace and isinstance(sched, WorkspaceMixin):
-                role = app.roles[0]
-                old_img = role.image

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            sched._pre_build_validate(app, scheduler, resolved_cfg)
+
+            if isinstance(sched, WorkspaceMixin):
+                if workspace:
+                    # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+                    # later, torchx added support for the workspace attr in Role
+                    # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+                    if app.roles[0].workspace:
+                        logger.info(
+                            "Overriding role[%d] (%s) workspace to `%s`"
+                            "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+                            0,
+                            role.name,
+                            str(app.roles[0].workspace),
+                        )
+                    app.roles[0].workspace = (
+                        Workspace.from_str(workspace)
+                        if isinstance(workspace, str)
+                        else workspace
                    )

-
+                sched.build_workspaces(app.roles, resolved_cfg)
+
+            sched._validate(app, scheduler, resolved_cfg)
             dryrun_info = sched.submit_dryrun(app, resolved_cfg)
             dryrun_info._scheduler = scheduler
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app.name
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return dryrun_info

     def scheduler_run_opts(self, scheduler: str) -> runopts:
@@ -400,6 +476,27 @@ class Runner:
         """
         return self._scheduler(scheduler).run_opts()

+    def cfg_from_str(self, scheduler: str, *cfg_literal: str) -> Mapping[str, CfgVal]:
+        """
+        Convenience function around the scheduler's ``runopts.cfg_from_str()`` method.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx.runner import get_runner
+
+            runner = get_runner()
+            cfg = runner.cfg_from_str("local_cwd", "log_dir=/tmp/foobar", "prepend_cwd=True")
+            assert cfg == {"log_dir": "/tmp/foobar", "prepend_cwd": True, "auto_set_cuda_visible_devices": False}
+        """
+
+        opts = self._scheduler(scheduler).run_opts()
+        cfg = {}
+        for cfg_str in cfg_literal:
+            cfg.update(opts.cfg_from_str(cfg_str))
+        return cfg
+
     def scheduler_backends(self) -> List[str]:
         """
         Returns a list of all supported scheduler backends.
@@ -490,6 +587,16 @@ class Runner:
             if status is not None and not status.is_terminal():
                 scheduler.cancel(app_id)

+    def delete(self, app_handle: AppHandle) -> None:
+        """
+        Deletes the application from the scheduler.
+        """
+        scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
+        with log_event("delete", scheduler_backend, app_id):
+            status = self.status(app_handle)
+            if status is not None:
+                scheduler.delete(app_id)
+
     def stop(self, app_handle: AppHandle) -> None:
         """
         See method ``cancel``.
@@ -525,7 +632,7 @@ class Runner:
         if not app:
             desc = scheduler.describe(app_id)
             if desc:
-                app = AppDef(name=app_id, roles=desc.roles)
+                app = AppDef(name=app_id, roles=desc.roles, metadata=desc.metadata)
         return app

     def log_lines(
@@ -637,7 +744,7 @@ class Runner:
             app.app_handle = make_app_handle(scheduler, self._name, app.app_id)
         return apps

-    # pyre-fixme:
+    # pyre-fixme[24]: SchedulerOpts is a generic, and we don't have access to the corresponding type
     def _scheduler(self, scheduler: str) -> Scheduler:
         sched = self._scheduler_instances.get(scheduler)
         if not sched:
@@ -654,8 +761,8 @@ class Runner:
     def _scheduler_app_id(
         self,
         app_handle: AppHandle,
-        check_session: bool = True
-        # pyre-fixme:
+        check_session: bool = True,
+        # pyre-fixme[24]: SchedulerOpts is a generic, and we don't have access to the corresponding type
     ) -> Tuple[Scheduler, str, str]:
         """
         Returns the scheduler and app_id from the app_handle.
torchx/runner/config.py
CHANGED
@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# pyre-strict
+
 """
 Status: Beta

@@ -71,7 +73,7 @@ CLI Usage

 #. In addition, it is possible to specify a different config other than .torchxconfig to
    load at runtime. Requirements are that the config path is specified by enviornment
-   variable
+   variable TORCHXCONFIG. It also disables hierarchy loading configs from multiple
    directories as the cases otherwise.

 #. User level .torchxconfig
@@ -195,7 +197,15 @@ def _configparser() -> configparser.ConfigParser:


 def _get_scheduler(name: str) -> Scheduler:
-    schedulers =
+    schedulers = {
+        **get_scheduler_factories(),
+        **(
+            get_scheduler_factories(
+                group="torchx.schedulers.orchestrator", skip_defaults=True
+            )
+            or {}
+        ),
+    }
     if name not in schedulers:
         raise ValueError(
             f"`{name}` is not a registered scheduler. Valid scheduler names: {schedulers.keys()}"
@@ -239,7 +249,16 @@ def dump(
     if schedulers:
         scheds = schedulers
     else:
-
+        scheduler_factories = {
+            **get_scheduler_factories(),
+            **(
+                get_scheduler_factories(
+                    group="torchx.schedulers.orchestrator", skip_defaults=True
+                )
+                or {}
+            ),
+        }
+        scheds = scheduler_factories.keys()

     config = _configparser()
     for sched_name in scheds:
@@ -259,13 +278,20 @@ def dump(
                 continue

             # serialize list elements with `;` delimiter (consistent with torchx cli)
-            if opt.
+            if opt.is_type_list_of_str:
                 # deal with empty or None default lists
                 if opt.default:
                     # pyre-ignore[6] opt.default type checked already as List[str]
                     val = ";".join(opt.default)
                 else:
                     val = _NONE
+            elif opt.is_type_dict_of_str:
+                # deal with empty or None default lists
+                if opt.default:
+                    # pyre-ignore[16] opt.default type checked already as Dict[str, str]
+                    val = ";".join([f"{k}:{v}" for k, v in opt.default.items()])
+                else:
+                    val = _NONE
             else:
                 val = f"{opt.default}"

@@ -468,6 +494,8 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:

     config = os.getenv(ENV_TORCHXCONFIG)
     if config is not None:
+        if not config:
+            return []
         configfile = Path(config)
         if not configfile.is_file():
             raise FileNotFoundError(
@@ -480,7 +508,7 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:
         dirs = DEFAULT_CONFIG_DIRS
     for d in dirs:
         configfile = Path(d) / CONFIG_FILE
-        if configfile.
+        if os.access(configfile, os.R_OK):
             config_files.append(str(configfile))
     return config_files

@@ -510,21 +538,26 @@ def load(scheduler: str, f: TextIO, cfg: Dict[str, CfgVal]) -> None:
             # this also handles empty or None lists
             cfg[name] = None
         else:
-
+            opt = runopts.get(name)

-            if
+            if opt is None:
                 log.warning(
                     f"`{name} = {value}` was declared in the [{section}] section "
                     f" of the config file but is not a runopt of `{scheduler}` scheduler."
                     f" Remove the entry from the config file to no longer see this warning"
                 )
             else:
-                if
+                if opt.opt_type is bool:
                     # need to handle bool specially since str -> bool is based on
                     # str emptiness not value (e.g. bool("False") == True)
                     cfg[name] = config.getboolean(section, name)
-                elif
+                elif opt.is_type_list_of_str:
                     cfg[name] = value.split(";")
+                elif opt.is_type_dict_of_str:
+                    cfg[name] = {
+                        s.split(":", 1)[0]: s.split(":", 1)[1]
+                        for s in value.replace(",", ";").split(";")
+                    }
                 else:
                     # pyre-ignore[29]
-                    cfg[name] =
+                    cfg[name] = opt.opt_type(value)
torchx/runner/events/__init__.py
CHANGED
@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# pyre-strict
+
 """
 Module contains events processing mechanisms that are integrated with the standard python logging.

@@ -18,18 +20,22 @@ Example of usage:

 """

+import json
 import logging
+import sys
 import time
 import traceback
 from types import TracebackType
-from typing import Optional, Type
+from typing import Dict, Optional, Type

 from torchx.runner.events.handlers import get_logging_handler
+from torchx.util.session import get_session_id_or_create_new

 from .api import SourceType, TorchxEvent  # noqa F401

-
-
+_events_logger: Optional[logging.Logger] = None
+
+log: logging.Logger = logging.getLogger(__name__)


 def _get_or_create_logger(destination: str = "null") -> logging.Logger:
@@ -46,19 +52,28 @@ def _get_or_create_logger(destination: str = "null") -> logging.Logger:
     a new logger if None provided.
     """
     global _events_logger
+
     if _events_logger:
         return _events_logger
-
-
-
-
-
-
-
+    else:
+        logging_handler = get_logging_handler(destination)
+        logging_handler.setLevel(logging.DEBUG)
+        _events_logger = logging.getLogger(f"torchx-events-{destination}")
+        # Do not propagate message to the root logger
+        _events_logger.propagate = False
+        _events_logger.addHandler(logging_handler)
+
+        assert _events_logger  # make type-checker happy
+        return _events_logger


 def record(event: TorchxEvent, destination: str = "null") -> None:
-
+    try:
+        serialized_event = event.serialize()
+    except Exception:
+        log.exception("failed to serialize event, will not record event")
+    else:
+        _get_or_create_logger(destination).info(serialized_event)


 class log_event:
@@ -82,17 +97,28 @@ class log_event:
         scheduler: Optional[str] = None,
         app_id: Optional[str] = None,
         app_image: Optional[str] = None,
+        app_metadata: Optional[Dict[str, str]] = None,
         runcfg: Optional[str] = None,
+        workspace: Optional[str] = None,
     ) -> None:
         self._torchx_event: TorchxEvent = self._generate_torchx_event(
-            api,
+            api,
+            scheduler or "",
+            app_id,
+            app_image=app_image,
+            app_metadata=app_metadata,
+            runcfg=runcfg,
+            workspace=workspace,
         )
         self._start_cpu_time_ns = 0
         self._start_wall_time_ns = 0
+        self._start_epoch_time_usec = 0

     def __enter__(self) -> "log_event":
         self._start_cpu_time_ns = time.process_time_ns()
         self._start_wall_time_ns = time.perf_counter_ns()
+        self._torchx_event.start_epoch_time_usec = int(time.time() * 1_000_000)
+
         return self

     def __exit__(
@@ -109,6 +135,20 @@ class log_event:
         ) // 1000
         if traceback_type:
             self._torchx_event.raw_exception = traceback.format_exc()
+            typ, value, tb = sys.exc_info()
+            if tb:
+                last_frame = traceback.extract_tb(tb)[-1]
+                self._torchx_event.exception_source_location = json.dumps(
+                    {
+                        "filename": last_frame.filename,
+                        "lineno": last_frame.lineno,
+                        "name": last_frame.name,
+                    }
+                )
+        if exec_type:
+            self._torchx_event.exception_type = exec_type.__name__
+        if exec_value:
+            self._torchx_event.exception_message = str(exec_value)
         record(self._torchx_event)

     def _generate_torchx_event(
@@ -117,15 +157,19 @@ class log_event:
         scheduler: str,
         app_id: Optional[str] = None,
         app_image: Optional[str] = None,
+        app_metadata: Optional[Dict[str, str]] = None,
         runcfg: Optional[str] = None,
         source: SourceType = SourceType.UNKNOWN,
+        workspace: Optional[str] = None,
     ) -> TorchxEvent:
         return TorchxEvent(
-            session=
+            session=get_session_id_or_create_new(),
             scheduler=scheduler,
             api=api,
             app_id=app_id,
             app_image=app_image,
+            app_metadata=app_metadata,
             runcfg=runcfg,
             source=source,
+            workspace=workspace,
         )
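Taken together, the events changes stamp each TorchxEvent with a per-process session id, a start timestamp, the workspace, app metadata, and, on failure, the exception type, message, and source location extracted from the last traceback frame. A minimal sketch of how a failed API call gets recorded, assuming log_event as patched above (the app_id assignment is illustrative only, and _torchx_event is a private attribute):

    from torchx.runner.events import log_event

    try:
        with log_event("run", scheduler="local_cwd", workspace="/tmp/ws") as ctx:
            ctx._torchx_event.app_id = "app_42"  # illustrative only
            raise RuntimeError("boom")
    except RuntimeError:
        # __exit__ has already populated exception_type ("RuntimeError"),
        # exception_message ("boom"), exception_source_location (this file/line),
        # and called record() on the event before re-raising.
        pass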