torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +102 -81
- torchx/runner/config.py +3 -1
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +66 -25
- torchx/schedulers/aws_batch_scheduler.py +47 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +4 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +355 -36
- torchx/schedulers/local_scheduler.py +2 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +102 -27
- torchx/specs/__init__.py +40 -9
- torchx/specs/api.py +222 -12
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/specs/overlays.py +106 -0
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/runner/api.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
1
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
2
|
# All rights reserved.
|
|
4
3
|
#
|
|
@@ -25,6 +24,7 @@ from typing import (
|
|
|
25
24
|
Type,
|
|
26
25
|
TYPE_CHECKING,
|
|
27
26
|
TypeVar,
|
|
27
|
+
Union,
|
|
28
28
|
)
|
|
29
29
|
|
|
30
30
|
from torchx.runner.events import log_event
|
|
@@ -42,6 +42,7 @@ from torchx.specs import (
|
|
|
42
42
|
parse_app_handle,
|
|
43
43
|
runopts,
|
|
44
44
|
UnknownAppException,
|
|
45
|
+
Workspace,
|
|
45
46
|
)
|
|
46
47
|
from torchx.specs.finder import get_component
|
|
47
48
|
from torchx.tracker.api import (
|
|
@@ -53,7 +54,7 @@ from torchx.tracker.api import (
|
|
|
53
54
|
from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID
|
|
54
55
|
|
|
55
56
|
from torchx.util.types import none_throws
|
|
56
|
-
from torchx.workspace
|
|
57
|
+
from torchx.workspace import WorkspaceMixin
|
|
57
58
|
|
|
58
59
|
if TYPE_CHECKING:
|
|
59
60
|
from typing_extensions import Self
|
|
@@ -129,9 +130,9 @@ class Runner:
|
|
|
129
130
|
def _get_scheduler_params_from_env(self) -> Dict[str, str]:
|
|
130
131
|
scheduler_params = {}
|
|
131
132
|
for key, value in os.environ.items():
|
|
132
|
-
|
|
133
|
-
if
|
|
134
|
-
scheduler_params[
|
|
133
|
+
key = key.lower()
|
|
134
|
+
if key.startswith("torchx_"):
|
|
135
|
+
scheduler_params[key.removeprefix("torchx_")] = value
|
|
135
136
|
return scheduler_params
|
|
136
137
|
|
|
137
138
|
def __enter__(self) -> "Self":
|
|
@@ -164,25 +165,13 @@ class Runner:
|
|
|
164
165
|
for scheduler in self._scheduler_instances.values():
|
|
165
166
|
scheduler.close()
|
|
166
167
|
|
|
167
|
-
def build_standalone_workspace(
|
|
168
|
-
self,
|
|
169
|
-
workspace_builder: WorkspaceBuilder[S, T],
|
|
170
|
-
sync: bool = True,
|
|
171
|
-
) -> PkgInfo[S]:
|
|
172
|
-
"""
|
|
173
|
-
Build a standalone workspace for the given role.
|
|
174
|
-
This method is used to build a workspace for a role independent of the scheduler and
|
|
175
|
-
also enables asynchronous workspace building using the Role overrides.
|
|
176
|
-
"""
|
|
177
|
-
return workspace_builder.build_workspace(sync)
|
|
178
|
-
|
|
179
168
|
def run_component(
|
|
180
169
|
self,
|
|
181
170
|
component: str,
|
|
182
|
-
component_args:
|
|
171
|
+
component_args: Union[list[str], dict[str, Any]],
|
|
183
172
|
scheduler: str,
|
|
184
173
|
cfg: Optional[Mapping[str, CfgVal]] = None,
|
|
185
|
-
workspace: Optional[str] = None,
|
|
174
|
+
workspace: Optional[Union[Workspace, str]] = None,
|
|
186
175
|
parent_run_id: Optional[str] = None,
|
|
187
176
|
) -> AppHandle:
|
|
188
177
|
"""
|
|
@@ -217,7 +206,7 @@ class Runner:
|
|
|
217
206
|
ComponentNotFoundException: if the ``component_path`` is failed to resolve.
|
|
218
207
|
"""
|
|
219
208
|
|
|
220
|
-
with log_event("run_component"
|
|
209
|
+
with log_event("run_component") as ctx:
|
|
221
210
|
dryrun_info = self.dryrun_component(
|
|
222
211
|
component,
|
|
223
212
|
component_args,
|
|
@@ -228,7 +217,8 @@ class Runner:
|
|
|
228
217
|
)
|
|
229
218
|
handle = self.schedule(dryrun_info)
|
|
230
219
|
app = none_throws(dryrun_info._app)
|
|
231
|
-
|
|
220
|
+
|
|
221
|
+
ctx._torchx_event.workspace = str(workspace)
|
|
232
222
|
ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
|
|
233
223
|
ctx._torchx_event.app_image = app.roles[0].image
|
|
234
224
|
ctx._torchx_event.app_id = parse_app_handle(handle)[2]
|
|
@@ -238,10 +228,10 @@ class Runner:
|
|
|
238
228
|
def dryrun_component(
|
|
239
229
|
self,
|
|
240
230
|
component: str,
|
|
241
|
-
component_args:
|
|
231
|
+
component_args: Union[list[str], dict[str, Any]],
|
|
242
232
|
scheduler: str,
|
|
243
233
|
cfg: Optional[Mapping[str, CfgVal]] = None,
|
|
244
|
-
workspace: Optional[str] = None,
|
|
234
|
+
workspace: Optional[Union[Workspace, str]] = None,
|
|
245
235
|
parent_run_id: Optional[str] = None,
|
|
246
236
|
) -> AppDryRunInfo:
|
|
247
237
|
"""
|
|
@@ -249,10 +239,13 @@ class Runner:
|
|
|
249
239
|
component, but just returns what "would" have run.
|
|
250
240
|
"""
|
|
251
241
|
component_def = get_component(component)
|
|
242
|
+
args_from_cli = component_args if isinstance(component_args, list) else []
|
|
243
|
+
args_from_json = component_args if isinstance(component_args, dict) else {}
|
|
252
244
|
app = materialize_appdef(
|
|
253
245
|
component_def.fn,
|
|
254
|
-
|
|
246
|
+
args_from_cli,
|
|
255
247
|
self._component_defaults.get(component, None),
|
|
248
|
+
args_from_json,
|
|
256
249
|
)
|
|
257
250
|
return self.dryrun(
|
|
258
251
|
app,
|
|
@@ -267,7 +260,7 @@ class Runner:
|
|
|
267
260
|
app: AppDef,
|
|
268
261
|
scheduler: str,
|
|
269
262
|
cfg: Optional[Mapping[str, CfgVal]] = None,
|
|
270
|
-
workspace: Optional[str] = None,
|
|
263
|
+
workspace: Optional[Union[Workspace, str]] = None,
|
|
271
264
|
parent_run_id: Optional[str] = None,
|
|
272
265
|
) -> AppHandle:
|
|
273
266
|
"""
|
|
@@ -280,9 +273,7 @@ class Runner:
|
|
|
280
273
|
An application handle that is used to call other action APIs on the app.
|
|
281
274
|
"""
|
|
282
275
|
|
|
283
|
-
with log_event(
|
|
284
|
-
api="run", runcfg=json.dumps(cfg) if cfg else None, workspace=workspace
|
|
285
|
-
) as ctx:
|
|
276
|
+
with log_event(api="run") as ctx:
|
|
286
277
|
dryrun_info = self.dryrun(
|
|
287
278
|
app,
|
|
288
279
|
scheduler,
|
|
@@ -291,10 +282,15 @@ class Runner:
|
|
|
291
282
|
parent_run_id=parent_run_id,
|
|
292
283
|
)
|
|
293
284
|
handle = self.schedule(dryrun_info)
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
285
|
+
|
|
286
|
+
event = ctx._torchx_event
|
|
287
|
+
event.scheduler = scheduler
|
|
288
|
+
event.runcfg = json.dumps(cfg) if cfg else None
|
|
289
|
+
event.workspace = str(workspace)
|
|
290
|
+
event.app_id = parse_app_handle(handle)[2]
|
|
291
|
+
event.app_image = none_throws(dryrun_info._app).roles[0].image
|
|
292
|
+
event.app_metadata = app.metadata
|
|
293
|
+
|
|
298
294
|
return handle
|
|
299
295
|
|
|
300
296
|
def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:
|
|
@@ -328,21 +324,22 @@ class Runner:
|
|
|
328
324
|
|
|
329
325
|
"""
|
|
330
326
|
scheduler = none_throws(dryrun_info._scheduler)
|
|
331
|
-
app_image = none_throws(dryrun_info._app).roles[0].image
|
|
332
327
|
cfg = dryrun_info._cfg
|
|
333
|
-
with log_event(
|
|
334
|
-
"schedule",
|
|
335
|
-
scheduler,
|
|
336
|
-
app_image=app_image,
|
|
337
|
-
runcfg=json.dumps(cfg) if cfg else None,
|
|
338
|
-
) as ctx:
|
|
328
|
+
with log_event("schedule") as ctx:
|
|
339
329
|
sched = self._scheduler(scheduler)
|
|
340
330
|
app_id = sched.schedule(dryrun_info)
|
|
341
331
|
app_handle = make_app_handle(scheduler, self._name, app_id)
|
|
332
|
+
|
|
342
333
|
app = none_throws(dryrun_info._app)
|
|
343
334
|
self._apps[app_handle] = app
|
|
344
|
-
|
|
345
|
-
ctx._torchx_event
|
|
335
|
+
|
|
336
|
+
event = ctx._torchx_event
|
|
337
|
+
event.scheduler = scheduler
|
|
338
|
+
event.runcfg = json.dumps(cfg) if cfg else None
|
|
339
|
+
event.app_id = app_id
|
|
340
|
+
event.app_image = none_throws(dryrun_info._app).roles[0].image
|
|
341
|
+
event.app_metadata = app.metadata
|
|
342
|
+
|
|
346
343
|
return app_handle
|
|
347
344
|
|
|
348
345
|
def name(self) -> str:
|
|
@@ -353,7 +350,7 @@ class Runner:
|
|
|
353
350
|
app: AppDef,
|
|
354
351
|
scheduler: str,
|
|
355
352
|
cfg: Optional[Mapping[str, CfgVal]] = None,
|
|
356
|
-
workspace: Optional[str] = None,
|
|
353
|
+
workspace: Optional[Union[Workspace, str]] = None,
|
|
357
354
|
parent_run_id: Optional[str] = None,
|
|
358
355
|
) -> AppDryRunInfo:
|
|
359
356
|
"""
|
|
@@ -422,52 +419,45 @@ class Runner:
|
|
|
422
419
|
"dryrun",
|
|
423
420
|
scheduler,
|
|
424
421
|
runcfg=json.dumps(cfg) if cfg else None,
|
|
425
|
-
workspace=workspace,
|
|
426
|
-
):
|
|
422
|
+
workspace=str(workspace),
|
|
423
|
+
) as ctx:
|
|
427
424
|
sched = self._scheduler(scheduler)
|
|
428
425
|
resolved_cfg = sched.run_opts().resolve(cfg)
|
|
429
426
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
|
|
450
|
-
ctx._torchx_event.app_image = role.image
|
|
451
|
-
ctx._torchx_event.workspace = workspace
|
|
452
|
-
|
|
453
|
-
if old_img != role.image:
|
|
454
|
-
logger.info(
|
|
455
|
-
f"Built new image `{role.image}` based on original image `{old_img}`"
|
|
456
|
-
f" and changes in workspace `{workspace}` for role[0]={role.name}."
|
|
457
|
-
)
|
|
458
|
-
else:
|
|
459
|
-
logger.info(
|
|
460
|
-
f"Reusing original image `{old_img}` for role[0]={role.name}."
|
|
461
|
-
" Either a patch was built or no changes to workspace was detected."
|
|
427
|
+
sched._pre_build_validate(app, scheduler, resolved_cfg)
|
|
428
|
+
|
|
429
|
+
if isinstance(sched, WorkspaceMixin):
|
|
430
|
+
if workspace:
|
|
431
|
+
# NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
|
|
432
|
+
# later, torchx added support for the workspace attr in Role
|
|
433
|
+
# for BC, give precedence to the workspace argument over the workspace attr for role[0]
|
|
434
|
+
if app.roles[0].workspace:
|
|
435
|
+
logger.info(
|
|
436
|
+
"Overriding role[%d] (%s) workspace to `%s`"
|
|
437
|
+
"To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
|
|
438
|
+
0,
|
|
439
|
+
role.name,
|
|
440
|
+
str(app.roles[0].workspace),
|
|
441
|
+
)
|
|
442
|
+
app.roles[0].workspace = (
|
|
443
|
+
Workspace.from_str(workspace)
|
|
444
|
+
if isinstance(workspace, str)
|
|
445
|
+
else workspace
|
|
462
446
|
)
|
|
463
447
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
):
|
|
468
|
-
sched._validate(app, scheduler, resolved_cfg)
|
|
448
|
+
sched.build_workspaces(app.roles, resolved_cfg)
|
|
449
|
+
|
|
450
|
+
sched._validate(app, scheduler, resolved_cfg)
|
|
469
451
|
dryrun_info = sched.submit_dryrun(app, resolved_cfg)
|
|
470
452
|
dryrun_info._scheduler = scheduler
|
|
453
|
+
|
|
454
|
+
event = ctx._torchx_event
|
|
455
|
+
event.scheduler = scheduler
|
|
456
|
+
event.runcfg = json.dumps(cfg) if cfg else None
|
|
457
|
+
event.app_id = app.name
|
|
458
|
+
event.app_image = none_throws(dryrun_info._app).roles[0].image
|
|
459
|
+
event.app_metadata = app.metadata
|
|
460
|
+
|
|
471
461
|
return dryrun_info
|
|
472
462
|
|
|
473
463
|
def scheduler_run_opts(self, scheduler: str) -> runopts:
|
|
@@ -486,6 +476,27 @@ class Runner:
|
|
|
486
476
|
"""
|
|
487
477
|
return self._scheduler(scheduler).run_opts()
|
|
488
478
|
|
|
479
|
+
def cfg_from_str(self, scheduler: str, *cfg_literal: str) -> Mapping[str, CfgVal]:
|
|
480
|
+
"""
|
|
481
|
+
Convenience function around the scheduler's ``runopts.cfg_from_str()`` method.
|
|
482
|
+
|
|
483
|
+
Usage:
|
|
484
|
+
|
|
485
|
+
.. doctest::
|
|
486
|
+
|
|
487
|
+
from torchx.runner import get_runner
|
|
488
|
+
|
|
489
|
+
runner = get_runner()
|
|
490
|
+
cfg = runner.cfg_from_str("local_cwd", "log_dir=/tmp/foobar", "prepend_cwd=True")
|
|
491
|
+
assert cfg == {"log_dir": "/tmp/foobar", "prepend_cwd": True, "auto_set_cuda_visible_devices": False}
|
|
492
|
+
"""
|
|
493
|
+
|
|
494
|
+
opts = self._scheduler(scheduler).run_opts()
|
|
495
|
+
cfg = {}
|
|
496
|
+
for cfg_str in cfg_literal:
|
|
497
|
+
cfg.update(opts.cfg_from_str(cfg_str))
|
|
498
|
+
return cfg
|
|
499
|
+
|
|
489
500
|
def scheduler_backends(self) -> List[str]:
|
|
490
501
|
"""
|
|
491
502
|
Returns a list of all supported scheduler backends.
|
|
@@ -576,6 +587,16 @@ class Runner:
|
|
|
576
587
|
if status is not None and not status.is_terminal():
|
|
577
588
|
scheduler.cancel(app_id)
|
|
578
589
|
|
|
590
|
+
def delete(self, app_handle: AppHandle) -> None:
|
|
591
|
+
"""
|
|
592
|
+
Deletes the application from the scheduler.
|
|
593
|
+
"""
|
|
594
|
+
scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
|
|
595
|
+
with log_event("delete", scheduler_backend, app_id):
|
|
596
|
+
status = self.status(app_handle)
|
|
597
|
+
if status is not None:
|
|
598
|
+
scheduler.delete(app_id)
|
|
599
|
+
|
|
579
600
|
def stop(self, app_handle: AppHandle) -> None:
|
|
580
601
|
"""
|
|
581
602
|
See method ``cancel``.
|
torchx/runner/config.py
CHANGED
|
@@ -73,7 +73,7 @@ CLI Usage
|
|
|
73
73
|
|
|
74
74
|
#. In addition, it is possible to specify a different config other than .torchxconfig to
|
|
75
75
|
load at runtime. Requirements are that the config path is specified by enviornment
|
|
76
|
-
variable
|
|
76
|
+
variable TORCHXCONFIG. It also disables hierarchy loading configs from multiple
|
|
77
77
|
directories as the cases otherwise.
|
|
78
78
|
|
|
79
79
|
#. User level .torchxconfig
|
|
@@ -494,6 +494,8 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:
|
|
|
494
494
|
|
|
495
495
|
config = os.getenv(ENV_TORCHXCONFIG)
|
|
496
496
|
if config is not None:
|
|
497
|
+
if not config:
|
|
498
|
+
return []
|
|
497
499
|
configfile = Path(config)
|
|
498
500
|
if not configfile.is_file():
|
|
499
501
|
raise FileNotFoundError(
|
torchx/runner/events/__init__.py
CHANGED
|
@@ -33,8 +33,9 @@ from torchx.util.session import get_session_id_or_create_new
|
|
|
33
33
|
|
|
34
34
|
from .api import SourceType, TorchxEvent # noqa F401
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
_events_logger: Optional[logging.Logger] = None
|
|
37
|
+
|
|
38
|
+
log: logging.Logger = logging.getLogger(__name__)
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
def _get_or_create_logger(destination: str = "null") -> logging.Logger:
|
|
@@ -51,19 +52,28 @@ def _get_or_create_logger(destination: str = "null") -> logging.Logger:
|
|
|
51
52
|
a new logger if None provided.
|
|
52
53
|
"""
|
|
53
54
|
global _events_logger
|
|
55
|
+
|
|
54
56
|
if _events_logger:
|
|
55
57
|
return _events_logger
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
else:
|
|
59
|
+
logging_handler = get_logging_handler(destination)
|
|
60
|
+
logging_handler.setLevel(logging.DEBUG)
|
|
61
|
+
_events_logger = logging.getLogger(f"torchx-events-{destination}")
|
|
62
|
+
# Do not propagate message to the root logger
|
|
63
|
+
_events_logger.propagate = False
|
|
64
|
+
_events_logger.addHandler(logging_handler)
|
|
65
|
+
|
|
66
|
+
assert _events_logger # make type-checker happy
|
|
67
|
+
return _events_logger
|
|
63
68
|
|
|
64
69
|
|
|
65
70
|
def record(event: TorchxEvent, destination: str = "null") -> None:
|
|
66
|
-
|
|
71
|
+
try:
|
|
72
|
+
serialized_event = event.serialize()
|
|
73
|
+
except Exception:
|
|
74
|
+
log.exception("failed to serialize event, will not record event")
|
|
75
|
+
else:
|
|
76
|
+
_get_or_create_logger(destination).info(serialized_event)
|
|
67
77
|
|
|
68
78
|
|
|
69
79
|
class log_event:
|
torchx/runner/events/api.py
CHANGED
|
@@ -29,7 +29,7 @@ class TorchxEvent:
|
|
|
29
29
|
scheduler: Scheduler that is used to execute request
|
|
30
30
|
api: Api name
|
|
31
31
|
app_id: Unique id that is set by the underlying scheduler
|
|
32
|
-
|
|
32
|
+
app_image: Image/container bundle that is used to execute request.
|
|
33
33
|
app_metadata: metadata to the app (treatment of metadata is scheduler dependent)
|
|
34
34
|
runcfg: Run config that was used to schedule app.
|
|
35
35
|
source: Type of source the event is generated.
|
torchx/schedulers/__init__.py
CHANGED
|
@@ -21,8 +21,6 @@ DEFAULT_SCHEDULER_MODULES: Mapping[str, str] = {
|
|
|
21
21
|
"kubernetes_mcad": "torchx.schedulers.kubernetes_mcad_scheduler",
|
|
22
22
|
"aws_batch": "torchx.schedulers.aws_batch_scheduler",
|
|
23
23
|
"aws_sagemaker": "torchx.schedulers.aws_sagemaker_scheduler",
|
|
24
|
-
"gcp_batch": "torchx.schedulers.gcp_batch_scheduler",
|
|
25
|
-
"ray": "torchx.schedulers.ray_scheduler",
|
|
26
24
|
"lsf": "torchx.schedulers.lsf_scheduler",
|
|
27
25
|
}
|
|
28
26
|
|
|
@@ -51,15 +49,14 @@ def get_scheduler_factories(
|
|
|
51
49
|
The first scheduler in the dictionary is used as the default scheduler.
|
|
52
50
|
"""
|
|
53
51
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
52
|
+
if skip_defaults:
|
|
53
|
+
default_schedulers = {}
|
|
54
|
+
else:
|
|
55
|
+
default_schedulers: dict[str, SchedulerFactory] = {}
|
|
56
|
+
for scheduler, path in DEFAULT_SCHEDULER_MODULES.items():
|
|
57
|
+
default_schedulers[scheduler] = _defer_load_scheduler(path)
|
|
57
58
|
|
|
58
|
-
return load_group(
|
|
59
|
-
group,
|
|
60
|
-
default=default_schedulers,
|
|
61
|
-
skip_defaults=skip_defaults,
|
|
62
|
-
)
|
|
59
|
+
return load_group(group, default=default_schedulers)
|
|
63
60
|
|
|
64
61
|
|
|
65
62
|
def get_default_scheduler_name() -> str:
|
torchx/schedulers/api.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
1
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
2
|
# All rights reserved.
|
|
4
3
|
#
|
|
@@ -16,14 +15,16 @@ from typing import Generic, Iterable, List, Optional, TypeVar
|
|
|
16
15
|
|
|
17
16
|
from torchx.specs import (
|
|
18
17
|
AppDef,
|
|
18
|
+
AppDryRunInfo,
|
|
19
19
|
AppState,
|
|
20
20
|
NONE,
|
|
21
21
|
NULL_RESOURCE,
|
|
22
22
|
Role,
|
|
23
23
|
RoleStatus,
|
|
24
24
|
runopts,
|
|
25
|
+
Workspace,
|
|
25
26
|
)
|
|
26
|
-
from torchx.workspace
|
|
27
|
+
from torchx.workspace import WorkspaceMixin
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
DAYS_IN_2_WEEKS = 14
|
|
@@ -95,11 +96,9 @@ class ListAppResponse:
|
|
|
95
96
|
|
|
96
97
|
|
|
97
98
|
T = TypeVar("T")
|
|
98
|
-
A = TypeVar("A")
|
|
99
|
-
D = TypeVar("D")
|
|
100
99
|
|
|
101
100
|
|
|
102
|
-
class Scheduler(abc.ABC, Generic[T
|
|
101
|
+
class Scheduler(abc.ABC, Generic[T]):
|
|
103
102
|
"""
|
|
104
103
|
An interface abstracting functionalities of a scheduler.
|
|
105
104
|
Implementers need only implement those methods annotated with
|
|
@@ -129,9 +128,9 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
129
128
|
|
|
130
129
|
def submit(
|
|
131
130
|
self,
|
|
132
|
-
app:
|
|
131
|
+
app: AppDef,
|
|
133
132
|
cfg: T,
|
|
134
|
-
workspace:
|
|
133
|
+
workspace: str | Workspace | None = None,
|
|
135
134
|
) -> str:
|
|
136
135
|
"""
|
|
137
136
|
Submits the application to be run by the scheduler.
|
|
@@ -144,16 +143,20 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
144
143
|
# pyre-fixme: Generic cfg type passed to resolve
|
|
145
144
|
resolved_cfg = self.run_opts().resolve(cfg)
|
|
146
145
|
if workspace:
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
146
|
+
assert isinstance(self, WorkspaceMixin)
|
|
147
|
+
|
|
148
|
+
if isinstance(workspace, str):
|
|
149
|
+
workspace = Workspace.from_str(workspace)
|
|
150
|
+
|
|
151
|
+
app.roles[0].workspace = workspace
|
|
152
|
+
self.build_workspaces(app.roles, resolved_cfg)
|
|
153
|
+
|
|
151
154
|
# pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
|
|
152
155
|
dryrun_info = self.submit_dryrun(app, resolved_cfg)
|
|
153
156
|
return self.schedule(dryrun_info)
|
|
154
157
|
|
|
155
158
|
@abc.abstractmethod
|
|
156
|
-
def schedule(self, dryrun_info:
|
|
159
|
+
def schedule(self, dryrun_info: AppDryRunInfo) -> str:
|
|
157
160
|
"""
|
|
158
161
|
Same as ``submit`` except that it takes an ``AppDryRunInfo``.
|
|
159
162
|
Implementers are encouraged to implement this method rather than
|
|
@@ -169,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
169
172
|
|
|
170
173
|
raise NotImplementedError()
|
|
171
174
|
|
|
172
|
-
def submit_dryrun(self, app:
|
|
175
|
+
def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
|
|
173
176
|
"""
|
|
174
177
|
Rather than submitting the request to run the app, returns the
|
|
175
178
|
request object that would have been submitted to the underlying
|
|
@@ -183,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
183
186
|
# pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
|
|
184
187
|
dryrun_info = self._submit_dryrun(app, resolved_cfg)
|
|
185
188
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
+
for role in app.roles:
|
|
190
|
+
dryrun_info = role.pre_proc(self.backend, dryrun_info)
|
|
191
|
+
|
|
189
192
|
dryrun_info._app = app
|
|
190
193
|
dryrun_info._cfg = resolved_cfg
|
|
191
194
|
return dryrun_info
|
|
192
195
|
|
|
193
196
|
@abc.abstractmethod
|
|
194
|
-
def _submit_dryrun(self, app:
|
|
197
|
+
def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
|
|
195
198
|
raise NotImplementedError()
|
|
196
199
|
|
|
197
200
|
def run_opts(self) -> runopts:
|
|
@@ -260,6 +263,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
260
263
|
# do nothing if the app does not exist
|
|
261
264
|
return
|
|
262
265
|
|
|
266
|
+
def delete(self, app_id: str) -> None:
|
|
267
|
+
"""
|
|
268
|
+
Deletes the job information for the specified ``app_id`` from the
|
|
269
|
+
scheduler's data-plane. Basically "deep-purging" the job from the
|
|
270
|
+
scheduler's data-plane. Calling this API on a "live" job (e.g in a
|
|
271
|
+
non-terminal status such as PENDING or RUNNING) cancels the job.
|
|
272
|
+
|
|
273
|
+
Note that this API is only relevant for schedulers for which its
|
|
274
|
+
data-plane persistently stores the "JobDefinition" (which is often
|
|
275
|
+
versioned). AWS Batch and Kubernetes are examples of such schedulers.
|
|
276
|
+
On these schedulers, a finished job may fall out of the data-plane
|
|
277
|
+
(e.g. really old finished jobs get deleted) but the JobDefinition is
|
|
278
|
+
typically permanently stored. In this case, calling
|
|
279
|
+
:py:meth:`~cancel` would not delete the job definition.
|
|
280
|
+
|
|
281
|
+
In schedulers with no such feature (e.g. SLURM)
|
|
282
|
+
:py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
|
|
283
|
+
default implementation. Hence implementors of such schedulers need not
|
|
284
|
+
override this method.
|
|
285
|
+
|
|
286
|
+
.. warning::
|
|
287
|
+
Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
|
|
288
|
+
the scheduler's data-plane does nothing. The user is responsible for
|
|
289
|
+
manually tracking down and cleaning up any dangling resources related
|
|
290
|
+
to the job.
|
|
291
|
+
"""
|
|
292
|
+
if self.exists(app_id):
|
|
293
|
+
self._delete_existing(app_id)
|
|
294
|
+
|
|
295
|
+
def _delete_existing(self, app_id: str) -> None:
|
|
296
|
+
"""
|
|
297
|
+
Deletes the job information for the specified ``app_id`` from the
|
|
298
|
+
scheduler's data-plane. This method will only be called on an
|
|
299
|
+
application that exists.
|
|
300
|
+
|
|
301
|
+
The default implementation calls :py:meth:`~_cancel_existing` which is
|
|
302
|
+
appropriate for schedulers without persistent job definitions.
|
|
303
|
+
"""
|
|
304
|
+
self._cancel_existing(app_id)
|
|
305
|
+
|
|
263
306
|
def log_iter(
|
|
264
307
|
self,
|
|
265
308
|
app_id: str,
|
|
@@ -350,19 +393,17 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
350
393
|
"""
|
|
351
394
|
pass
|
|
352
395
|
|
|
353
|
-
def _validate(self, app:
|
|
396
|
+
def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
|
|
354
397
|
"""
|
|
355
398
|
Validates after workspace build whether application is consistent with the scheduler.
|
|
356
399
|
|
|
357
400
|
Raises error if application is not compatible with scheduler
|
|
358
401
|
"""
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
f" Did you forget to attach resource to the role"
|
|
365
|
-
)
|
|
402
|
+
for role in app.roles:
|
|
403
|
+
if role.resource == NULL_RESOURCE:
|
|
404
|
+
raise ValueError(
|
|
405
|
+
f"No resource for role: {role.image}. Did you forget to attach resource to the role"
|
|
406
|
+
)
|
|
366
407
|
|
|
367
408
|
|
|
368
409
|
def filter_regex(regex: str, data: Iterable[str]) -> Iterable[str]:
|