torchx-nightly 2025.10.16__py3-none-any.whl → 2025.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic. Click here for more details.
- torchx/_version.py +8 -0
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/main.py +2 -0
- torchx/runner/api.py +35 -33
- torchx/schedulers/api.py +58 -17
- torchx/schedulers/aws_batch_scheduler.py +2 -4
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +1 -3
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +234 -20
- torchx/schedulers/local_scheduler.py +1 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +9 -3
- torchx/specs/__init__.py +17 -3
- torchx/specs/api.py +82 -41
- torchx/version.py +2 -2
- torchx/workspace/api.py +63 -42
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.12.2.dist-info}/METADATA +21 -8
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.12.2.dist-info}/RECORD +23 -21
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.12.2.dist-info}/WHEEL +1 -1
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.12.2.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.12.2.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.10.16.dist-info → torchx_nightly-2025.12.2.dist-info}/top_level.txt +0 -0
torchx/_version.py
ADDED
torchx/cli/cmd_delete.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
# All rights reserved.
|
|
4
|
+
#
|
|
5
|
+
# This source code is licensed under the BSD-style license found in the
|
|
6
|
+
# LICENSE file in the root directory of this source tree.
|
|
7
|
+
|
|
8
|
+
# pyre-strict
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import logging
|
|
12
|
+
|
|
13
|
+
from torchx.cli.cmd_base import SubCommand
|
|
14
|
+
from torchx.runner import get_runner
|
|
15
|
+
|
|
16
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CmdDelete(SubCommand):
|
|
20
|
+
def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
|
|
21
|
+
subparser.add_argument(
|
|
22
|
+
"app_handle",
|
|
23
|
+
type=str,
|
|
24
|
+
help="torchx app handle (e.g. local://session-name/app-id)",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
def run(self, args: argparse.Namespace) -> None:
|
|
28
|
+
app_handle = args.app_handle
|
|
29
|
+
runner = get_runner()
|
|
30
|
+
runner.delete(app_handle)
|
torchx/cli/main.py
CHANGED
|
@@ -16,6 +16,7 @@ import torchx
|
|
|
16
16
|
from torchx.cli.cmd_base import SubCommand
|
|
17
17
|
from torchx.cli.cmd_cancel import CmdCancel
|
|
18
18
|
from torchx.cli.cmd_configure import CmdConfigure
|
|
19
|
+
from torchx.cli.cmd_delete import CmdDelete
|
|
19
20
|
from torchx.cli.cmd_describe import CmdDescribe
|
|
20
21
|
from torchx.cli.cmd_list import CmdList
|
|
21
22
|
from torchx.cli.cmd_log import CmdLog
|
|
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
|
|
|
37
38
|
"builtins": CmdBuiltins(),
|
|
38
39
|
"cancel": CmdCancel(),
|
|
39
40
|
"configure": CmdConfigure(),
|
|
41
|
+
"delete": CmdDelete(),
|
|
40
42
|
"describe": CmdDescribe(),
|
|
41
43
|
"list": CmdList(),
|
|
42
44
|
"log": CmdLog(),
|
torchx/runner/api.py
CHANGED
|
@@ -420,52 +420,44 @@ class Runner:
|
|
|
420
420
|
scheduler,
|
|
421
421
|
runcfg=json.dumps(cfg) if cfg else None,
|
|
422
422
|
workspace=str(workspace),
|
|
423
|
-
):
|
|
423
|
+
) as ctx:
|
|
424
424
|
sched = self._scheduler(scheduler)
|
|
425
425
|
resolved_cfg = sched.run_opts().resolve(cfg)
|
|
426
426
|
|
|
427
427
|
sched._pre_build_validate(app, scheduler, resolved_cfg)
|
|
428
428
|
|
|
429
429
|
if isinstance(sched, WorkspaceMixin):
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
# later, torchx added support for the workspace attr in Role
|
|
436
|
-
# for BC, give precedence to the workspace argument over the workspace attr for role[0]
|
|
437
|
-
if role_workspace:
|
|
438
|
-
logger.info(
|
|
439
|
-
f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
|
|
440
|
-
" To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically." # noqa: B950
|
|
441
|
-
)
|
|
442
|
-
role_workspace = workspace
|
|
443
|
-
|
|
444
|
-
if role_workspace:
|
|
445
|
-
old_img = role.image
|
|
430
|
+
if workspace:
|
|
431
|
+
# NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
|
|
432
|
+
# later, torchx added support for the workspace attr in Role
|
|
433
|
+
# for BC, give precedence to the workspace argument over the workspace attr for role[0]
|
|
434
|
+
if app.roles[0].workspace:
|
|
446
435
|
logger.info(
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
role, role_workspace, resolved_cfg
|
|
436
|
+
"Overriding role[%d] (%s) workspace to `%s`"
|
|
437
|
+
"To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
|
|
438
|
+
0,
|
|
439
|
+
role.name,
|
|
440
|
+
str(app.roles[0].workspace),
|
|
453
441
|
)
|
|
442
|
+
app.roles[0].workspace = (
|
|
443
|
+
Workspace.from_str(workspace)
|
|
444
|
+
if isinstance(workspace, str)
|
|
445
|
+
else workspace
|
|
446
|
+
)
|
|
454
447
|
|
|
455
|
-
|
|
456
|
-
logger.info(
|
|
457
|
-
f"Built new image `{role.image}` based on original image `{old_img}`"
|
|
458
|
-
f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
|
|
459
|
-
)
|
|
460
|
-
else:
|
|
461
|
-
logger.info(
|
|
462
|
-
f"Reusing original image `{old_img}` for role[{i}]={role.name}."
|
|
463
|
-
" Either a patch was built or no changes to workspace was detected."
|
|
464
|
-
)
|
|
448
|
+
sched.build_workspaces(app.roles, resolved_cfg)
|
|
465
449
|
|
|
466
450
|
sched._validate(app, scheduler, resolved_cfg)
|
|
467
451
|
dryrun_info = sched.submit_dryrun(app, resolved_cfg)
|
|
468
452
|
dryrun_info._scheduler = scheduler
|
|
453
|
+
|
|
454
|
+
event = ctx._torchx_event
|
|
455
|
+
event.scheduler = scheduler
|
|
456
|
+
event.runcfg = json.dumps(cfg) if cfg else None
|
|
457
|
+
event.app_id = app.name
|
|
458
|
+
event.app_image = none_throws(dryrun_info._app).roles[0].image
|
|
459
|
+
event.app_metadata = app.metadata
|
|
460
|
+
|
|
469
461
|
return dryrun_info
|
|
470
462
|
|
|
471
463
|
def scheduler_run_opts(self, scheduler: str) -> runopts:
|
|
@@ -595,6 +587,16 @@ class Runner:
|
|
|
595
587
|
if status is not None and not status.is_terminal():
|
|
596
588
|
scheduler.cancel(app_id)
|
|
597
589
|
|
|
590
|
+
def delete(self, app_handle: AppHandle) -> None:
|
|
591
|
+
"""
|
|
592
|
+
Deletes the application from the scheduler.
|
|
593
|
+
"""
|
|
594
|
+
scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
|
|
595
|
+
with log_event("delete", scheduler_backend, app_id):
|
|
596
|
+
status = self.status(app_handle)
|
|
597
|
+
if status is not None:
|
|
598
|
+
scheduler.delete(app_id)
|
|
599
|
+
|
|
598
600
|
def stop(self, app_handle: AppHandle) -> None:
|
|
599
601
|
"""
|
|
600
602
|
See method ``cancel``.
|
torchx/schedulers/api.py
CHANGED
|
@@ -11,10 +11,11 @@ import re
|
|
|
11
11
|
from dataclasses import dataclass, field
|
|
12
12
|
from datetime import datetime
|
|
13
13
|
from enum import Enum
|
|
14
|
-
from typing import Generic, Iterable, List, Optional, TypeVar
|
|
14
|
+
from typing import Generic, Iterable, List, Optional, TypeVar
|
|
15
15
|
|
|
16
16
|
from torchx.specs import (
|
|
17
17
|
AppDef,
|
|
18
|
+
AppDryRunInfo,
|
|
18
19
|
AppState,
|
|
19
20
|
NONE,
|
|
20
21
|
NULL_RESOURCE,
|
|
@@ -95,11 +96,9 @@ class ListAppResponse:
|
|
|
95
96
|
|
|
96
97
|
|
|
97
98
|
T = TypeVar("T")
|
|
98
|
-
A = TypeVar("A")
|
|
99
|
-
D = TypeVar("D")
|
|
100
99
|
|
|
101
100
|
|
|
102
|
-
class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
101
|
+
class Scheduler(abc.ABC, Generic[T]):
|
|
103
102
|
"""
|
|
104
103
|
An interface abstracting functionalities of a scheduler.
|
|
105
104
|
Implementers need only implement those methods annotated with
|
|
@@ -129,9 +128,9 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
129
128
|
|
|
130
129
|
def submit(
|
|
131
130
|
self,
|
|
132
|
-
app: A,
|
|
131
|
+
app: AppDef,
|
|
133
132
|
cfg: T,
|
|
134
|
-
workspace:
|
|
133
|
+
workspace: str | Workspace | None = None,
|
|
135
134
|
) -> str:
|
|
136
135
|
"""
|
|
137
136
|
Submits the application to be run by the scheduler.
|
|
@@ -145,14 +144,19 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
145
144
|
resolved_cfg = self.run_opts().resolve(cfg)
|
|
146
145
|
if workspace:
|
|
147
146
|
assert isinstance(self, WorkspaceMixin)
|
|
148
|
-
|
|
147
|
+
|
|
148
|
+
if isinstance(workspace, str):
|
|
149
|
+
workspace = Workspace.from_str(workspace)
|
|
150
|
+
|
|
151
|
+
app.roles[0].workspace = workspace
|
|
152
|
+
self.build_workspaces(app.roles, resolved_cfg)
|
|
149
153
|
|
|
150
154
|
# pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
|
|
151
155
|
dryrun_info = self.submit_dryrun(app, resolved_cfg)
|
|
152
156
|
return self.schedule(dryrun_info)
|
|
153
157
|
|
|
154
158
|
@abc.abstractmethod
|
|
155
|
-
def schedule(self, dryrun_info: D) -> str:
|
|
159
|
+
def schedule(self, dryrun_info: AppDryRunInfo) -> str:
|
|
156
160
|
"""
|
|
157
161
|
Same as ``submit`` except that it takes an ``AppDryRunInfo``.
|
|
158
162
|
Implementers are encouraged to implement this method rather than
|
|
@@ -168,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
168
172
|
|
|
169
173
|
raise NotImplementedError()
|
|
170
174
|
|
|
171
|
-
def submit_dryrun(self, app: A, cfg: T) -> D:
|
|
175
|
+
def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
|
|
172
176
|
"""
|
|
173
177
|
Rather than submitting the request to run the app, returns the
|
|
174
178
|
request object that would have been submitted to the underlying
|
|
@@ -182,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
182
186
|
# pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
|
|
183
187
|
dryrun_info = self._submit_dryrun(app, resolved_cfg)
|
|
184
188
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
189
|
+
for role in app.roles:
|
|
190
|
+
dryrun_info = role.pre_proc(self.backend, dryrun_info)
|
|
191
|
+
|
|
188
192
|
dryrun_info._app = app
|
|
189
193
|
dryrun_info._cfg = resolved_cfg
|
|
190
194
|
return dryrun_info
|
|
191
195
|
|
|
192
196
|
@abc.abstractmethod
|
|
193
|
-
def _submit_dryrun(self, app: A, cfg: T) -> D:
|
|
197
|
+
def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
|
|
194
198
|
raise NotImplementedError()
|
|
195
199
|
|
|
196
200
|
def run_opts(self) -> runopts:
|
|
@@ -259,6 +263,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
259
263
|
# do nothing if the app does not exist
|
|
260
264
|
return
|
|
261
265
|
|
|
266
|
+
def delete(self, app_id: str) -> None:
|
|
267
|
+
"""
|
|
268
|
+
Deletes the job information for the specified ``app_id`` from the
|
|
269
|
+
scheduler's data-plane. Basically "deep-purging" the job from the
|
|
270
|
+
scheduler's data-plane. Calling this API on a "live" job (e.g in a
|
|
271
|
+
non-terminal status such as PENDING or RUNNING) cancels the job.
|
|
272
|
+
|
|
273
|
+
Note that this API is only relevant for schedulers for which its
|
|
274
|
+
data-plane persistently stores the "JobDefinition" (which is often
|
|
275
|
+
versioned). AWS Batch and Kubernetes are examples of such schedulers.
|
|
276
|
+
On these schedulers, a finished job may fall out of the data-plane
|
|
277
|
+
(e.g. really old finished jobs get deleted) but the JobDefinition is
|
|
278
|
+
typically permanently stored. In this case, calling
|
|
279
|
+
:py:meth:`~cancel` would not delete the job definition.
|
|
280
|
+
|
|
281
|
+
In schedulers with no such feature (e.g. SLURM)
|
|
282
|
+
:py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
|
|
283
|
+
default implementation. Hence implementors of such schedulers need not
|
|
284
|
+
override this method.
|
|
285
|
+
|
|
286
|
+
.. warning::
|
|
287
|
+
Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
|
|
288
|
+
the scheduler's data-plane does nothing. The user is responsible for
|
|
289
|
+
manually tracking down and cleaning up any dangling resources related
|
|
290
|
+
to the job.
|
|
291
|
+
"""
|
|
292
|
+
if self.exists(app_id):
|
|
293
|
+
self._delete_existing(app_id)
|
|
294
|
+
|
|
295
|
+
def _delete_existing(self, app_id: str) -> None:
|
|
296
|
+
"""
|
|
297
|
+
Deletes the job information for the specified ``app_id`` from the
|
|
298
|
+
scheduler's data-plane. This method will only be called on an
|
|
299
|
+
application that exists.
|
|
300
|
+
|
|
301
|
+
The default implementation calls :py:meth:`~_cancel_existing` which is
|
|
302
|
+
appropriate for schedulers without persistent job definitions.
|
|
303
|
+
"""
|
|
304
|
+
self._cancel_existing(app_id)
|
|
305
|
+
|
|
262
306
|
def log_iter(
|
|
263
307
|
self,
|
|
264
308
|
app_id: str,
|
|
@@ -349,15 +393,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
|
|
|
349
393
|
"""
|
|
350
394
|
pass
|
|
351
395
|
|
|
352
|
-
def _validate(self, app: A, scheduler: str, cfg: T) -> None:
|
|
396
|
+
def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
|
|
353
397
|
"""
|
|
354
398
|
Validates after workspace build whether application is consistent with the scheduler.
|
|
355
399
|
|
|
356
400
|
Raises error if application is not compatible with scheduler
|
|
357
401
|
"""
|
|
358
|
-
if not isinstance(app, AppDef):
|
|
359
|
-
return
|
|
360
|
-
|
|
361
402
|
for role in app.roles:
|
|
362
403
|
if role.resource == NULL_RESOURCE:
|
|
363
404
|
raise ValueError(
|
|
@@ -381,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
|
|
|
381
381
|
|
|
382
382
|
|
|
383
383
|
@_thread_local_cache
|
|
384
|
-
def _local_session() -> "boto3.session.Session":
|
|
384
|
+
def _local_session() -> "boto3.session.Session": # noqa: F821
|
|
385
385
|
import boto3.session
|
|
386
386
|
|
|
387
387
|
return boto3.session.Session()
|
|
@@ -399,9 +399,7 @@ class AWSBatchOpts(TypedDict, total=False):
|
|
|
399
399
|
ulimits: Optional[list[str]]
|
|
400
400
|
|
|
401
401
|
|
|
402
|
-
class AWSBatchScheduler(
|
|
403
|
-
DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
|
|
404
|
-
):
|
|
402
|
+
class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
|
|
405
403
|
"""
|
|
406
404
|
AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
|
|
407
405
|
|
|
@@ -157,7 +157,7 @@ def _merge_ordered(
|
|
|
157
157
|
|
|
158
158
|
class AWSSageMakerScheduler(
|
|
159
159
|
DockerWorkspaceMixin,
|
|
160
|
-
Scheduler[AWSSageMakerOpts
|
|
160
|
+
Scheduler[AWSSageMakerOpts],
|
|
161
161
|
):
|
|
162
162
|
"""
|
|
163
163
|
AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
|
|
@@ -129,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
|
|
|
129
129
|
privileged: bool
|
|
130
130
|
|
|
131
131
|
|
|
132
|
-
class DockerScheduler(
|
|
133
|
-
DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
|
|
134
|
-
):
|
|
132
|
+
class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
|
|
135
133
|
"""
|
|
136
134
|
DockerScheduler is a TorchX scheduling interface to Docker.
|
|
137
135
|
|
|
@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
|
|
|
796
796
|
network: Optional[str]
|
|
797
797
|
|
|
798
798
|
|
|
799
|
-
class KubernetesMCADScheduler(
|
|
800
|
-
DockerWorkspaceMixin,
|
|
801
|
-
Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
|
|
802
|
-
):
|
|
799
|
+
class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
|
|
803
800
|
"""
|
|
804
801
|
KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
|
|
805
802
|
|