torchx-nightly 2025.11.12__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/main.py +2 -0
- torchx/runner/api.py +10 -0
- torchx/schedulers/api.py +51 -15
- torchx/schedulers/aws_batch_scheduler.py +2 -4
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +1 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +154 -18
- torchx/schedulers/local_scheduler.py +1 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +9 -3
- torchx/specs/__init__.py +17 -3
- torchx/specs/api.py +3 -1
- torchx/specs/overlays.py +106 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +2 -2
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +22 -20
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/licenses/LICENSE +0 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/cli/cmd_delete.py
ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import argparse
+import logging
+
+from torchx.cli.cmd_base import SubCommand
+from torchx.runner import get_runner
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class CmdDelete(SubCommand):
+    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+        subparser.add_argument(
+            "app_handle",
+            type=str,
+            help="torchx app handle (e.g. local://session-name/app-id)",
+        )
+
+    def run(self, args: argparse.Namespace) -> None:
+        app_handle = args.app_handle
+        runner = get_runner()
+        runner.delete(app_handle)
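
For orientation, a minimal sketch of exercising the new subcommand's argument parsing in isolation (the handle value below is hypothetical):

    import argparse

    from torchx.cli.cmd_delete import CmdDelete

    parser = argparse.ArgumentParser(prog="torchx delete")
    CmdDelete().add_arguments(parser)
    # `torchx delete local://default/echo_abc123` would parse to:
    args = parser.parse_args(["local://default/echo_abc123"])
    assert args.app_handle == "local://default/echo_abc123"
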
torchx/cli/main.py
CHANGED
@@ -16,6 +16,7 @@ import torchx
 from torchx.cli.cmd_base import SubCommand
 from torchx.cli.cmd_cancel import CmdCancel
 from torchx.cli.cmd_configure import CmdConfigure
+from torchx.cli.cmd_delete import CmdDelete
 from torchx.cli.cmd_describe import CmdDescribe
 from torchx.cli.cmd_list import CmdList
 from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
         "builtins": CmdBuiltins(),
         "cancel": CmdCancel(),
         "configure": CmdConfigure(),
+        "delete": CmdDelete(),
         "describe": CmdDescribe(),
         "list": CmdList(),
         "log": CmdLog(),
torchx/runner/api.py
CHANGED
@@ -587,6 +587,16 @@ class Runner:
         if status is not None and not status.is_terminal():
             scheduler.cancel(app_id)
 
+    def delete(self, app_handle: AppHandle) -> None:
+        """
+        Deletes the application from the scheduler.
+        """
+        scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
+        with log_event("delete", scheduler_backend, app_id):
+            status = self.status(app_handle)
+            if status is not None:
+                scheduler.delete(app_id)
+
     def stop(self, app_handle: AppHandle) -> None:
         """
         See method ``cancel``.
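
A usage sketch of the new ``Runner.delete`` (the handle is hypothetical; per the scheduler contract below, delete on a live job cancels it first and is a no-op once the job has fallen out of the scheduler's data-plane):

    from torchx.runner import get_runner

    runner = get_runner()
    app_handle = "kubernetes://torchx/default:trainer-x1"  # hypothetical handle
    if runner.status(app_handle) is not None:
        runner.delete(app_handle)  # cancels the job if live, then purges it
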
torchx/schedulers/api.py
CHANGED
@@ -11,10 +11,11 @@ import re
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Generic, Iterable, List, Optional, TypeVar
+from typing import Generic, Iterable, List, Optional, TypeVar
 
 from torchx.specs import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     NONE,
     NULL_RESOURCE,
@@ -95,11 +96,9 @@ class ListAppResponse:
 
 
 T = TypeVar("T")
-A = TypeVar("A")
-D = TypeVar("D")
 
 
-class Scheduler(abc.ABC, Generic[T, A, D]):
+class Scheduler(abc.ABC, Generic[T]):
     """
     An interface abstracting functionalities of a scheduler.
     Implementers need only implement those methods annotated with
@@ -129,7 +128,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
 
     def submit(
         self,
-        app: A,
+        app: AppDef,
         cfg: T,
         workspace: str | Workspace | None = None,
     ) -> str:
@@ -157,7 +156,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         return self.schedule(dryrun_info)
 
     @abc.abstractmethod
-    def schedule(self, dryrun_info: D) -> str:
+    def schedule(self, dryrun_info: AppDryRunInfo) -> str:
         """
         Same as ``submit`` except that it takes an ``AppDryRunInfo``.
         Implementers are encouraged to implement this method rather than
@@ -173,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
 
         raise NotImplementedError()
 
-    def submit_dryrun(self, app: A, cfg: T) -> D:
+    def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         """
         Rather than submitting the request to run the app, returns the
         request object that would have been submitted to the underlying
@@ -187,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         # pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self._submit_dryrun(app, resolved_cfg)
 
-
-
-
+        for role in app.roles:
+            dryrun_info = role.pre_proc(self.backend, dryrun_info)
+
         dryrun_info._app = app
         dryrun_info._cfg = resolved_cfg
         return dryrun_info
 
     @abc.abstractmethod
-    def _submit_dryrun(self, app: A, cfg: T) -> D:
+    def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         raise NotImplementedError()
 
     def run_opts(self) -> runopts:
@@ -264,6 +263,46 @@
         # do nothing if the app does not exist
         return
 
+    def delete(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. Basically "deep-purging" the job from the
+        scheduler's data-plane. Calling this API on a "live" job (e.g in a
+        non-terminal status such as PENDING or RUNNING) cancels the job.
+
+        Note that this API is only relevant for schedulers for which its
+        data-plane persistently stores the "JobDefinition" (which is often
+        versioned). AWS Batch and Kubernetes are examples of such schedulers.
+        On these schedulers, a finished job may fall out of the data-plane
+        (e.g. really old finished jobs get deleted) but the JobDefinition is
+        typically permanently stored. In this case, calling
+        :py:meth:`~cancel` would not delete the job definition.
+
+        In schedulers with no such feature (e.g. SLURM)
+        :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
+        default implementation. Hence implementors of such schedulers need not
+        override this method.
+
+        .. warning::
+            Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
+            the scheduler's data-plane does nothing. The user is responsible for
+            manually tracking down and cleaning up any dangling resources related
+            to the job.
+        """
+        if self.exists(app_id):
+            self._delete_existing(app_id)
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. This method will only be called on an
+        application that exists.
+
+        The default implementation calls :py:meth:`~_cancel_existing` which is
+        appropriate for schedulers without persistent job definitions.
+        """
+        self._cancel_existing(app_id)
+
     def log_iter(
         self,
         app_id: str,
@@ -354,15 +393,12 @@
         """
         pass
 
-    def _validate(self, app: A, scheduler: str, cfg: T) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
         """
         Validates after workspace build whether application is consistent with the scheduler.
 
         Raises error if application is not compatible with scheduler
         """
-        if not isinstance(app, AppDef):
-            return
-
         for role in app.roles:
             if role.resource == NULL_RESOURCE:
                 raise ValueError(
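
For scheduler authors, ``delete``/``_delete_existing`` mirror the existing ``cancel``/``_cancel_existing`` pair. A minimal sketch of a backend with a persistent job-definition store overriding the new hook (the class and the purge call are hypothetical; other required ``Scheduler`` methods are omitted):

    from torchx.schedulers.api import Scheduler


    class PersistentScheduler(Scheduler[dict]):  # hypothetical backend
        def _delete_existing(self, app_id: str) -> None:
            # per the delete() contract, a live job must be aborted first
            self._cancel_existing(app_id)
            self._purge_job_definition(app_id)  # hypothetical data-plane purge

        def _purge_job_definition(self, app_id: str) -> None:
            ...  # call the backend's job-definition store here
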
torchx/schedulers/aws_batch_scheduler.py
CHANGED
@@ -381,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
 
 
 @_thread_local_cache
-def _local_session() -> "boto3.session.Session":
+def _local_session() -> "boto3.session.Session":  # noqa: F821
     import boto3.session
 
     return boto3.session.Session()
@@ -399,9 +399,7 @@ class AWSBatchOpts(TypedDict, total=False):
     ulimits: Optional[list[str]]
 
 
-class AWSBatchScheduler(
-    DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
-):
+class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
     """
     AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
 
torchx/schedulers/aws_sagemaker_scheduler.py
CHANGED
@@ -157,7 +157,7 @@ def _merge_ordered(
 
 class AWSSageMakerScheduler(
     DockerWorkspaceMixin,
-    Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
+    Scheduler[AWSSageMakerOpts],
 ):
     """
     AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
torchx/schedulers/docker_scheduler.py
CHANGED
@@ -129,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
     privileged: bool
 
 
-class DockerScheduler(
-    DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
-):
+class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
     """
     DockerScheduler is a TorchX scheduling interface to Docker.
 
torchx/schedulers/ids.py
CHANGED
@@ -8,9 +8,9 @@
 # pyre-strict
 
 import os
-import random
 import struct
 
+
 START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
 END_CANDIDATES: str = START_CANDIDATES + "012345679"
 
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
     """
     Appends a unique 64-bit string to the input argument.
 
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+
+    Args:
+        name: the name string to unique-ify
+        string_length: max length of the unique 64-bit string to append to the ``name``.
+            Default is 0, which returns the length of a randomly generated 64-bit string (typically 11-14 characters long).
+
     Returns:
-        string in format
+        string in format ``{name}-{unique_suffix}``
     """
-    return (
-        f"{name}-{random_id()}"
-        if string_length == 0
-        else f"{name}-{get_len_random_id(string_length)}"
-    )
+    max_length = None if string_length == 0 else string_length
+    return f"{name}-{random_id(max_length)}"
 
 
 def random_uint64() -> int:
@@ -36,13 +41,24 @@ def random_uint64() -> int:
     return struct.unpack("!Q", os.urandom(8))[0]
 
 
-def random_id() -> str:
+def random_id(max_length: int | None = None) -> str:
     """
     Generates an alphanumeric string ID that matches the requirements from
     https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
+
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+
+    If ``max_length`` is provided, the returned ID will be at most that many characters long.
+
     """
+    # If a max_length is provided and is non-positive, return empty string
+    if max_length is not None and max_length <= 0:
+        return ""
+
     out = ""
     v = random_uint64()
+
     while v > 0:
         if out == "":
             candidates = START_CANDIDATES
@@ -52,21 +68,9 @@ def random_id() -> str:
         char = v % len(candidates)
         v = v // len(candidates)
         out += candidates[char]
-    return out
-
-
-def get_len_random_id(string_length: int) -> str:
-    """
-    Generates an alphanumeric string ID that matches the requirements from
-    https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
-    """
-    out = ""
-    for i in range(string_length):
-        if out == "":
-            candidates = START_CANDIDATES
-        else:
-            candidates = END_CANDIDATES
 
-        out += random.choice(candidates)
+        if max_length is not None and len(out) >= max_length:
+            break
 
+    # NOTE: statistically the length of `out` is typically between 12-14 characters long
     return out
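
A behavior sketch of the reworked helpers (printed suffixes are illustrative only; actual values are random):

    from torchx.schedulers.ids import make_unique, random_id

    print(make_unique("trainer"))     # e.g. trainer-pf3qxm5kcnd (suffix ~11-14 chars)
    print(make_unique("trainer", 5))  # suffix capped at 5 chars, e.g. trainer-pf3qx
    print(repr(random_id(0)))         # '' -- a non-positive max_length yields ""
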
torchx/schedulers/kubernetes_mcad_scheduler.py
CHANGED
@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
     network: Optional[str]
 
 
-class KubernetesMCADScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
-):
+class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
     """
     KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
 
torchx/schedulers/kubernetes_scheduler.py
CHANGED
@@ -149,7 +149,6 @@ from torchx.specs.api import (
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
 
-
 if TYPE_CHECKING:
     from docker import DockerClient
     from kubernetes.client import ApiClient, CustomObjectsApi
@@ -159,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -294,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
@@ -324,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
@@ -475,6 +493,9 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
@@ -507,7 +528,14 @@ def app_to_resource(
         replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
         replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
-        pod = role_to_pod(name, replica_role, service_account)
+        pod = role_to_pod(
+            name,
+            replica_role,
+            service_account,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
         if k8s_metadata := role.metadata.get("kubernetes"):
             if isinstance(k8s_metadata, str):
                 import fsspec
@@ -589,12 +617,12 @@ class KubernetesOpts(TypedDict, total=False):
     service_account: Optional[str]
     priority_class: Optional[str]
     validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
-class KubernetesScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
-):
+class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     """
     KubernetesScheduler is a TorchX scheduling interface to Kubernetes.
 
@@ -622,6 +650,16 @@ class KubernetesScheduler(
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -700,9 +738,14 @@ class KubernetesScheduler(
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -776,7 +819,26 @@ class KubernetesScheduler(
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
 
         if cfg.get("validate_spec"):
             try:
@@ -818,6 +880,31 @@ class KubernetesScheduler(
             pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -857,9 +944,29 @@ class KubernetesScheduler(
             help="Validate job spec using Kubernetes API dry-run before submission",
             default=True,
         )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
         from kubernetes.client.rest import ApiException
 
         namespace, name = app_id.split(":")
@@ -885,18 +992,44 @@ class KubernetesScheduler(
         TASK_STATUS_COUNT = "taskStatusCount"
 
         if TASK_STATUS_COUNT in status:
-            for name, status in status[TASK_STATUS_COUNT].items():
-                role, _, idx = name.rpartition("-")
+            for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                role, _, idx = task_name.rpartition("-")
 
-                state_str = next(iter(status["phase"].keys()))
+                state_str = next(iter(task_status["phase"].keys()))
                 state = TASK_STATE[state_str]
 
                 if role not in roles:
                     roles[role] = Role(name=role, num_replicas=0, image="")
                     roles_statuses[role] = RoleStatus(role, [])
                 roles[role].num_replicas += 1
+
+                # Pod name follows the pattern: {job_name}-{task_name}-0
+                # Get the pod to retrieve its IP address
+                pod_name_k8s = f"{name}-{task_name}-0"
+                hostname = ""
+                try:
+                    core_api = client.CoreV1Api(self._api_client())
+                    pod = core_api.read_namespaced_pod(
+                        name=pod_name_k8s, namespace=namespace
+                    )
+                    pod_ip = pod.status.pod_ip
+
+                    if pod_ip is not None:
+                        # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                        pod_ip_dashed = pod_ip.replace(".", "-")
+
+                        # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                        # Note: This will only be useful if the client using the IPs is in the cluster.
+                        hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                except ApiException:
+                    # Pod not found - hostname remains empty
+                    pass
+
                 roles_statuses[role].replicas.append(
-                    ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                    ReplicaStatus(
+                        id=int(idx), role=role, state=state, hostname=hostname
+                    )
                 )
         else:
             app_state = AppState.UNKNOWN
@@ -940,7 +1073,10 @@ class KubernetesScheduler(
         core_api = client.CoreV1Api(self._api_client())
         if should_tail:
             w = watch.Watch()
-            iterator = w.stream(core_api.read_namespaced_pod_log, **args)
+            iterator = (
+                f"{line}\n"
+                for line in w.stream(core_api.read_namespaced_pod_log, **args)
+            )
         else:
            resp = core_api.read_namespaced_pod_log(**args)
            iterator = split_lines(resp)
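
A sketch of passing the three new Kubernetes options through the runner (the queue name and the app are placeholders):

    from torchx.runner import get_runner
    from torchx.specs import AppDef

    app: AppDef = ...  # an AppDef built via components or by hand
    runner = get_runner()
    runner.run(
        app,
        scheduler="kubernetes",
        cfg={
            "queue": "default",
            "reserved_millicpu": 250,  # reserve 250m when deriving CPU requests from limits
            "reserved_memmb": 2048,    # reserve 2048MB when deriving memory requests
            "efa_device_count": 0,     # strip vpc.amazonaws.com/efa from the pod limits
        },
    )
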
torchx/schedulers/local_scheduler.py
CHANGED
@@ -529,7 +529,7 @@ def _register_termination_signals() -> None:
     signal.signal(signal.SIGINT, _terminate_process_handler)
 
 
-class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
+class LocalScheduler(Scheduler[LocalOpts]):
     """
     Schedules on localhost. Containers are modeled as processes and
     certain properties of the container that are either not relevant
torchx/schedulers/slurm_scheduler.py
CHANGED
@@ -135,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
     "mail-type",
+    "account",
 }
 SBATCH_GROUP_OPTIONS = {
     "partition",
@@ -159,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
 SlurmOpts = TypedDict(
     "SlurmOpts",
     {
+        "account": Optional[str],
         "partition": str,
         "time": str,
         "comment": Optional[str],
@@ -335,9 +337,7 @@ fi
 {self.materialize()}"""
 
 
-class SlurmScheduler(
-    DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
-):
+class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
     """
     SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
     that slurm CLI tools are locally installed and job accounting is enabled.
@@ -406,6 +406,12 @@ class SlurmScheduler(
 
     def _run_opts(self) -> runopts:
         opts = runopts()
+        opts.add(
+            "account",
+            type_=str,
+            help="The account to use for the slurm job.",
+            default=None,
+        )
         opts.add(
             "partition",
             type_=str,
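
Since ``account`` joins SBATCH_JOB_OPTIONS, it is presumably rendered like the other entries there (i.e. an ``--account=...`` sbatch option). A usage sketch with hypothetical account and partition names:

    from torchx.runner import get_runner
    from torchx.specs import AppDef

    app: AppDef = ...  # an AppDef built elsewhere
    runner = get_runner()
    runner.run(app, scheduler="slurm", cfg={"account": "my-lab", "partition": "gpu"})
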
torchx/specs/__init__.py
CHANGED
@@ -14,7 +14,7 @@ scheduler or pipeline adapter.
 import difflib
 
 import os
-from typing import Callable, Dict, Mapping, Optional
+from typing import Callable, Dict, Iterator, Mapping, Optional
 
 from torchx.specs.api import (
     ALL,
@@ -113,8 +113,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories
 
-    def __iter__(self) ->
-
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx import specs
+
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+
+        """
+        for key in _named_resource_factories:
+            yield (key)
 
 
 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
torchx/specs/api.py
CHANGED
@@ -253,7 +253,9 @@ class macros:
                 current_dict[k] = self.substitute(v)
             elif isinstance(v, list):
                 for i in range(len(v)):
-                    if isinstance(v[i], str):
+                    if isinstance(v[i], dict):
+                        stack.append(v[i])
+                    elif isinstance(v[i], str):
                         v[i] = self.substitute(v[i])
         return d
 
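
To illustrate what this fix enables, here is a standalone re-implementation of the traversal (not the library's actual method, and using ``str.format`` as a stand-in for ``macros.substitute``): dicts found inside lists are now pushed onto the work stack, so placeholders nested there get substituted too.

    def substitute_deep(d: dict, subs: dict) -> dict:
        """Iteratively walk a dict, substituting placeholders in strings."""
        stack = [d]
        while stack:
            current = stack.pop()
            for k, v in current.items():
                if isinstance(v, dict):
                    stack.append(v)
                elif isinstance(v, str):
                    current[k] = v.format(**subs)  # stand-in for macros.substitute
                elif isinstance(v, list):
                    for i in range(len(v)):
                        if isinstance(v[i], dict):
                            stack.append(v[i])  # the new case added by this diff
                        elif isinstance(v[i], str):
                            v[i] = v[i].format(**subs)
        return d


    d = {"volumes": [{"mount": "logs-{app_id}"}]}
    print(substitute_deep(d, {"app_id": "app_123"}))
    # -> {'volumes': [{'mount': 'logs-app_123'}]}
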
torchx/specs/overlays.py
ADDED
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+"""
+Overlays are JSON structs applied to :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`
+to specify attributes of the scheduler's submit-job request that are not currently representable
+as attributes of :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`.
+
+For end-users, here are a few use-cases of overlays:
+
+1. A new version of the scheduler has concepts/features that have not yet been added to TorchX.
+2. A bespoke internal scheduler has custom features that do not generalize hence not in TorchX.
+3. Re-using a pre-built ``AppDef`` but need to make a small change to the resulting scheduler request.
+
+And for scheduler authors:
+
+1. Scheduler setting needs to be applied to a ``Role``, which makes it hard to add as ``runopts``
+   since ``runopts`` apply at the ``AppDef`` level.
+2. Scheduler setting cannot be represented naturally as the types supported by ``runopts``.
+3. Exposing the setting as a ``runopts`` obfuscates things.
+
+See :py:func:`~torchx.specs.overlays.apply_overlay` for rules on how overlays are applied.
+"""
+
+from typing import Any
+
+Json = dict[str, Any]
+
+
+def apply_overlay(base: Json, overlay: Json) -> None:
+    """Applies ``overlay`` on ``base``.
+
+    .. note:: this function mutates the ``base``!
+
+    Overlays follow these rules:
+
+    1. Dicts, upsert key, value in base with the ones in overlay.
+    2. Nested dicts, overlay recursively.
+    3. Lists, append the overlay values to the base values.
+    4. Nested lists DO NOT append recursively.
+    5. Primitives (bool, str, int, float), replace base with the value in overlay.
+
+    .. doctest::
+
+        from torchx.specs.overlays import apply_overlay
+
+        base = {
+            "scheduler": {"policy": "default"},
+            "resources": {"limits": {"cpu": "500m"}},
+            "tolerations": [{"key": "gpu"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": []}]
+            ],
+            "maxPods": 110,
+        }
+        overlay = {
+            "scheduler": {"policy": "binpacking"},
+            "resources": {"limits": {"memory": "1Gi"}},
+            "tolerations": [{"key": "spot"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": [{"key": "disk"}]}]
+            ],
+            "maxPods": 250,
+        }
+
+        apply_overlay(base, overlay)
+
+        assert {
+            "scheduler": {"policy": "binpacking"},
+            "resources": {"limits": {"cpu": "500m", "memory": "1Gi"}},
+            "tolerations": [{"key": "gpu"}, {"key": "spot"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": []}],
+                [{"matchExpressions": [{"key": "disk"}]}],
+            ],
+            "maxPods": 250,
+        } == base
+
+    """
+
+    def assert_type_equal(key: str, o1: object, o2: object) -> None:
+        o1_type = type(o1)
+        o2_type = type(o2)
+        assert (
+            o1_type == o2_type
+        ), f"Type mismatch for attr: `{key}`. {o1_type.__qualname__} != {o2_type.__qualname__}"
+
+    for key, overlay_value in overlay.items():
+        if key in base:
+            base_value = base[key]
+
+            assert_type_equal(key, base_value, overlay_value)
+
+            if isinstance(base_value, dict) and isinstance(overlay_value, dict):
+                apply_overlay(base_value, overlay_value)
+            elif isinstance(base_value, list) and isinstance(overlay_value, list):
+                base_value.extend(overlay_value)
+            else:
+                base[key] = overlay_value
+        else:
+            base[key] = overlay_value
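
A quick usage sketch beyond the doctest above: type mismatches between a base value and its overlay fail fast via the internal assertion.

    from torchx.specs.overlays import apply_overlay

    base = {"env": {"LOG_LEVEL": "INFO"}, "args": ["--epochs", "10"]}
    apply_overlay(base, {"env": {"LOG_LEVEL": "DEBUG"}, "args": ["--resume"]})
    assert base == {"env": {"LOG_LEVEL": "DEBUG"}, "args": ["--epochs", "10", "--resume"]}

    try:
        apply_overlay(base, {"env": "DEBUG"})  # str overlaying a dict
    except AssertionError as e:
        print(e)  # Type mismatch for attr: `env`. dict != str
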
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchx-nightly
-Version: 2025.11.12
+Version: 2026.1.11
 Summary: TorchX SDK and Components
 Home-page: https://github.com/meta-pytorch/torchx
 Author: TorchX Devs
@@ -47,7 +47,7 @@ Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
 Requires-Dist: tensorboard==2.14.0; extra == "dev"
-Requires-Dist: sagemaker==2.
+Requires-Dist: sagemaker==2.237.3; extra == "dev"
 Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
 Requires-Dist: torch; extra == "dev"
 Requires-Dist: torchmetrics==1.6.3; extra == "dev"
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD
CHANGED
@@ -14,6 +14,7 @@ torchx/cli/argparse_util.py,sha256=kZb1ubEHDrBsmrxpySFRQCW7wmHuRHD8eAInuEZjlsI,3
 torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
 torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
 torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
+torchx/cli/cmd_delete.py,sha256=US1f6Jvyhz4R_0Q0a8GeNTDMrhzo8WE_ECcdOf0MjKE,835
 torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
 torchx/cli/cmd_list.py,sha256=alkS9aIaDI8lX3W8uj8Vtr3IU3G2VeCuokKSd3zOFug,1409
 torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
@@ -22,7 +23,7 @@ torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,130
 torchx/cli/cmd_status.py,sha256=22IAEmKs0qkG6kJi83u9dRX2Q-ntT7yehVx7FxtY-vQ,2114
 torchx/cli/cmd_tracker.py,sha256=9gmOmYi-89qQRGQfSrXCTto7ve54_JKFqs_wa7oRUA8,5223
 torchx/cli/colors.py,sha256=yLMes7e_UoLAfhxE0W6edhc58t83UHAlnCN2ANPeuXw,568
-torchx/cli/main.py,sha256=
+torchx/cli/main.py,sha256=1DJTmKdvPW_7hod8OUVT3Br2uwsZVEDU-2bTE0NJ0zY,3559
 torchx/components/__init__.py,sha256=JaVte0j9Gqi6IrjZKudJ2Kr3gkdHsvlCdRTo-zYpSRo,11815
 torchx/components/component_test_base.py,sha256=22iNSdVa_qTW3SMM30Pw5UEWlK4DZVw0C03EqYiaLOI,4150
 torchx/components/dist.py,sha256=6DNPEvHVqEifmM8g1L7HVY169cQv_7tSfSlh3o6lTp4,14930
@@ -49,7 +50,7 @@ torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIo
 torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
 torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
 torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
-torchx/runner/api.py,sha256=
+torchx/runner/api.py,sha256=Qi12Kjkr_zpQBesbLuCtgKET8JhHnQk22MV7Czi4l1A,30832
 torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
 torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
 torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -58,25 +59,26 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
 torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
-torchx/schedulers/api.py,sha256=
-torchx/schedulers/aws_batch_scheduler.py,sha256
-torchx/schedulers/aws_sagemaker_scheduler.py,sha256=
+torchx/schedulers/api.py,sha256=wT9H_ZTmpTHHweevDJbkV7NKXfwileHrt1bbhhCgj3c,16488
+torchx/schedulers/aws_batch_scheduler.py,sha256=b6xC4BQKb7zagOGS6_z3_6fmOLsSEOxSprkGUE-yfJE,29412
+torchx/schedulers/aws_sagemaker_scheduler.py,sha256=DnNF6huHGZLSUGWqKml4qGiWvmyDzX0i45tjsRfkedg,20881
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
-torchx/schedulers/docker_scheduler.py,sha256=
-torchx/schedulers/ids.py,sha256=
-torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=
-torchx/schedulers/kubernetes_scheduler.py,sha256=
-torchx/schedulers/local_scheduler.py,sha256=
-torchx/schedulers/lsf_scheduler.py,sha256=
-torchx/schedulers/slurm_scheduler.py,sha256=
+torchx/schedulers/docker_scheduler.py,sha256=Kud3AIzQtMekgjlqcg1eNDb8kk29aPbGYOMAvPTZdhM,16840
+torchx/schedulers/ids.py,sha256=8Qhf1Xqh845mwL-RXnWZXqIILNvml3z8udEXPFpyO7U,2247
+torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=FclJEdBdlgtBqKDbgd95oAk5Ya5XNTrwysfX7GS80GY,42896
+torchx/schedulers/kubernetes_scheduler.py,sha256=kYO08hqVlZtNe_FZQP_e8WQk1P8-8SVkXZuY3Zm_Znk,39640
+torchx/schedulers/local_scheduler.py,sha256=xGQbI02BNWGF91g00So6hCcYvR90bUAZ7fPzqnm3Ww8,41892
+torchx/schedulers/lsf_scheduler.py,sha256=vUvEJb02u7WI6y7DsWJxJFXNylRucU7FqkBX7xwLTak,17638
+torchx/schedulers/slurm_scheduler.py,sha256=ipDVDtgfqgL6c35NyoJgSPuQFt8-AeXVXAnXJVvmzrc,32032
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
-torchx/specs/__init__.py,sha256=
-torchx/specs/api.py,sha256=
+torchx/specs/__init__.py,sha256=TaC0AveTebkCMo5hmdY1wGpo09vFDqzWnsT166ionTw,7108
+torchx/specs/api.py,sha256=7FdLFfadNWqXTLJ_EtP5t1uVS2Vc_4Gj5GLFoI628oE,49338
 torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
 torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
 torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
 torchx/specs/named_resources_aws.py,sha256=ZNAbw6lD8NUlMfcJ-LpX14dMSaHO7m4Yt9iHwAF44yg,11674
 torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
+torchx/specs/overlays.py,sha256=HmY2yzC8ejgihviNWFT4rbYmP-gTcqpxVZTP6qBiIYM,3778
 torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
 torchx/specs/test/components/a/b/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -103,9 +105,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
+torchx_nightly-2026.1.11.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2026.1.11.dist-info/METADATA,sha256=VzSwxPN0aaQV3U3gNuMZMvhXiVRwO3W51DLXH1jaEr0,5323
+torchx_nightly-2026.1.11.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+torchx_nightly-2026.1.11.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2026.1.11.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2026.1.11.dist-info/RECORD,,
Files without changes: WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt