torchx-nightly 2025.11.12__py3-none-any.whl → 2026.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. torchx/cli/cmd_delete.py +30 -0
  2. torchx/cli/cmd_list.py +0 -1
  3. torchx/cli/cmd_log.py +0 -1
  4. torchx/cli/cmd_tracker.py +0 -1
  5. torchx/cli/main.py +2 -0
  6. torchx/components/component_test_base.py +0 -2
  7. torchx/components/integration_tests/integ_tests.py +0 -1
  8. torchx/distributed/__init__.py +0 -1
  9. torchx/examples/apps/lightning/profiler.py +0 -1
  10. torchx/runner/api.py +10 -1
  11. torchx/schedulers/api.py +51 -15
  12. torchx/schedulers/aws_batch_scheduler.py +3 -6
  13. torchx/schedulers/aws_sagemaker_scheduler.py +1 -2
  14. torchx/schedulers/docker_scheduler.py +1 -3
  15. torchx/schedulers/ids.py +27 -23
  16. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -6
  17. torchx/schedulers/kubernetes_scheduler.py +154 -18
  18. torchx/schedulers/local_scheduler.py +1 -2
  19. torchx/schedulers/lsf_scheduler.py +1 -1
  20. torchx/schedulers/slurm_scheduler.py +9 -3
  21. torchx/specs/__init__.py +17 -6
  22. torchx/specs/api.py +3 -1
  23. torchx/specs/finder.py +0 -1
  24. torchx/specs/overlays.py +106 -0
  25. torchx/tracker/api.py +1 -1
  26. torchx/tracker/backend/fsspec.py +0 -1
  27. torchx/tracker/mlflow.py +0 -1
  28. torchx/workspace/docker_workspace.py +0 -1
  29. {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/METADATA +2 -2
  30. {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/RECORD +34 -32
  31. {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/WHEEL +0 -0
  32. {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/entry_points.txt +0 -0
  33. {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/licenses/LICENSE +0 -0
  34. {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ import argparse
11
+ import logging
12
+
13
+ from torchx.cli.cmd_base import SubCommand
14
+ from torchx.runner import get_runner
15
+
16
+ logger: logging.Logger = logging.getLogger(__name__)
17
+
18
+
19
+ class CmdDelete(SubCommand):
20
+ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
21
+ subparser.add_argument(
22
+ "app_handle",
23
+ type=str,
24
+ help="torchx app handle (e.g. local://session-name/app-id)",
25
+ )
26
+
27
+ def run(self, args: argparse.Namespace) -> None:
28
+ app_handle = args.app_handle
29
+ runner = get_runner()
30
+ runner.delete(app_handle)
torchx/cli/cmd_list.py CHANGED
@@ -11,7 +11,6 @@ import argparse
11
11
  import logging
12
12
 
13
13
  from tabulate import tabulate
14
-
15
14
  from torchx.cli.cmd_base import SubCommand
16
15
  from torchx.runner import get_runner
17
16
  from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
torchx/cli/cmd_log.py CHANGED
@@ -27,7 +27,6 @@ from torchx.util.log_tee_helpers import (
27
27
  _find_role_replicas as find_role_replicas,
28
28
  _prefix_line,
29
29
  )
30
-
31
30
  from torchx.util.types import none_throws
32
31
 
33
32
  logger: logging.Logger = logging.getLogger(__name__)
torchx/cli/cmd_tracker.py CHANGED
@@ -10,7 +10,6 @@ import argparse
10
10
  import logging
11
11
 
12
12
  from tabulate import tabulate
13
-
14
13
  from torchx.cli.cmd_base import SubCommand
15
14
  from torchx.runner.api import get_configured_trackers
16
15
  from torchx.tracker.api import build_trackers, TrackerBase
torchx/cli/main.py CHANGED
@@ -16,6 +16,7 @@ import torchx
16
16
  from torchx.cli.cmd_base import SubCommand
17
17
  from torchx.cli.cmd_cancel import CmdCancel
18
18
  from torchx.cli.cmd_configure import CmdConfigure
19
+ from torchx.cli.cmd_delete import CmdDelete
19
20
  from torchx.cli.cmd_describe import CmdDescribe
20
21
  from torchx.cli.cmd_list import CmdList
21
22
  from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
37
38
  "builtins": CmdBuiltins(),
38
39
  "cancel": CmdCancel(),
39
40
  "configure": CmdConfigure(),
41
+ "delete": CmdDelete(),
40
42
  "describe": CmdDescribe(),
41
43
  "list": CmdList(),
42
44
  "log": CmdLog(),
@@ -25,9 +25,7 @@ from types import ModuleType
25
25
  from typing import Any, Callable, Dict, Optional
26
26
 
27
27
  from torchx.runner import get_runner
28
-
29
28
  from torchx.specs import AppDef, AppStatus
30
-
31
29
  from torchx.specs.builders import _create_args_parser
32
30
  from torchx.specs.finder import get_component
33
31
 
@@ -18,7 +18,6 @@ from torchx.cli.cmd_log import get_logs
18
18
  from torchx.components.integration_tests.component_provider import ComponentProvider
19
19
  from torchx.runner import get_runner
20
20
  from torchx.specs import AppHandle, AppState, AppStatus, CfgVal
21
-
22
21
  from torchx.util.types import none_throws
23
22
 
24
23
 
@@ -17,7 +17,6 @@ from typing import Any, Iterator
17
17
  import torch
18
18
  import torch.distributed as dist
19
19
  from torch.distributed.distributed_c10d import _get_default_group
20
-
21
20
  from torchx.util.cuda import has_cuda_devices
22
21
  from typing_extensions import Literal
23
22
 
@@ -20,7 +20,6 @@ import time
20
20
  from typing import Dict
21
21
 
22
22
  from pytorch_lightning.loggers.logger import Logger
23
-
24
23
  from pytorch_lightning.profilers.profiler import Profiler
25
24
 
26
25
 
torchx/runner/api.py CHANGED
@@ -52,7 +52,6 @@ from torchx.tracker.api import (
52
52
  tracker_config_env_var_name,
53
53
  )
54
54
  from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID
55
-
56
55
  from torchx.util.types import none_throws
57
56
  from torchx.workspace import WorkspaceMixin
58
57
 
@@ -587,6 +586,16 @@ class Runner:
587
586
  if status is not None and not status.is_terminal():
588
587
  scheduler.cancel(app_id)
589
588
 
589
+ def delete(self, app_handle: AppHandle) -> None:
590
+ """
591
+ Deletes the application from the scheduler.
592
+ """
593
+ scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
594
+ with log_event("delete", scheduler_backend, app_id):
595
+ status = self.status(app_handle)
596
+ if status is not None:
597
+ scheduler.delete(app_id)
598
+
590
599
  def stop(self, app_handle: AppHandle) -> None:
591
600
  """
592
601
  See method ``cancel``.
torchx/schedulers/api.py CHANGED
@@ -11,10 +11,11 @@ import re
11
11
  from dataclasses import dataclass, field
12
12
  from datetime import datetime
13
13
  from enum import Enum
14
- from typing import Generic, Iterable, List, Optional, TypeVar, Union
14
+ from typing import Generic, Iterable, List, Optional, TypeVar
15
15
 
16
16
  from torchx.specs import (
17
17
  AppDef,
18
+ AppDryRunInfo,
18
19
  AppState,
19
20
  NONE,
20
21
  NULL_RESOURCE,
@@ -95,11 +96,9 @@ class ListAppResponse:
95
96
 
96
97
 
97
98
  T = TypeVar("T")
98
- A = TypeVar("A")
99
- D = TypeVar("D")
100
99
 
101
100
 
102
- class Scheduler(abc.ABC, Generic[T, A, D]):
101
+ class Scheduler(abc.ABC, Generic[T]):
103
102
  """
104
103
  An interface abstracting functionalities of a scheduler.
105
104
  Implementers need only implement those methods annotated with
@@ -129,7 +128,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
129
128
 
130
129
  def submit(
131
130
  self,
132
- app: A,
131
+ app: AppDef,
133
132
  cfg: T,
134
133
  workspace: str | Workspace | None = None,
135
134
  ) -> str:
@@ -157,7 +156,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
157
156
  return self.schedule(dryrun_info)
158
157
 
159
158
  @abc.abstractmethod
160
- def schedule(self, dryrun_info: D) -> str:
159
+ def schedule(self, dryrun_info: AppDryRunInfo) -> str:
161
160
  """
162
161
  Same as ``submit`` except that it takes an ``AppDryRunInfo``.
163
162
  Implementers are encouraged to implement this method rather than
@@ -173,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
173
172
 
174
173
  raise NotImplementedError()
175
174
 
176
- def submit_dryrun(self, app: A, cfg: T) -> D:
175
+ def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
177
176
  """
178
177
  Rather than submitting the request to run the app, returns the
179
178
  request object that would have been submitted to the underlying
@@ -187,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
187
186
  # pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
188
187
  dryrun_info = self._submit_dryrun(app, resolved_cfg)
189
188
 
190
- if isinstance(app, AppDef):
191
- for role in app.roles:
192
- dryrun_info = role.pre_proc(self.backend, dryrun_info)
189
+ for role in app.roles:
190
+ dryrun_info = role.pre_proc(self.backend, dryrun_info)
191
+
193
192
  dryrun_info._app = app
194
193
  dryrun_info._cfg = resolved_cfg
195
194
  return dryrun_info
196
195
 
197
196
  @abc.abstractmethod
198
- def _submit_dryrun(self, app: A, cfg: T) -> D:
197
+ def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
199
198
  raise NotImplementedError()
200
199
 
201
200
  def run_opts(self) -> runopts:
@@ -264,6 +263,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
264
263
  # do nothing if the app does not exist
265
264
  return
266
265
 
266
+ def delete(self, app_id: str) -> None:
267
+ """
268
+ Deletes the job information for the specified ``app_id`` from the
269
+ scheduler's data-plane. Basically "deep-purging" the job from the
270
+ scheduler's data-plane. Calling this API on a "live" job (e.g. in a
271
+ non-terminal status such as PENDING or RUNNING) cancels the job.
272
+
273
+ Note that this API is only relevant for schedulers whose
274
+ data-plane persistently stores the "JobDefinition" (which is often
275
+ versioned). AWS Batch and Kubernetes are examples of such schedulers.
276
+ On these schedulers, a finished job may fall out of the data-plane
277
+ (e.g. really old finished jobs get deleted) but the JobDefinition is
278
+ typically permanently stored. In this case, calling
279
+ :py:meth:`~cancel` would not delete the job definition.
280
+
281
+ In schedulers with no such feature (e.g. SLURM)
282
+ :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
283
+ default implementation. Hence implementors of such schedulers need not
284
+ override this method.
285
+
286
+ .. warning::
287
+ Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
288
+ the scheduler's data-plane does nothing. The user is responsible for
289
+ manually tracking down and cleaning up any dangling resources related
290
+ to the job.
291
+ """
292
+ if self.exists(app_id):
293
+ self._delete_existing(app_id)
294
+
295
+ def _delete_existing(self, app_id: str) -> None:
296
+ """
297
+ Deletes the job information for the specified ``app_id`` from the
298
+ scheduler's data-plane. This method will only be called on an
299
+ application that exists.
300
+
301
+ The default implementation calls :py:meth:`~_cancel_existing` which is
302
+ appropriate for schedulers without persistent job definitions.
303
+ """
304
+ self._cancel_existing(app_id)
305
+
267
306
  def log_iter(
268
307
  self,
269
308
  app_id: str,
@@ -354,15 +393,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
354
393
  """
355
394
  pass
356
395
 
357
- def _validate(self, app: A, scheduler: str, cfg: T) -> None:
396
+ def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
358
397
  """
359
398
  Validates after workspace build whether application is consistent with the scheduler.
360
399
 
361
400
  Raises error if application is not compatible with scheduler
362
401
  """
363
- if not isinstance(app, AppDef):
364
- return
365
-
366
402
  for role in app.roles:
367
403
  if role.resource == NULL_RESOURCE:
368
404
  raise ValueError(
@@ -66,7 +66,6 @@ from torchx.schedulers.api import (
66
66
  Scheduler,
67
67
  Stream,
68
68
  )
69
-
70
69
  from torchx.schedulers.devices import get_device_mounts
71
70
  from torchx.schedulers.ids import make_unique
72
71
  from torchx.specs.api import (
@@ -188,7 +187,7 @@ def resource_requirements_from_resource(resource: Resource) -> List[Dict[str, st
188
187
 
189
188
 
190
189
  def resource_from_resource_requirements(
191
- resource_requirements: List[Dict[str, str]]
190
+ resource_requirements: List[Dict[str, str]],
192
191
  ) -> Resource:
193
192
  resrc_req = {
194
193
  ResourceType.from_str(r["type"]): int(r["value"]) for r in resource_requirements
@@ -381,7 +380,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
381
380
 
382
381
 
383
382
  @_thread_local_cache
384
- def _local_session() -> "boto3.session.Session":
383
+ def _local_session() -> "boto3.session.Session": # noqa: F821
385
384
  import boto3.session
386
385
 
387
386
  return boto3.session.Session()
@@ -399,9 +398,7 @@ class AWSBatchOpts(TypedDict, total=False):
399
398
  ulimits: Optional[list[str]]
400
399
 
401
400
 
402
- class AWSBatchScheduler(
403
- DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
404
- ):
401
+ class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
405
402
  """
406
403
  AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
407
404
 
@@ -31,7 +31,6 @@ from typing import (
31
31
 
32
32
  import boto3
33
33
  import yaml
34
-
35
34
  from sagemaker.pytorch import PyTorch
36
35
  from torchx.components.structured_arg import StructuredNameArgument
37
36
  from torchx.schedulers.api import (
@@ -157,7 +156,7 @@ def _merge_ordered(
157
156
 
158
157
  class AWSSageMakerScheduler(
159
158
  DockerWorkspaceMixin,
160
- Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
159
+ Scheduler[AWSSageMakerOpts],
161
160
  ):
162
161
  """
163
162
  AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
@@ -129,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
129
129
  privileged: bool
130
130
 
131
131
 
132
- class DockerScheduler(
133
- DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
134
- ):
132
+ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
135
133
  """
136
134
  DockerScheduler is a TorchX scheduling interface to Docker.
137
135
 
torchx/schedulers/ids.py CHANGED
@@ -8,9 +8,9 @@
8
8
  # pyre-strict
9
9
 
10
10
  import os
11
- import random
12
11
  import struct
13
12
 
13
+
14
14
  START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
15
15
  END_CANDIDATES: str = START_CANDIDATES + "012345679"
16
16
 
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
19
19
  """
20
20
  Appends a unique 64-bit string to the input argument.
21
21
 
22
+ Note that the unique string pulls entropy from `/dev/urandom` hence is not
23
+ affected by `random.seed()`
24
+
25
+ Args:
26
+ name: the name string to unique-ify
27
+ string_length: max length of the unique 64-bit string to append to the ``name``.
28
+ Default is 0, which returns the length of a randomly generated 64-bit string (typically 11-14 characters long).
29
+
22
30
  Returns:
23
- string in format $name-$unique_suffix
31
+ string in format ``{name}-{unique_suffix}``
24
32
  """
25
- return (
26
- f"{name}-{random_id()}"
27
- if string_length == 0
28
- else f"{name}-{get_len_random_id(string_length)}"
29
- )
33
+ max_length = None if string_length == 0 else string_length
34
+ return f"{name}-{random_id(max_length)}"
30
35
 
31
36
 
32
37
  def random_uint64() -> int:
@@ -36,13 +41,24 @@ def random_uint64() -> int:
36
41
  return struct.unpack("!Q", os.urandom(8))[0]
37
42
 
38
43
 
39
- def random_id() -> str:
44
+ def random_id(max_length: int | None = None) -> str:
40
45
  """
41
46
  Generates an alphanumeric string ID that matches the requirements from
42
47
  https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
48
+
49
+ Note that the unique string pulls entropy from `/dev/urandom` hence is not
50
+ affected by `random.seed()`
51
+
52
+ If ``max_length`` is provided, the returned ID will be at most that many characters long.
53
+
43
54
  """
55
+ # If a max_length is provided and is non-positive, return empty string
56
+ if max_length is not None and max_length <= 0:
57
+ return ""
58
+
44
59
  out = ""
45
60
  v = random_uint64()
61
+
46
62
  while v > 0:
47
63
  if out == "":
48
64
  candidates = START_CANDIDATES
@@ -52,21 +68,9 @@ def random_id() -> str:
52
68
  char = v % len(candidates)
53
69
  v = v // len(candidates)
54
70
  out += candidates[char]
55
- return out
56
-
57
-
58
- def get_len_random_id(string_length: int) -> str:
59
- """
60
- Generates an alphanumeric string ID that matches the requirements from
61
- https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
62
- """
63
- out = ""
64
- for i in range(string_length):
65
- if out == "":
66
- candidates = START_CANDIDATES
67
- else:
68
- candidates = END_CANDIDATES
69
71
 
70
- out += random.choice(candidates)
72
+ if max_length is not None and len(out) >= max_length:
73
+ break
71
74
 
75
+ # NOTE: statistically the length of `out` is typically between 12-14 characters long
72
76
  return out
@@ -32,7 +32,6 @@ Learn more about running distributed trainers :py:mod:`torchx.components.dist`
32
32
  import json
33
33
  import logging
34
34
  import re
35
-
36
35
  import warnings
37
36
  from dataclasses import dataclass
38
37
  from datetime import datetime
@@ -77,7 +76,6 @@ from torchx.specs.api import (
77
76
  runopts,
78
77
  VolumeMount,
79
78
  )
80
-
81
79
  from torchx.workspace.docker_workspace import DockerWorkspaceMixin
82
80
 
83
81
  if TYPE_CHECKING:
@@ -796,10 +794,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
796
794
  network: Optional[str]
797
795
 
798
796
 
799
- class KubernetesMCADScheduler(
800
- DockerWorkspaceMixin,
801
- Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
802
- ):
797
+ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
803
798
  """
804
799
  KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
805
800
 
@@ -149,7 +149,6 @@ from torchx.specs.api import (
149
149
  from torchx.util.strings import normalize_str
150
150
  from torchx.workspace.docker_workspace import DockerWorkspaceMixin
151
151
 
152
-
153
152
  if TYPE_CHECKING:
154
153
  from docker import DockerClient
155
154
  from kubernetes.client import ApiClient, CustomObjectsApi
@@ -159,6 +158,7 @@ if TYPE_CHECKING:
159
158
  )
160
159
  from kubernetes.client.rest import ApiException
161
160
 
161
+
162
162
  logger: logging.Logger = logging.getLogger(__name__)
163
163
 
164
164
  # Kubernetes reserves a small amount of resources per host for the system. For
@@ -294,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
294
294
  return api.sanitize_for_serialization(obj)
295
295
 
296
296
 
297
- def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
297
+ def role_to_pod(
298
+ name: str,
299
+ role: Role,
300
+ service_account: Optional[str],
301
+ reserved_millicpu: int = RESERVED_MILLICPU,
302
+ reserved_memmb: int = RESERVED_MEMMB,
303
+ efa_device_count: Optional[int] = None,
304
+ ) -> "V1Pod":
298
305
  from kubernetes.client.models import ( # noqa: F811 redefinition of unused
299
306
  V1Container,
300
307
  V1ContainerPort,
@@ -324,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
324
331
  if resource.cpu > 0:
325
332
  mcpu = int(resource.cpu * 1000)
326
333
  limits["cpu"] = f"{mcpu}m"
327
- request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
334
+ request_mcpu = max(mcpu - reserved_millicpu, 0)
328
335
  requests["cpu"] = f"{request_mcpu}m"
329
336
  if resource.memMB > 0:
330
337
  limits["memory"] = f"{int(resource.memMB)}M"
331
- request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
338
+ request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
332
339
  requests["memory"] = f"{request_memMB}M"
333
340
  if resource.gpu > 0:
334
341
  requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
335
342
 
343
+ EFA_DEVICE = "vpc.amazonaws.com/efa"
336
344
  for device_name, device_limit in resource.devices.items():
337
345
  limits[device_name] = str(device_limit)
338
346
 
347
+ # Handle EFA device count override:
348
+ # - None (default): use whatever count is in the resource spec (already added above)
349
+ # - 0: remove EFA devices entirely
350
+ # - N > 0: set EFA device count to N (override or add)
351
+ if efa_device_count is not None:
352
+ if efa_device_count == 0:
353
+ limits.pop(EFA_DEVICE, None)
354
+ else:
355
+ limits[EFA_DEVICE] = str(efa_device_count)
356
+
339
357
  resources = V1ResourceRequirements(
340
358
  limits=limits,
341
359
  requests=requests,
@@ -475,6 +493,9 @@ def app_to_resource(
475
493
  queue: str,
476
494
  service_account: Optional[str],
477
495
  priority_class: Optional[str] = None,
496
+ reserved_millicpu: int = RESERVED_MILLICPU,
497
+ reserved_memmb: int = RESERVED_MEMMB,
498
+ efa_device_count: Optional[int] = None,
478
499
  ) -> Dict[str, Any]:
479
500
  """
480
501
  app_to_resource creates a volcano job kubernetes resource definition from
@@ -507,7 +528,14 @@ def app_to_resource(
507
528
  replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
508
529
  replica_role.env["TORCHX_IMAGE"] = replica_role.image
509
530
 
510
- pod = role_to_pod(name, replica_role, service_account)
531
+ pod = role_to_pod(
532
+ name,
533
+ replica_role,
534
+ service_account,
535
+ reserved_millicpu,
536
+ reserved_memmb,
537
+ efa_device_count,
538
+ )
511
539
  if k8s_metadata := role.metadata.get("kubernetes"):
512
540
  if isinstance(k8s_metadata, str):
513
541
  import fsspec
@@ -589,12 +617,12 @@ class KubernetesOpts(TypedDict, total=False):
589
617
  service_account: Optional[str]
590
618
  priority_class: Optional[str]
591
619
  validate_spec: Optional[bool]
620
+ reserved_millicpu: Optional[int]
621
+ reserved_memmb: Optional[int]
622
+ efa_device_count: Optional[int]
592
623
 
593
624
 
594
- class KubernetesScheduler(
595
- DockerWorkspaceMixin,
596
- Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
597
- ):
625
+ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
598
626
  """
599
627
  KubernetesScheduler is a TorchX scheduling interface to Kubernetes.
600
628
 
@@ -622,6 +650,16 @@ class KubernetesScheduler(
622
650
  $ torchx status kubernetes://torchx_user/1234
623
651
  ...
624
652
 
653
+ **Cancellation**
654
+
655
+ Canceling a job aborts it while preserving the job spec for inspection
656
+ and cloning via kubectl apply. Use the delete command to remove the job entirely:
657
+
658
+ .. code-block:: bash
659
+
660
+ $ torchx cancel kubernetes://namespace/jobname # abort, preserves spec
661
+ $ torchx delete kubernetes://namespace/jobname # delete completely
662
+
625
663
  **Config Options**
626
664
 
627
665
  .. runopts::
@@ -700,9 +738,14 @@ class KubernetesScheduler(
700
738
  if c is None:
701
739
  configuration = client.Configuration()
702
740
  try:
703
- config.load_kube_config(client_configuration=configuration)
704
- except config.ConfigException as e:
705
- warnings.warn(f"failed to load kube config: {e}")
741
+ # Try in-cluster config first (for pods with ServiceAccount)
742
+ config.load_incluster_config(client_configuration=configuration)
743
+ except config.ConfigException:
744
+ # Fall back to kubeconfig (for local development)
745
+ try:
746
+ config.load_kube_config(client_configuration=configuration)
747
+ except config.ConfigException as e:
748
+ warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
706
749
 
707
750
  c = self._client = client.ApiClient(configuration)
708
751
 
@@ -776,7 +819,26 @@ class KubernetesScheduler(
776
819
  priority_class, str
777
820
  ), "priority_class must be a str"
778
821
 
779
- resource = app_to_resource(app, queue, service_account, priority_class)
822
+ reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
823
+ assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
824
+
825
+ reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
826
+ assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
827
+
828
+ efa_device_count = cfg.get("efa_device_count")
829
+ assert efa_device_count is None or isinstance(
830
+ efa_device_count, int
831
+ ), "efa_device_count must be an int or None"
832
+
833
+ resource = app_to_resource(
834
+ app,
835
+ queue,
836
+ service_account,
837
+ priority_class,
838
+ reserved_millicpu,
839
+ reserved_memmb,
840
+ efa_device_count,
841
+ )
780
842
 
781
843
  if cfg.get("validate_spec"):
782
844
  try:
@@ -818,6 +880,31 @@ class KubernetesScheduler(
818
880
  pass
819
881
 
820
882
  def _cancel_existing(self, app_id: str) -> None:
883
+ """
884
+ Abort a Volcano job while preserving the spec for inspection.
885
+ """
886
+ namespace, name = app_id.split(":")
887
+ vcjob = self._custom_objects_api().get_namespaced_custom_object(
888
+ group="batch.volcano.sh",
889
+ version="v1alpha1",
890
+ namespace=namespace,
891
+ plural="jobs",
892
+ name=name,
893
+ )
894
+ vcjob["status"]["state"]["phase"] = "Aborted"
895
+ self._custom_objects_api().replace_namespaced_custom_object_status(
896
+ group="batch.volcano.sh",
897
+ version="v1alpha1",
898
+ namespace=namespace,
899
+ plural="jobs",
900
+ name=name,
901
+ body=vcjob,
902
+ )
903
+
904
+ def _delete_existing(self, app_id: str) -> None:
905
+ """
906
+ Delete a Volcano job completely from the cluster.
907
+ """
821
908
  namespace, name = app_id.split(":")
822
909
  self._custom_objects_api().delete_namespaced_custom_object(
823
910
  group="batch.volcano.sh",
@@ -857,9 +944,29 @@ class KubernetesScheduler(
857
944
  help="Validate job spec using Kubernetes API dry-run before submission",
858
945
  default=True,
859
946
  )
947
+ opts.add(
948
+ "reserved_millicpu",
949
+ type_=int,
950
+ help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
951
+ default=RESERVED_MILLICPU,
952
+ )
953
+ opts.add(
954
+ "reserved_memmb",
955
+ type_=int,
956
+ help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
957
+ default=RESERVED_MEMMB,
958
+ )
959
+ opts.add(
960
+ "efa_device_count",
961
+ type_=int,
962
+ help="EFA device count override: None/unset=use resource spec, "
963
+ "0=remove EFA, N>0=set EFA count to N",
964
+ default=None,
965
+ )
860
966
  return opts
861
967
 
862
968
  def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
969
+ from kubernetes import client
863
970
  from kubernetes.client.rest import ApiException
864
971
 
865
972
  namespace, name = app_id.split(":")
@@ -885,18 +992,44 @@ class KubernetesScheduler(
885
992
  TASK_STATUS_COUNT = "taskStatusCount"
886
993
 
887
994
  if TASK_STATUS_COUNT in status:
888
- for name, status in status[TASK_STATUS_COUNT].items():
889
- role, _, idx = name.rpartition("-")
995
+ for task_name, task_status in status[TASK_STATUS_COUNT].items():
996
+ role, _, idx = task_name.rpartition("-")
890
997
 
891
- state_str = next(iter(status["phase"].keys()))
998
+ state_str = next(iter(task_status["phase"].keys()))
892
999
  state = TASK_STATE[state_str]
893
1000
 
894
1001
  if role not in roles:
895
1002
  roles[role] = Role(name=role, num_replicas=0, image="")
896
1003
  roles_statuses[role] = RoleStatus(role, [])
897
1004
  roles[role].num_replicas += 1
1005
+
1006
+ # Pod name follows the pattern: {job_name}-{task_name}-0
1007
+ # Get the pod to retrieve its IP address
1008
+ pod_name_k8s = f"{name}-{task_name}-0"
1009
+ hostname = ""
1010
+ try:
1011
+ core_api = client.CoreV1Api(self._api_client())
1012
+ pod = core_api.read_namespaced_pod(
1013
+ name=pod_name_k8s, namespace=namespace
1014
+ )
1015
+ pod_ip = pod.status.pod_ip
1016
+
1017
+ if pod_ip is not None:
1018
+ # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
1019
+ pod_ip_dashed = pod_ip.replace(".", "-")
1020
+
1021
+ # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
1022
+ # Note: This will only be useful if the client using the IPs is in the cluster.
1023
+ hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
1024
+
1025
+ except ApiException:
1026
+ # Pod not found - hostname remains empty
1027
+ pass
1028
+
898
1029
  roles_statuses[role].replicas.append(
899
- ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
1030
+ ReplicaStatus(
1031
+ id=int(idx), role=role, state=state, hostname=hostname
1032
+ )
900
1033
  )
901
1034
  else:
902
1035
  app_state = AppState.UNKNOWN
@@ -940,7 +1073,10 @@ class KubernetesScheduler(
940
1073
  core_api = client.CoreV1Api(self._api_client())
941
1074
  if should_tail:
942
1075
  w = watch.Watch()
943
- iterator = w.stream(core_api.read_namespaced_pod_log, **args)
1076
+ iterator = (
1077
+ f"{line}\n"
1078
+ for line in w.stream(core_api.read_namespaced_pod_log, **args)
1079
+ )
944
1080
  else:
945
1081
  resp = core_api.read_namespaced_pod_log(**args)
946
1082
  iterator = split_lines(resp)
@@ -55,7 +55,6 @@ from torchx.schedulers.ids import make_unique
55
55
  from torchx.schedulers.streams import Tee
56
56
  from torchx.specs import AppDryRunInfo
57
57
  from torchx.specs.api import AppDef, AppState, is_terminal, macros, NONE, Role, runopts
58
-
59
58
  from torchx.util.types import none_throws
60
59
 
61
60
  log: logging.Logger = logging.getLogger(__name__)
@@ -529,7 +528,7 @@ def _register_termination_signals() -> None:
529
528
  signal.signal(signal.SIGINT, _terminate_process_handler)
530
529
 
531
530
 
532
- class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
531
+ class LocalScheduler(Scheduler[LocalOpts]):
533
532
  """
534
533
  Schedules on localhost. Containers are modeled as processes and
535
534
  certain properties of the container that are either not relevant
@@ -394,7 +394,7 @@ class LsfBsub:
394
394
  {self.materialize()}"""
395
395
 
396
396
 
397
- class LsfScheduler(Scheduler[LsfOpts, AppDef, AppDryRunInfo]):
397
+ class LsfScheduler(Scheduler[LsfOpts]):
398
398
  """
399
399
  **Example: hello_world**
400
400
 
@@ -135,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
135
135
  "comment",
136
136
  "mail-user",
137
137
  "mail-type",
138
+ "account",
138
139
  }
139
140
  SBATCH_GROUP_OPTIONS = {
140
141
  "partition",
@@ -159,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
159
160
  SlurmOpts = TypedDict(
160
161
  "SlurmOpts",
161
162
  {
163
+ "account": Optional[str],
162
164
  "partition": str,
163
165
  "time": str,
164
166
  "comment": Optional[str],
@@ -335,9 +337,7 @@ fi
335
337
  {self.materialize()}"""
336
338
 
337
339
 
338
- class SlurmScheduler(
339
- DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
340
- ):
340
+ class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
341
341
  """
342
342
  SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
343
343
  that slurm CLI tools are locally installed and job accounting is enabled.
@@ -406,6 +406,12 @@ class SlurmScheduler(
406
406
 
407
407
  def _run_opts(self) -> runopts:
408
408
  opts = runopts()
409
+ opts.add(
410
+ "account",
411
+ type_=str,
412
+ help="The account to use for the slurm job.",
413
+ default=None,
414
+ )
409
415
  opts.add(
410
416
  "partition",
411
417
  type_=str,
torchx/specs/__init__.py CHANGED
@@ -12,9 +12,8 @@ used by components to define the apps which can then be launched via a TorchX
12
12
  scheduler or pipeline adapter.
13
13
  """
14
14
  import difflib
15
-
16
15
  import os
17
- from typing import Callable, Dict, Mapping, Optional
16
+ from typing import Callable, Dict, Iterator, Mapping, Optional
18
17
 
19
18
  from torchx.specs.api import (
20
19
  ALL,
@@ -50,9 +49,7 @@ from torchx.specs.api import (
50
49
  Workspace,
51
50
  )
52
51
  from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
53
-
54
52
  from torchx.util.entrypoints import load_group
55
-
56
53
  from torchx.util.modules import import_attr
57
54
 
58
55
  GiB: int = 1024
@@ -113,8 +110,22 @@ class _NamedResourcesLibrary:
113
110
  def __contains__(self, key: str) -> bool:
114
111
  return key in _named_resource_factories
115
112
 
116
- def __iter__(self) -> None:
117
- raise NotImplementedError("named resources doesn't support iterating")
113
def __iter__(self) -> Iterator[str]:
    """Yields the name of each registered named resource, one at a time.

    Only the names (keys) are produced; pass a name to ``specs.resource(h=...)``
    to obtain the corresponding resource.

    Usage:

    .. doctest::

        from torchx import specs

        for resource_name in specs.named_resources:
            resource = specs.resource(h=resource_name)
            assert isinstance(resource, specs.Resource)

    """
    # delegate straight to the registry mapping, whose iteration yields its keys
    yield from _named_resource_factories
118
129
 
119
130
 
120
131
  named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
torchx/specs/api.py CHANGED
@@ -253,7 +253,9 @@ class macros:
253
253
  current_dict[k] = self.substitute(v)
254
254
  elif isinstance(v, list):
255
255
  for i in range(len(v)):
256
- if isinstance(v[i], str):
256
+ if isinstance(v[i], dict):
257
+ stack.append(v[i])
258
+ elif isinstance(v[i], str):
257
259
  v[i] = self.substitute(v[i])
258
260
  return d
259
261
 
torchx/specs/finder.py CHANGED
@@ -20,7 +20,6 @@ from types import ModuleType
20
20
  from typing import Callable, Dict, Generator, List, Optional, Union
21
21
 
22
22
  from torchx.specs import AppDef
23
-
24
23
  from torchx.specs.file_linter import (
25
24
  ComponentFunctionValidator,
26
25
  get_fn_docstring,
@@ -0,0 +1,106 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ """
10
+ Overlays are JSON structs applied to :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`
11
+ to specify attributes of the scheduler's submit-job request that are not currently representable
12
+ as attributes of :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`.
13
+
14
+ For end-users, here are a few use-cases of overlays:
15
+
16
+ 1. A new version of the scheduler has concepts/features that have not yet been added to TorchX.
17
+ 2. A bespoke internal scheduler has custom features that do not generalize and hence are not in TorchX.
18
+ 3. A pre-built ``AppDef`` is being re-used, but the resulting scheduler request needs a small change.
19
+
20
+ And for scheduler authors:
21
+
22
+ 1. Scheduler setting needs to be applied to a ``Role``, which makes it hard to add as ``runopts``
23
+ since ``runopts`` apply at the ``AppDef`` level.
24
+ 2. Scheduler setting cannot be represented naturally as the types supported by ``runopts``.
25
+ 3. Exposing the setting as a ``runopts`` obfuscates things.
26
+
27
+ See :py:func:`~torchx.specs.overlays.apply_overlay` for rules on how overlays are applied.
28
+ """
29
+
30
+ from typing import Any
31
+
32
Json = dict[str, Any]


def apply_overlay(base: Json, overlay: Json) -> None:
    """Applies ``overlay`` on ``base``.

    .. note:: this function mutates the ``base``! The ``overlay`` is left untouched:
              every value taken from it is deep-copied before being stored in ``base``,
              so the same overlay can safely be applied more than once (or to several bases).

    Overlays follow these rules:

    1. Dicts, upsert key, value in base with the ones in overlay.
    2. Nested dicts, overlay recursively.
    3. Lists, append the overlay values to the base values.
    4. Nested lists DO NOT append recursively.
    5. Primitives (bool, str, int, float), replace base with the value in overlay.

    Args:
        base: the JSON struct to update in-place.
        overlay: the JSON struct whose values are applied on top of ``base``.

    Raises:
        AssertionError: if a key exists in both ``base`` and ``overlay`` but the
            two values are not of the exact same type.

    .. doctest::

        from torchx.specs.overlays import apply_overlay

        base = {
            "scheduler": {"policy": "default"},
            "resources": {"limits": {"cpu": "500m"}},
            "tolerations": [{"key": "gpu"}],
            "nodeSelectorTerms": [
                [{"matchExpressions": []}]
            ],
            "maxPods": 110,
        }
        overlay = {
            "scheduler": {"policy": "binpacking"},
            "resources": {"limits": {"memory": "1Gi"}},
            "tolerations": [{"key": "spot"}],
            "nodeSelectorTerms": [
                [{"matchExpressions": [{"key": "disk"}]}]
            ],
            "maxPods": 250,
        }

        apply_overlay(base, overlay)

        assert {
            "scheduler": {"policy": "binpacking"},
            "resources": {"limits": {"cpu": "500m", "memory": "1Gi"}},
            "tolerations": [{"key": "gpu"}, {"key": "spot"}],
            "nodeSelectorTerms": [
                [{"matchExpressions": []}],
                [{"matchExpressions": [{"key": "disk"}]}],
            ],
            "maxPods": 250,
        } == base

    """
    # function-scope import so that this block does not depend on the module's import list
    import copy

    def assert_type_equal(key: str, o1: object, o2: object) -> None:
        o1_type = type(o1)
        o2_type = type(o2)
        if o1_type != o2_type:
            # raised explicitly instead of using the ``assert`` statement so that the
            # validation is not stripped when python runs with ``-O``
            raise AssertionError(
                f"Type mismatch for attr: `{key}`. {o1_type.__qualname__} != {o2_type.__qualname__}"
            )

    for key, overlay_value in overlay.items():
        if key not in base:
            # copy, otherwise base would alias overlay's containers and a later
            # apply_overlay() with the same overlay would mutate the overlay itself
            base[key] = copy.deepcopy(overlay_value)
            continue

        base_value = base[key]
        assert_type_equal(key, base_value, overlay_value)

        # types are known to be identical here, so checking base_value is sufficient
        if isinstance(base_value, dict):
            apply_overlay(base_value, overlay_value)
        elif isinstance(base_value, list):
            base_value.extend(copy.deepcopy(overlay_value))
        else:
            base[key] = copy.deepcopy(overlay_value)
torchx/tracker/api.py CHANGED
@@ -179,7 +179,7 @@ def _extract_tracker_name_and_config_from_environ() -> Mapping[str, Optional[str
179
179
 
180
180
 
181
181
  def build_trackers(
182
- factory_and_config: Mapping[str, Optional[str]]
182
+ factory_and_config: Mapping[str, Optional[str]],
183
183
  ) -> Iterable[TrackerBase]:
184
184
  trackers = []
185
185
 
@@ -16,7 +16,6 @@ from dataclasses import dataclass
16
16
  from typing import Any, Dict, Iterable, Mapping, Optional
17
17
 
18
18
  import fsspec
19
-
20
19
  from torchx.tracker.api import Lineage, TrackerArtifact, TrackerBase, TrackerSource
21
20
 
22
21
 
torchx/tracker/mlflow.py CHANGED
@@ -16,7 +16,6 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Sequence
16
16
  import mlflow
17
17
  from mlflow import MlflowClient
18
18
  from mlflow.entities import Experiment, Run
19
-
20
19
  from torchx.distributed import on_rank0_first
21
20
  from torchx.runner.config import get_configs
22
21
  from torchx.tracker.api import (
@@ -16,7 +16,6 @@ import tempfile
16
16
  from typing import Dict, IO, Iterable, Mapping, Optional, TextIO, Tuple, TYPE_CHECKING
17
17
 
18
18
  import fsspec
19
-
20
19
  import torchx
21
20
  from docker.errors import BuildError
22
21
  from torchx.specs import AppDef, CfgVal, Role, runopts
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: torchx-nightly
3
- Version: 2025.11.12
3
+ Version: 2026.1.22
4
4
  Summary: TorchX SDK and Components
5
5
  Home-page: https://github.com/meta-pytorch/torchx
6
6
  Author: TorchX Devs
@@ -47,7 +47,7 @@ Requires-Dist: pytest; extra == "dev"
47
47
  Requires-Dist: pytest-cov; extra == "dev"
48
48
  Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
49
49
  Requires-Dist: tensorboard==2.14.0; extra == "dev"
50
- Requires-Dist: sagemaker==2.230.0; extra == "dev"
50
+ Requires-Dist: sagemaker==2.237.3; extra == "dev"
51
51
  Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
52
52
  Requires-Dist: torch; extra == "dev"
53
53
  Requires-Dist: torchmetrics==1.6.3; extra == "dev"
@@ -14,17 +14,18 @@ torchx/cli/argparse_util.py,sha256=kZb1ubEHDrBsmrxpySFRQCW7wmHuRHD8eAInuEZjlsI,3
14
14
  torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
15
15
  torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
16
16
  torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
17
+ torchx/cli/cmd_delete.py,sha256=US1f6Jvyhz4R_0Q0a8GeNTDMrhzo8WE_ECcdOf0MjKE,835
17
18
  torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
18
- torchx/cli/cmd_list.py,sha256=alkS9aIaDI8lX3W8uj8Vtr3IU3G2VeCuokKSd3zOFug,1409
19
- torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
19
+ torchx/cli/cmd_list.py,sha256=deu920UTFJFTNVBdgSXhgMUpbJF4G9-xNny6XIqU4KA,1408
20
+ torchx/cli/cmd_log.py,sha256=nEzwVs1QwtrDFXtiAgPXnCu2YiBgVAtACIdpOSAYAU8,5474
20
21
  torchx/cli/cmd_run.py,sha256=z8wS-M2W9hHZfLkA6DFiV6Y0LFS9KfEBc_NTwAwdviQ,18780
21
22
  torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,1302
22
23
  torchx/cli/cmd_status.py,sha256=22IAEmKs0qkG6kJi83u9dRX2Q-ntT7yehVx7FxtY-vQ,2114
23
- torchx/cli/cmd_tracker.py,sha256=9gmOmYi-89qQRGQfSrXCTto7ve54_JKFqs_wa7oRUA8,5223
24
+ torchx/cli/cmd_tracker.py,sha256=pWVqXGUiwPE5_aWPCn_j-ov2EQkH2f0Xdv5DZW5U3Tg,5222
24
25
  torchx/cli/colors.py,sha256=yLMes7e_UoLAfhxE0W6edhc58t83UHAlnCN2ANPeuXw,568
25
- torchx/cli/main.py,sha256=1Jf2cnO6Y2W69Adt88avmNPVrL6ZR4Hkff6GVB4293k,3484
26
+ torchx/cli/main.py,sha256=1DJTmKdvPW_7hod8OUVT3Br2uwsZVEDU-2bTE0NJ0zY,3559
26
27
  torchx/components/__init__.py,sha256=JaVte0j9Gqi6IrjZKudJ2Kr3gkdHsvlCdRTo-zYpSRo,11815
27
- torchx/components/component_test_base.py,sha256=22iNSdVa_qTW3SMM30Pw5UEWlK4DZVw0C03EqYiaLOI,4150
28
+ torchx/components/component_test_base.py,sha256=2kIC7odZQwpsFRjdHW1m0_BY5Uh6IZlIOx0bWgLB_JI,4148
28
29
  torchx/components/dist.py,sha256=6DNPEvHVqEifmM8g1L7HVY169cQv_7tSfSlh3o6lTp4,14930
29
30
  torchx/components/interpret.py,sha256=g8gkKdDJvsBfX1ZrpVT7n2bMEtmwRV_1AqDyAnnQ_aA,697
30
31
  torchx/components/metrics.py,sha256=1gbp8BfzZWGa7PD1db5vRADlONzmae4qSBUUdCWayr0,2814
@@ -34,8 +35,8 @@ torchx/components/train.py,sha256=vtrQXRcD7bIcbb3lSeyD9BBlIe1mv1WNW6rnLK9R0Mw,12
34
35
  torchx/components/utils.py,sha256=IMjihhgs7nO67YtTetUBjN_CRpyIyyQsaJBkp7mpHfk,9368
35
36
  torchx/components/integration_tests/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
36
37
  torchx/components/integration_tests/component_provider.py,sha256=g-4ig1vtd5Vzgug0VAKRAFUt6KAV3TgQrBCrwRSJ7ZY,3981
37
- torchx/components/integration_tests/integ_tests.py,sha256=O8jd8Jq5O0mns7xzIFsHexBDHkIIAIfELQkWCzNPzRw,5165
38
- torchx/distributed/__init__.py,sha256=kh9YzDwWX7zFJJ8StR9qhMM2V3-66INs9i3ztDF-1ho,10252
38
+ torchx/components/integration_tests/integ_tests.py,sha256=JrOAauk4xbB3bB_yf8yZl69ddTESdacEf9JrMHcoaJU,5164
39
+ torchx/distributed/__init__.py,sha256=m0QXzwqpXyubk4g7JB79tHjT3Ab5JiVCQM7MRf5H9a0,10251
39
40
  torchx/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
41
  torchx/examples/torchx_out_of_sync_training.py,sha256=sXiI1G8aGsfuvxRdBszDgM8pSplqhgfXjRnAcgRwNGM,397
41
42
  torchx/examples/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -45,11 +46,11 @@ torchx/examples/apps/lightning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
45
46
  torchx/examples/apps/lightning/data.py,sha256=kSv_DFqtFVkNjZ46HT7GApImc9lMD7liy929dUrFWwM,6610
46
47
  torchx/examples/apps/lightning/interpret.py,sha256=Hd3kE5a6FyhxCmJBfTzb4Tlj518zhX8V0XvZfzu4nqE,5256
47
48
  torchx/examples/apps/lightning/model.py,sha256=4CgObWfANqDN9emYSdmCpbRe_V_Lef_Hd3M-yayDbZE,4045
48
- torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIowscx2ND_37zyw,1915
49
+ torchx/examples/apps/lightning/profiler.py,sha256=ogL3mO4YGPebdCFckkeHX3BzJD3niU189DCnrBEoBGI,1914
49
50
  torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
50
51
  torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
51
52
  torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
52
- torchx/runner/api.py,sha256=xQpgiUz9jCX4zZriubbWk4tTJRe7MxNJQK64g0o7KQ8,30438
53
+ torchx/runner/api.py,sha256=tN8087Hi7OHX1lVCmjccFgM1tcZwaxeJGMIvE4ZDrb4,30831
53
54
  torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
54
55
  torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
55
56
  torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -58,25 +59,26 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
58
59
  torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
59
60
  torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
60
61
  torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
61
- torchx/schedulers/api.py,sha256=smoUv1ocfqsBRmesXbz9i1F86zBOixZ8QHxYmI_MzgQ,14649
62
- torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
63
- torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
62
+ torchx/schedulers/api.py,sha256=wT9H_ZTmpTHHweevDJbkV7NKXfwileHrt1bbhhCgj3c,16488
63
+ torchx/schedulers/aws_batch_scheduler.py,sha256=tsQmeqEBLR_Zcm7jWbbZnoZ5TFvo9FHhEt00LgZAnzM,29412
64
+ torchx/schedulers/aws_sagemaker_scheduler.py,sha256=BRa85fqWcPK-B10cYMmm-CbJu0smxOsTXknAbOCfaYA,20880
64
65
  torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
65
- torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
66
- torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
67
- torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
68
- torchx/schedulers/kubernetes_scheduler.py,sha256=86ny9XXt9tdeV6Y7AlVFQ6vhxlviOdNeZUz4gOzU3cc,34478
69
- torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
70
- torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
71
- torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
66
+ torchx/schedulers/docker_scheduler.py,sha256=Kud3AIzQtMekgjlqcg1eNDb8kk29aPbGYOMAvPTZdhM,16840
67
+ torchx/schedulers/ids.py,sha256=8Qhf1Xqh845mwL-RXnWZXqIILNvml3z8udEXPFpyO7U,2247
68
+ torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=G2LZTNMEJRo34osBxMUScYXUG9fYi_Hak78-PH5cJUo,42894
69
+ torchx/schedulers/kubernetes_scheduler.py,sha256=kYO08hqVlZtNe_FZQP_e8WQk1P8-8SVkXZuY3Zm_Znk,39640
70
+ torchx/schedulers/local_scheduler.py,sha256=Ga5nZ6mxqBa8KcD32UAgZiY7-uhHXnBAIhwNHilhEkw,41891
71
+ torchx/schedulers/lsf_scheduler.py,sha256=vUvEJb02u7WI6y7DsWJxJFXNylRucU7FqkBX7xwLTak,17638
72
+ torchx/schedulers/slurm_scheduler.py,sha256=ipDVDtgfqgL6c35NyoJgSPuQFt8-AeXVXAnXJVvmzrc,32032
72
73
  torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
73
- torchx/specs/__init__.py,sha256=SXS4r_roOkbbAL-p7EY5fl5ou-AG7S9Ck-zKtRBdHOk,6760
74
- torchx/specs/api.py,sha256=OrLX4gGa97qtjUbl3x_YnOKCdP0rQkVEruPIbNjo7fk,49230
74
+ torchx/specs/__init__.py,sha256=tFvFg0uRwwZgZHiD3hfMDGlEpfu9SIZPWvCohEgqcvQ,7105
75
+ torchx/specs/api.py,sha256=7FdLFfadNWqXTLJ_EtP5t1uVS2Vc_4Gj5GLFoI628oE,49338
75
76
  torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
76
77
  torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
77
- torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
78
+ torchx/specs/finder.py,sha256=zBSjcywPO-BnYAUwG9EMi0_1UPBfEBNdA3C8WXz8KQU,17437
78
79
  torchx/specs/named_resources_aws.py,sha256=ZNAbw6lD8NUlMfcJ-LpX14dMSaHO7m4Yt9iHwAF44yg,11674
79
80
  torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
81
+ torchx/specs/overlays.py,sha256=HmY2yzC8ejgihviNWFT4rbYmP-gTcqpxVZTP6qBiIYM,3778
80
82
  torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
81
83
  torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
82
84
  torchx/specs/test/components/a/b/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -84,10 +86,10 @@ torchx/specs/test/components/a/b/c.py,sha256=FhixafzNqpS5zvggtWIWLxRd6HIxsOmct-d
84
86
  torchx/specs/test/components/c/__init__.py,sha256=5CBMckkpqJUdxBQBYHGSsItqq1gj2V0UiCw02Qfq6MM,246
85
87
  torchx/specs/test/components/c/d.py,sha256=2AjE-FmQXJTw3hws66O83ToQPmjOEZLDf-jDAKrrUkQ,546
86
88
  torchx/tracker/__init__.py,sha256=qo39aOa0Dz9zt4TtFkqPeIaH7MNqdAkFlGaOFiDLXTI,4375
87
- torchx/tracker/api.py,sha256=WZ7TYdbSVx_5h5MlX9EwQLRpxmIf0oKdiQwQ0zvkO3o,11262
88
- torchx/tracker/mlflow.py,sha256=poeoIXVPzr2sxgi515fMGRH83KAFNL6XFILMh0EQ2Dw,14487
89
+ torchx/tracker/api.py,sha256=4rteINX8ZMv_03t75qOPU-rP3YeIPmm6N1HX9t8lVQg,11263
90
+ torchx/tracker/mlflow.py,sha256=arl70oNw76VNIpf_gEP5p7A7OnDQVIfWZDEyImuM_Gc,14486
89
91
  torchx/tracker/backend/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
90
- torchx/tracker/backend/fsspec.py,sha256=528xKryBE27Rm_OHD7r2R6fmVAclknBtoy1s034Ny6c,10440
92
+ torchx/tracker/backend/fsspec.py,sha256=1lJ1SoaTXl8ajvIJtp9pUmQgTRw7nF5D0Hv3susfYmE,10439
91
93
  torchx/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
94
  torchx/util/cuda.py,sha256=-ZTa1WCLnY2WtSWAdWufLQqZSDCZfZsloBuiS84LIkU,1099
93
95
  torchx/util/datetime.py,sha256=hV6Sg0u5KTBe68yrmy_RGCC5su0i4Tb_mAYphWamiXI,405
@@ -102,10 +104,10 @@ torchx/util/types.py,sha256=E9dxAWQnsJkIDuHtg-poeOJ4etucSI_xP_Z5kNJX8uI,9229
102
104
  torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,798
103
105
  torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
104
106
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
105
- torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
106
- torchx_nightly-2025.11.12.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
107
- torchx_nightly-2025.11.12.dist-info/METADATA,sha256=Wg2n6bsPSMaU-WZzo1y7uTF_sPQNWCjP8yu5-to3ihA,5324
108
- torchx_nightly-2025.11.12.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
109
- torchx_nightly-2025.11.12.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
110
- torchx_nightly-2025.11.12.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
111
- torchx_nightly-2025.11.12.dist-info/RECORD,,
107
+ torchx/workspace/docker_workspace.py,sha256=EkiveycTlCYPzrkkoqL2EXNFZSUc3015RgTQY-7a3iU,10268
108
+ torchx_nightly-2026.1.22.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
109
+ torchx_nightly-2026.1.22.dist-info/METADATA,sha256=D169Ar4bVrkBHjTLDbenoqIyqBEiqxoDtDc59YXl4N8,5323
110
+ torchx_nightly-2026.1.22.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
111
+ torchx_nightly-2026.1.22.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
112
+ torchx_nightly-2026.1.22.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
113
+ torchx_nightly-2026.1.22.dist-info/RECORD,,