torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +102 -81
- torchx/runner/config.py +3 -1
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +66 -25
- torchx/schedulers/aws_batch_scheduler.py +47 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +4 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +355 -36
- torchx/schedulers/local_scheduler.py +2 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +102 -27
- torchx/specs/__init__.py +40 -9
- torchx/specs/api.py +222 -12
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/specs/overlays.py +106 -0
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
 
 
+def get_appstate_from_job(job: dict[str, object]) -> AppState:
+    # Prior to slurm-23.11, job_state was a string and not a list
+    job_state = job.get("job_state", None)
+    if isinstance(job_state, list):
+        return appstate_from_slurm_state(job_state[0])
+    else:
+        return appstate_from_slurm_state(str(job_state))
+
+
 def version() -> Tuple[int, int]:
     """
     Uses ``sinfo --version`` to get the slurm version. If the command fails, it
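The helper above exists because slurm 23.11 changed the type of `job_state` in the JSON job payload: older releases return a plain string, newer ones a list of states. A minimal sketch of the two payload shapes it accepts (sample values are illustrative, not captured from a real cluster, and assume the usual "RUNNING" entry in SLURM_STATES):

    from torchx.schedulers.slurm_scheduler import get_appstate_from_job
    from torchx.specs.api import AppState

    old_style = {"job_id": 1234, "job_state": "RUNNING"}    # slurm < 23.11
    new_style = {"job_id": 1234, "job_state": ["RUNNING"]}  # slurm >= 23.11

    assert get_appstate_from_job(old_style) == AppState.RUNNING
    assert get_appstate_from_job(new_style) == AppState.RUNNING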
@@ -126,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
     "mail-type",
+    "account",
 }
 SBATCH_GROUP_OPTIONS = {
     "partition",
@@ -150,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
 SlurmOpts = TypedDict(
     "SlurmOpts",
     {
+        "account": Optional[str],
         "partition": str,
         "time": str,
         "comment": Optional[str],
@@ -210,6 +221,7 @@ class SlurmReplicaRequest:
             sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
         else:
             sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
+        sbatch_opts.setdefault("ntasks", "1")
 
         srun_opts = {
             "output": f"slurm-{macros.app_id}-{name}.out",
@@ -325,9 +337,7 @@ fi
 {self.materialize()}"""
 
 
-class SlurmScheduler(
-    DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
-):
+class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
     """
     SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
     that slurm CLI tools are locally installed and job accounting is enabled.
@@ -396,6 +406,12 @@ class SlurmScheduler(
 
     def _run_opts(self) -> runopts:
         opts = runopts()
+        opts.add(
+            "account",
+            type_=str,
+            help="The account to use for the slurm job.",
+            default=None,
+        )
         opts.add(
             "partition",
             type_=str,
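Together with the `account` entries added to SBATCH_JOB_OPTIONS and SlurmOpts above, this run option lets a Slurm accounting string flow through the scheduler config down to the generated sbatch request. A sketch of passing it via the Python runner, assuming the documented `get_runner`/`dryrun` API and the builtin `utils.echo` component keep their usual shape (all values are placeholders):

    from torchx.components.utils import echo
    from torchx.runner import get_runner

    app = echo(msg="hello slurm")
    runner = get_runner()
    # the rendered request should now carry --account alongside --partition
    dryrun_info = runner.dryrun(
        app, "slurm", cfg={"partition": "gpu", "account": "my_project"}
    )
    print(dryrun_info)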
@@ -569,6 +585,8 @@ class SlurmScheduler(
         return self._describe_sacct(app_id)
 
     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: Handles multiple job ID formats due to SLURM version differences.
+        # Different clusters use heterogeneous (+) vs regular (.) job ID formats.
         try:
             output = subprocess.check_output(
                 ["sacct", "--parsable2", "-j", app_id],
@@ -593,15 +611,27 @@ class SlurmScheduler(
             msg = ""
             app_state = AppState.UNKNOWN
             for row in reader:
-
+                # Handle both "+" (heterogeneous) and "." (regular) job ID formats
+                job_id_full = row["JobID"]
+
+                # Split on both "+" and "." to handle different SLURM configurations
+                if "+" in job_id_full:
+                    job_id, *parts = job_id_full.split("+")
+                    is_subjob = len(parts) > 0 and "." in parts[0]
+                else:
+                    job_id, *parts = job_id_full.split(".")
+                    is_subjob = len(parts) > 0
+
                 if job_id != app_id:
                     continue
-
-
+
+                if is_subjob:
+                    # we only care about the main job not the child jobs (.batch, .0, etc.)
                     continue
 
-
-
+                msg = row["State"]
+                # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
+                state = msg.split()[0].rstrip("+")
                 app_state = appstate_from_slurm_state(state)
 
                 role, _, replica_id = row["JobName"].rpartition("-")
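A standalone sketch of the same JobID-splitting rule applied to the ID strings `sacct --parsable2` typically emits; the sample IDs are illustrative and the helper name is invented for this example:

    def parse_sacct_job_id(job_id_full: str) -> tuple:
        # "+" separates heterogeneous job components, "." separates child steps
        if "+" in job_id_full:
            job_id, *parts = job_id_full.split("+")
            is_subjob = len(parts) > 0 and "." in parts[0]
        else:
            job_id, *parts = job_id_full.split(".")
            is_subjob = len(parts) > 0
        return job_id, is_subjob

    assert parse_sacct_job_id("1234") == ("1234", False)         # main job
    assert parse_sacct_job_id("1234.batch") == ("1234", True)    # child step
    assert parse_sacct_job_id("1234+0") == ("1234", False)       # het component
    assert parse_sacct_job_id("1234+0.batch") == ("1234", True)  # het child step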
@@ -628,6 +658,9 @@ class SlurmScheduler(
         )
 
     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: This method contains multiple compatibility checks for different SLURM versions
+        # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
+
         # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
         # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
         output = subprocess.check_output(
@@ -648,7 +681,7 @@ class SlurmScheduler(
 
         entrypoint = job["command"]
         image = job["current_working_directory"]
-        state =
+        state = get_appstate_from_job(job)
 
         job_resources = job["job_resources"]
 
@@ -669,7 +702,18 @@ class SlurmScheduler(
             if state == AppState.PENDING:
                 # NOTE: torchx launched jobs points to exactly one host
                 # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
-
+
+                # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
+                if job_resources is not None:
+                    hostname = job_resources.get("scheduled_nodes", "")
+                    # If scheduled_nodes not found in job_resources, try nodes.list
+                    if not hostname and "nodes" in job_resources:
+                        nodes_info = job_resources.get("nodes", {})
+                        if isinstance(nodes_info, dict):
+                            hostname = nodes_info.get("list", "")
+                else:
+                    # For pending jobs where job_resources is None, check top-level fields
+                    hostname = job.get("nodes", "") or job.get("scheduled_nodes", "")
 
                 role.num_replicas += 1
                 role_status.replicas.append(
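A condensed sketch of the hostname fallback chain for pending jobs, run against illustrative `squeue --json` fragments (field values are made up and the helper is written only to mirror the lookups in the hunk above):

    def pending_hostname(job: dict) -> str:
        job_resources = job.get("job_resources")
        if job_resources is not None:
            hostname = job_resources.get("scheduled_nodes", "")
            if not hostname and isinstance(job_resources.get("nodes"), dict):
                hostname = job_resources["nodes"].get("list", "")
            return hostname
        # SLURM 24.11.5+: job_resources is None for pending jobs
        return job.get("nodes", "") or job.get("scheduled_nodes", "")

    assert pending_hostname({"job_resources": {"scheduled_nodes": "node001"}}) == "node001"
    assert pending_hostname({"job_resources": {"nodes": {"list": "node002"}}}) == "node002"
    assert pending_hostname({"job_resources": None, "scheduled_nodes": "node003"}) == "node003"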
@@ -685,24 +729,35 @@ class SlurmScheduler(
                 # where each replica is a "sub-job" so `allocated_nodes` will always be 1
                 # but we deal with jobs that have not been launched with torchx
                 # which can have multiple hosts per sub-job (count them as replicas)
-
+                nodes_data = job_resources.get("nodes", {})
+
+                # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
+                if "allocation" in nodes_data and isinstance(
+                    nodes_data["allocation"], list
+                ):
+                    # SLURM 24.11+ format: nodes.allocation is a list
+                    for node_info in nodes_data["allocation"]:
+                        hostname = node_info["name"]
+                        cpu = int(node_info["cpus"]["used"])
+                        memMB = (
+                            int(node_info["memory"]["allocated"]) // 1024
+                        )  # Convert to MB
 
-
-
-
-
-
-
-
-
-
-                            role=role_name,
-                            state=state,
-                            hostname=hostname,
+                        role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                        role.num_replicas += 1
+                        role_status.replicas.append(
+                            ReplicaStatus(
+                                id=int(replica_id),
+                                role=role_name,
+                                state=state,
+                                hostname=hostname,
+                            )
                         )
-
-
-
+                elif "allocated_nodes" in job_resources and isinstance(
+                    job_resources["allocated_nodes"], list
+                ):
+                    # Legacy format: allocated_nodes is a list
+                    for node_info in job_resources["allocated_nodes"]:
                         # NOTE: we expect resource specs for all the nodes to be the same
                         # NOTE: use allocated (not used/requested) memory since
                         # users may only specify --cpu, in which case slurm
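The branch dispatch above boils down to which key carries the per-node allocation. An illustrative sketch of the two payload shapes (field values are made up, and only the 24.11+ per-node layout is spelled out since that is what the hunk shows):

    # SLURM 24.11+: per-node data lives under nodes.allocation
    job_resources_new = {
        "nodes": {
            "allocation": [
                {"name": "node001", "cpus": {"used": 8}, "memory": {"allocated": 65536}},
            ],
        },
    }

    # pre-24.11: per-node data lives under allocated_nodes (per-node dicts omitted;
    # their field layout differs from the 24.11+ format)
    job_resources_legacy = {"allocated_nodes": []}

    def uses_new_format(job_resources: dict) -> bool:
        nodes_data = job_resources.get("nodes", {})
        return isinstance(nodes_data, dict) and isinstance(nodes_data.get("allocation"), list)

    assert uses_new_format(job_resources_new)
    assert not uses_new_format(job_resources_legacy)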
@@ -821,7 +896,7 @@ class SlurmScheduler(
             out.append(
                 ListAppResponse(
                     app_id=str(job["job_id"]),
-                    state=
+                    state=get_appstate_from_job(job),
                     name=job["name"],
                 )
             )
torchx/specs/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -13,7 +12,9 @@ used by components to define the apps which can then be launched via a TorchX
 scheduler or pipeline adapter.
 """
 import difflib
-
+
+import os
+from typing import Callable, Dict, Iterator, Mapping, Optional
 
 from torchx.specs.api import (
     ALL,
@@ -42,9 +43,11 @@ from torchx.specs.api import (
     RoleStatus,
     runopt,
     runopts,
+    TORCHX_HOME,
     UnknownAppException,
     UnknownSchedulerException,
     VolumeMount,
+    Workspace,
 )
 from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
 
@@ -52,14 +55,22 @@ from torchx.util.entrypoints import load_group
 
 from torchx.util.modules import import_attr
 
-
+GiB: int = 1024
+
+
+ResourceFactory = Callable[[], Resource]
+
+AWS_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
     "torchx.specs.named_resources_aws", "NAMED_RESOURCES", default={}
 )
-GENERIC_NAMED_RESOURCES: Mapping[str,
+GENERIC_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
     "torchx.specs.named_resources_generic", "NAMED_RESOURCES", default={}
 )
-
-
+CUSTOM_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
+    os.environ.get("TORCHX_CUSTOM_NAMED_RESOURCES", "torchx.specs.fb.named_resources"),
+    "NAMED_RESOURCES",
+    default={},
+)
 
 
 def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
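The new CUSTOM_NAMED_RESOURCES hook loads a NAMED_RESOURCES mapping from whatever module the TORCHX_CUSTOM_NAMED_RESOURCES environment variable names, falling back to the internal torchx.specs.fb.named_resources and then to an empty mapping if the import fails. A hypothetical provider module, with the module path and resource shape invented for illustration:

    # my_org/torchx_resources.py (hypothetical module)
    from torchx.specs import Resource

    def a100_x8() -> Resource:
        return Resource(cpu=96, gpu=8, memMB=1024 * 1024)

    NAMED_RESOURCES = {
        "a100_x8": a100_x8,
    }

Exporting TORCHX_CUSTOM_NAMED_RESOURCES=my_org.torchx_resources before torchx.specs is imported should then make specs.resource(h="a100_x8") resolve to this entry, since _load_named_resources() merges CUSTOM_NAMED_RESOURCES into the registry (see the next hunk).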
@@ -69,6 +80,7 @@ def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
     for name, resource in {
         **GENERIC_NAMED_RESOURCES,
         **AWS_NAMED_RESOURCES,
+        **CUSTOM_NAMED_RESOURCES,
         **resource_methods,
     }.items():
         materialized_resources[name] = resource
@@ -101,8 +113,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories
 
-    def __iter__(self) ->
-
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx import specs
+
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+
+        """
+        for key in _named_resource_factories:
+            yield (key)
 
 
 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
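With `__iter__` in place the registry is directly enumerable; a small sketch building on the docstring's usage example:

    from torchx import specs

    # print the capacity of every registered named resource
    for name in specs.named_resources:
        r = specs.resource(h=name)
        print(f"{name}: cpu={r.cpu} gpu={r.gpu} memMB={r.memMB}")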
@@ -122,7 +148,7 @@ def resource(
 
     If ``h`` is specified then it is used to look up the
     resource specs from the list of registered named resources.
-    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
+    See `registering named resource <https://meta-pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
 
     Otherwise a ``Resource`` object is created from the raw resource specs.
 
@@ -225,5 +251,10 @@ __all__ = [
     "make_app_handle",
     "materialize_appdef",
     "parse_mounts",
+    "torchx_run_args_from_argparse",
+    "torchx_run_args_from_json",
+    "TorchXRunArgs",
     "ALL",
+    "TORCHX_HOME",
+    "Workspace",
 ]