torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_delete.py +30 -0
  3. torchx/cli/cmd_list.py +1 -2
  4. torchx/cli/cmd_run.py +202 -28
  5. torchx/cli/cmd_tracker.py +1 -1
  6. torchx/cli/main.py +2 -0
  7. torchx/components/__init__.py +1 -8
  8. torchx/components/dist.py +9 -3
  9. torchx/components/integration_tests/component_provider.py +2 -2
  10. torchx/components/utils.py +1 -1
  11. torchx/distributed/__init__.py +1 -1
  12. torchx/runner/api.py +102 -81
  13. torchx/runner/config.py +3 -1
  14. torchx/runner/events/__init__.py +20 -10
  15. torchx/runner/events/api.py +1 -1
  16. torchx/schedulers/__init__.py +7 -10
  17. torchx/schedulers/api.py +66 -25
  18. torchx/schedulers/aws_batch_scheduler.py +47 -6
  19. torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
  20. torchx/schedulers/docker_scheduler.py +4 -3
  21. torchx/schedulers/ids.py +27 -23
  22. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
  23. torchx/schedulers/kubernetes_scheduler.py +355 -36
  24. torchx/schedulers/local_scheduler.py +2 -1
  25. torchx/schedulers/lsf_scheduler.py +1 -1
  26. torchx/schedulers/slurm_scheduler.py +102 -27
  27. torchx/specs/__init__.py +40 -9
  28. torchx/specs/api.py +222 -12
  29. torchx/specs/builders.py +109 -28
  30. torchx/specs/file_linter.py +117 -53
  31. torchx/specs/finder.py +25 -37
  32. torchx/specs/named_resources_aws.py +13 -2
  33. torchx/specs/overlays.py +106 -0
  34. torchx/tracker/__init__.py +2 -2
  35. torchx/tracker/api.py +1 -1
  36. torchx/util/entrypoints.py +1 -6
  37. torchx/util/strings.py +1 -1
  38. torchx/util/types.py +12 -1
  39. torchx/version.py +2 -2
  40. torchx/workspace/api.py +102 -5
  41. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
  42. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
  43. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
  44. torchx/examples/pipelines/__init__.py +0 -0
  45. torchx/examples/pipelines/kfp/__init__.py +0 -0
  46. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  47. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  48. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  49. torchx/pipelines/kfp/__init__.py +0 -30
  50. torchx/pipelines/kfp/adapter.py +0 -274
  51. torchx/pipelines/kfp/version.py +0 -19
  52. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  53. torchx/schedulers/ray/ray_common.py +0 -22
  54. torchx/schedulers/ray/ray_driver.py +0 -307
  55. torchx/schedulers/ray_scheduler.py +0 -454
  56. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
  57. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
  58. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
73
73
  return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
74
74
 
75
75
 
76
+ def get_appstate_from_job(job: dict[str, object]) -> AppState:
77
+ # Prior to slurm-23.11, job_state was a string and not a list
78
+ job_state = job.get("job_state", None)
79
+ if isinstance(job_state, list):
80
+ return appstate_from_slurm_state(job_state[0])
81
+ else:
82
+ return appstate_from_slurm_state(str(job_state))
83
+
84
+
76
85
  def version() -> Tuple[int, int]:
77
86
  """
78
87
  Uses ``sinfo --version`` to get the slurm version. If the command fails, it
@@ -126,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
126
135
  "comment",
127
136
  "mail-user",
128
137
  "mail-type",
138
+ "account",
129
139
  }
130
140
  SBATCH_GROUP_OPTIONS = {
131
141
  "partition",
@@ -150,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
150
160
  SlurmOpts = TypedDict(
151
161
  "SlurmOpts",
152
162
  {
163
+ "account": Optional[str],
153
164
  "partition": str,
154
165
  "time": str,
155
166
  "comment": Optional[str],
@@ -210,6 +221,7 @@ class SlurmReplicaRequest:
210
221
  sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
211
222
  else:
212
223
  sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
224
+ sbatch_opts.setdefault("ntasks", "1")
213
225
 
214
226
  srun_opts = {
215
227
  "output": f"slurm-{macros.app_id}-{name}.out",
@@ -325,9 +337,7 @@ fi
325
337
  {self.materialize()}"""
326
338
 
327
339
 
328
- class SlurmScheduler(
329
- DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
330
- ):
340
+ class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
331
341
  """
332
342
  SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
333
343
  that slurm CLI tools are locally installed and job accounting is enabled.
@@ -396,6 +406,12 @@ class SlurmScheduler(
396
406
 
397
407
  def _run_opts(self) -> runopts:
398
408
  opts = runopts()
409
+ opts.add(
410
+ "account",
411
+ type_=str,
412
+ help="The account to use for the slurm job.",
413
+ default=None,
414
+ )
399
415
  opts.add(
400
416
  "partition",
401
417
  type_=str,
@@ -569,6 +585,8 @@ class SlurmScheduler(
569
585
  return self._describe_sacct(app_id)
570
586
 
571
587
  def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
588
+ # NOTE: Handles multiple job ID formats due to SLURM version differences.
589
+ # Different clusters use heterogeneous (+) vs regular (.) job ID formats.
572
590
  try:
573
591
  output = subprocess.check_output(
574
592
  ["sacct", "--parsable2", "-j", app_id],
@@ -593,15 +611,27 @@ class SlurmScheduler(
593
611
  msg = ""
594
612
  app_state = AppState.UNKNOWN
595
613
  for row in reader:
596
- job_id, *parts = row["JobID"].split("+")
614
+ # Handle both "+" (heterogeneous) and "." (regular) job ID formats
615
+ job_id_full = row["JobID"]
616
+
617
+ # Split on both "+" and "." to handle different SLURM configurations
618
+ if "+" in job_id_full:
619
+ job_id, *parts = job_id_full.split("+")
620
+ is_subjob = len(parts) > 0 and "." in parts[0]
621
+ else:
622
+ job_id, *parts = job_id_full.split(".")
623
+ is_subjob = len(parts) > 0
624
+
597
625
  if job_id != app_id:
598
626
  continue
599
- if len(parts) > 0 and "." in parts[0]:
600
- # we only care about the worker not the child jobs
627
+
628
+ if is_subjob:
629
+ # we only care about the main job not the child jobs (.batch, .0, etc.)
601
630
  continue
602
631
 
603
- state = row["State"]
604
- msg = state
632
+ msg = row["State"]
633
+ # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
634
+ state = msg.split()[0].rstrip("+")
605
635
  app_state = appstate_from_slurm_state(state)
606
636
 
607
637
  role, _, replica_id = row["JobName"].rpartition("-")
@@ -628,6 +658,9 @@ class SlurmScheduler(
628
658
  )
629
659
 
630
660
  def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
661
+ # NOTE: This method contains multiple compatibility checks for different SLURM versions
662
+ # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
663
+
631
664
  # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
632
665
  # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
633
666
  output = subprocess.check_output(
@@ -648,7 +681,7 @@ class SlurmScheduler(
648
681
 
649
682
  entrypoint = job["command"]
650
683
  image = job["current_working_directory"]
651
- state = appstate_from_slurm_state(job["job_state"][0])
684
+ state = get_appstate_from_job(job)
652
685
 
653
686
  job_resources = job["job_resources"]
654
687
 
@@ -669,7 +702,18 @@ class SlurmScheduler(
669
702
  if state == AppState.PENDING:
670
703
  # NOTE: torchx launched jobs points to exactly one host
671
704
  # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
672
- hostname = job_resources.get("scheduled_nodes", "")
705
+
706
+ # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
707
+ if job_resources is not None:
708
+ hostname = job_resources.get("scheduled_nodes", "")
709
+ # If scheduled_nodes not found in job_resources, try nodes.list
710
+ if not hostname and "nodes" in job_resources:
711
+ nodes_info = job_resources.get("nodes", {})
712
+ if isinstance(nodes_info, dict):
713
+ hostname = nodes_info.get("list", "")
714
+ else:
715
+ # For pending jobs where job_resources is None, check top-level fields
716
+ hostname = job.get("nodes", "") or job.get("scheduled_nodes", "")
673
717
 
674
718
  role.num_replicas += 1
675
719
  role_status.replicas.append(
@@ -685,24 +729,35 @@ class SlurmScheduler(
685
729
  # where each replica is a "sub-job" so `allocated_nodes` will always be 1
686
730
  # but we deal with jobs that have not been launched with torchx
687
731
  # which can have multiple hosts per sub-job (count them as replicas)
688
- node_infos = job_resources.get("allocated_nodes", [])
732
+ nodes_data = job_resources.get("nodes", {})
733
+
734
+ # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
735
+ if "allocation" in nodes_data and isinstance(
736
+ nodes_data["allocation"], list
737
+ ):
738
+ # SLURM 24.11+ format: nodes.allocation is a list
739
+ for node_info in nodes_data["allocation"]:
740
+ hostname = node_info["name"]
741
+ cpu = int(node_info["cpus"]["used"])
742
+ memMB = (
743
+ int(node_info["memory"]["allocated"]) // 1024
744
+ ) # Convert to MB
689
745
 
690
- if not isinstance(node_infos, list):
691
- # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes
692
- # is not a list of individual nodes, but a map of the nodelist specs
693
- # in this case just use jobs[].job_resources.nodes
694
- hostname = job_resources.get("nodes")
695
- role.num_replicas += 1
696
- role_status.replicas.append(
697
- ReplicaStatus(
698
- id=int(replica_id),
699
- role=role_name,
700
- state=state,
701
- hostname=hostname,
746
+ role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
747
+ role.num_replicas += 1
748
+ role_status.replicas.append(
749
+ ReplicaStatus(
750
+ id=int(replica_id),
751
+ role=role_name,
752
+ state=state,
753
+ hostname=hostname,
754
+ )
702
755
  )
703
- )
704
- else:
705
- for node_info in node_infos:
756
+ elif "allocated_nodes" in job_resources and isinstance(
757
+ job_resources["allocated_nodes"], list
758
+ ):
759
+ # Legacy format: allocated_nodes is a list
760
+ for node_info in job_resources["allocated_nodes"]:
706
761
  # NOTE: we expect resource specs for all the nodes to be the same
707
762
  # NOTE: use allocated (not used/requested) memory since
708
763
  # users may only specify --cpu, in which case slurm
@@ -725,6 +780,26 @@ class SlurmScheduler(
725
780
  hostname=hostname,
726
781
  )
727
782
  )
783
+ else:
784
+ # Fallback: use hostname from nodes.list
785
+ if isinstance(nodes_data, str):
786
+ hostname = nodes_data
787
+ else:
788
+ hostname = (
789
+ nodes_data.get("list", "")
790
+ if isinstance(nodes_data, dict)
791
+ else ""
792
+ )
793
+
794
+ role.num_replicas += 1
795
+ role_status.replicas.append(
796
+ ReplicaStatus(
797
+ id=int(replica_id),
798
+ role=role_name,
799
+ state=state,
800
+ hostname=hostname,
801
+ )
802
+ )
728
803
 
729
804
  return DescribeAppResponse(
730
805
  app_id=app_id,
@@ -821,7 +896,7 @@ class SlurmScheduler(
821
896
  out.append(
822
897
  ListAppResponse(
823
898
  app_id=str(job["job_id"]),
824
- state=SLURM_STATES[job["job_state"][0]],
899
+ state=get_appstate_from_job(job),
825
900
  name=job["name"],
826
901
  )
827
902
  )
torchx/specs/__init__.py CHANGED
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  # Copyright (c) Meta Platforms, Inc. and affiliates.
3
2
  # All rights reserved.
4
3
  #
@@ -13,7 +12,9 @@ used by components to define the apps which can then be launched via a TorchX
13
12
  scheduler or pipeline adapter.
14
13
  """
15
14
  import difflib
16
- from typing import Callable, Dict, Mapping, Optional
15
+
16
+ import os
17
+ from typing import Callable, Dict, Iterator, Mapping, Optional
17
18
 
18
19
  from torchx.specs.api import (
19
20
  ALL,
@@ -42,9 +43,11 @@ from torchx.specs.api import (
42
43
  RoleStatus,
43
44
  runopt,
44
45
  runopts,
46
+ TORCHX_HOME,
45
47
  UnknownAppException,
46
48
  UnknownSchedulerException,
47
49
  VolumeMount,
50
+ Workspace,
48
51
  )
49
52
  from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
50
53
 
@@ -52,14 +55,22 @@ from torchx.util.entrypoints import load_group
52
55
 
53
56
  from torchx.util.modules import import_attr
54
57
 
55
- AWS_NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = import_attr(
58
+ GiB: int = 1024
59
+
60
+
61
+ ResourceFactory = Callable[[], Resource]
62
+
63
+ AWS_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
56
64
  "torchx.specs.named_resources_aws", "NAMED_RESOURCES", default={}
57
65
  )
58
- GENERIC_NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = import_attr(
66
+ GENERIC_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
59
67
  "torchx.specs.named_resources_generic", "NAMED_RESOURCES", default={}
60
68
  )
61
-
62
- GiB: int = 1024
69
+ CUSTOM_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
70
+ os.environ.get("TORCHX_CUSTOM_NAMED_RESOURCES", "torchx.specs.fb.named_resources"),
71
+ "NAMED_RESOURCES",
72
+ default={},
73
+ )
63
74
 
64
75
 
65
76
  def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
@@ -69,6 +80,7 @@ def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
69
80
  for name, resource in {
70
81
  **GENERIC_NAMED_RESOURCES,
71
82
  **AWS_NAMED_RESOURCES,
83
+ **CUSTOM_NAMED_RESOURCES,
72
84
  **resource_methods,
73
85
  }.items():
74
86
  materialized_resources[name] = resource
@@ -101,8 +113,22 @@ class _NamedResourcesLibrary:
101
113
  def __contains__(self, key: str) -> bool:
102
114
  return key in _named_resource_factories
103
115
 
104
- def __iter__(self) -> None:
105
- raise NotImplementedError("named resources doesn't support iterating")
116
+ def __iter__(self) -> Iterator[str]:
117
+ """Iterates through the names of the registered named_resources.
118
+
119
+ Usage:
120
+
121
+ .. doctest::
122
+
123
+ from torchx import specs
124
+
125
+ for resource_name in specs.named_resources:
126
+ resource = specs.resource(h=resource_name)
127
+ assert isinstance(resource, specs.Resource)
128
+
129
+ """
130
+ for key in _named_resource_factories:
131
+ yield (key)
106
132
 
107
133
 
108
134
  named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
@@ -122,7 +148,7 @@ def resource(
122
148
 
123
149
  If ``h`` is specified then it is used to look up the
124
150
  resource specs from the list of registered named resources.
125
- See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
151
+ See `registering named resource <https://meta-pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
126
152
 
127
153
  Otherwise a ``Resource`` object is created from the raw resource specs.
128
154
 
@@ -225,5 +251,10 @@ __all__ = [
225
251
  "make_app_handle",
226
252
  "materialize_appdef",
227
253
  "parse_mounts",
254
+ "torchx_run_args_from_argparse",
255
+ "torchx_run_args_from_json",
256
+ "TorchXRunArgs",
228
257
  "ALL",
258
+ "TORCHX_HOME",
259
+ "Workspace",
229
260
  ]