torchx-nightly 2025.10.1__py3-none-any.whl → 2025.10.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.

@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"
 
 ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"
 
+ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
+
 DEFAULT_ROLE_NAME = "node"
 
 TAG_TORCHX_VER = "torchx.pytorch.org/version"
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
 TAG_TORCHX_USER = "torchx.pytorch.org/user"
 
 
+def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
+    """
+    Parse ulimit string in format: name:softLimit:hardLimit
+    Multiple ulimits separated by commas.
+    """
+    if not ulimits_list:
+        return []
+
+    ulimits = []
+    for ulimit_str in ulimits_list:
+        if not ulimit_str.strip():
+            continue
+
+        parts = ulimit_str.strip().split(":")
+        if len(parts) != 3:
+            raise ValueError(
+                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
+            )
+
+        name, soft_limit, hard_limit = parts
+        ulimits.append(
+            {
+                "name": name,
+                "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
+                "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
+            }
+        )
+
+    return ulimits
+
+
 if TYPE_CHECKING:
     from docker import DockerClient
 
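
A quick behavior sketch of the new helper (not part of the release artifact; the import path is inferred from the aws_batch_scheduler.py entry whose hash changes in RECORD below):

# Sketch only: exercises parse_ulimits exactly as defined in the hunk above.
from torchx.schedulers.aws_batch_scheduler import parse_ulimits  # inferred module path

print(parse_ulimits(["nofile:1024:65535"]))
# [{'name': 'nofile', 'softLimit': 1024, 'hardLimit': 65535}]
print(parse_ulimits(["core:-1:-1", "nproc:4096:4096"]))
# [{'name': 'core', 'softLimit': -1, 'hardLimit': -1},
#  {'name': 'nproc', 'softLimit': 4096, 'hardLimit': 4096}]
# Entries that are not in name:softLimit:hardLimit form raise ValueError.
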
@@ -177,7 +210,8 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
-) -> Dict[str, object]:
+    ulimits: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
     role.mounts += get_device_mounts(role.resource.devices)
 
     mount_points = []
@@ -239,6 +273,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
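
The conditional dict-spread added above only attaches a "ulimits" key to the container properties when at least one ulimit was parsed; a minimal illustration (with_ulimits is a hypothetical name used only here):

# Mirrors the **({"ulimits": ulimits} if ulimits else {}) idiom from the hunk above.
def with_ulimits(props: dict, ulimits: list) -> dict:
    return {**props, **({"ulimits": ulimits} if ulimits else {})}

with_ulimits({"privileged": False}, [])
# {'privileged': False}  <- empty list: no "ulimits" key at all
with_ulimits({"privileged": False}, [{"name": "nofile", "softLimit": 1024, "hardLimit": 1024}])
# {'privileged': False, 'ulimits': [{'name': 'nofile', 'softLimit': 1024, 'hardLimit': 1024}]}
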
@@ -361,6 +396,7 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[list[str]]
 
 
 class AWSBatchScheduler(
@@ -506,6 +542,7 @@ class AWSBatchScheduler(
             role = values.apply(role)
             role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
             role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
+            role.env[ENV_TORCHX_IMAGE] = role.image
 
             nodes.append(
                 _role_to_node_properties(
@@ -514,6 +551,7 @@ class AWSBatchScheduler(
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or []),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +637,11 @@ class AWSBatchScheduler(
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=List[str],
+            help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
+        )
         return opts
 
     def _get_job_id(self, app_id: str) -> Optional[str]:
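
Taken together, the AWS Batch changes register a new ulimits run option, parse it with parse_ulimits, and forward the result into each node's container properties. A hedged sketch of a config dict carrying the option (the queue/privileged values are placeholders, not from this diff):

# Sketch of an AWSBatchOpts-style cfg; only the "ulimits" entry is new in this release.
cfg = {
    "queue": "my-job-queue",   # placeholder value
    "privileged": False,
    "ulimits": ["nofile:65535:65535", "core:-1:-1"],
}
# Per the diff, submission then calls: ulimits=parse_ulimits(cfg.get("ulimits") or [])
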
@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
 LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
 LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"
 
+ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
+
 NETWORK = "torchx"
 
 
@@ -279,6 +281,7 @@ class DockerScheduler(
 
                 # configure distributed host envs
                 env["TORCHX_RANK0_HOST"] = rank0_name
+                env[ENV_TORCHX_IMAGE] = replica_role.image
 
                 c = DockerContainer(
                     image=replica_role.image,
@@ -399,6 +399,7 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
             pod = role_to_pod(name, replica_role, service_account)
             pod.metadata.labels.update(
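
The TORCHX_IMAGE additions across the AWS Batch, Docker, and Kubernetes schedulers all expose the role's container image to the running replica via the environment. A minimal sketch of reading it from application code:

import os

# Inside a replica launched by one of the updated schedulers, the image the role
# was submitted with should now be available as TORCHX_IMAGE.
image = os.environ.get("TORCHX_IMAGE", "<unset>")
print(f"running in image: {image}")
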
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
 
 
+def get_appstate_from_job(job: dict[str, object]) -> AppState:
+    # Prior to slurm-23.11, job_state was a string and not a list
+    job_state = job.get("job_state", None)
+    if isinstance(job_state, list):
+        return appstate_from_slurm_state(job_state[0])
+    else:
+        return appstate_from_slurm_state(str(job_state))
+
+
 def version() -> Tuple[int, int]:
     """
     Uses ``sinfo --version`` to get the slurm version. If the command fails, it
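
The new get_appstate_from_job helper absorbs a Slurm JSON format change: slurm-23.11 and later report job_state as a list, older releases as a plain string. A behavior sketch (the RUNNING mapping is assumed from torchx's SLURM_STATES table):

# Uses the helper defined in the hunk above; both shapes resolve to the same AppState.
get_appstate_from_job({"job_state": ["RUNNING"]})  # slurm >= 23.11: list, first entry used
get_appstate_from_job({"job_state": "RUNNING"})    # older slurm: plain string
get_appstate_from_job({})                          # missing key -> str(None) -> AppState.UNKNOWN
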
@@ -666,7 +675,7 @@ class SlurmScheduler(
 
             entrypoint = job["command"]
             image = job["current_working_directory"]
-            state = appstate_from_slurm_state(job["job_state"][0])
+            state = get_appstate_from_job(job)
 
             job_resources = job["job_resources"]
 
@@ -881,7 +890,7 @@ class SlurmScheduler(
             out.append(
                 ListAppResponse(
                     app_id=str(job["job_id"]),
-                    state=SLURM_STATES[job["job_state"][0]],
+                    state=get_appstate_from_job(job),
                     name=job["name"],
                 )
             )
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torchx-nightly
-Version: 2025.10.1
+Version: 2025.10.2
 Summary: TorchX SDK and Components
 Home-page: https://github.com/meta-pytorch/torchx
 Author: TorchX Devs
@@ -58,16 +58,16 @@ torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=_Wx6-X3FNh8RJR82UGgUwKg7V_VQYsAkrveDoSSk2xU,2195
 torchx/schedulers/api.py,sha256=lfxNhrEO6eYYqVuQzzj9sTXrZShuZkyYxJ1jPE-Lvpo,14561
-torchx/schedulers/aws_batch_scheduler.py,sha256=hFxYzSZEK2SVS5sEyQC5YvNI0JJUJUQsWORlYpj_h3M,28105
+torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
 torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
-torchx/schedulers/docker_scheduler.py,sha256=xuK00-dB6o8TV1YaZox7O5P09LHB2KeQ6t4eiNtqMYQ,16781
+torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
 torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
 torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
-torchx/schedulers/kubernetes_scheduler.py,sha256=hCqtcz8S5doqFa9SUX_KRJAKg0FzkYHHvEf1tjq1Rak,28222
+torchx/schedulers/kubernetes_scheduler.py,sha256=Wb6XDzwcvp3-NqBhKrjtgDC4L6GVOmcyP6fuoPFByBE,28288
 torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
 torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
-torchx/schedulers/slurm_scheduler.py,sha256=vZt102OxuTGj0ZE-V9dWbldtOyL2VbHcxADm_osL7Y4,31568
+torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/specs/__init__.py,sha256=RNjj4cV64AXP-2XowHLJJpzub1zYuyS17-2SU-dCcN0,6632
 torchx/specs/api.py,sha256=ZJEqBnEFG2jMMfQuIrBFHiX-Thr_wz2mAMiYeGf-fWo,42311
@@ -102,9 +102,9 @@ torchx/workspace/__init__.py,sha256=cZsKVvUWwDYcGhe6SCXQGBQfbk_yTnKEImOkI6xmu30,
 torchx/workspace/api.py,sha256=MGBQauBoH7wZdvXHXOx7JqefCF41rK0AHWF68IUwr4k,11276
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.10.1.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
-torchx_nightly-2025.10.1.dist-info/METADATA,sha256=6utanW11HzSGBuD995u_wr6QRD7vTc-wV8JQfBrvJR0,5068
-torchx_nightly-2025.10.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-torchx_nightly-2025.10.1.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
-torchx_nightly-2025.10.1.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
-torchx_nightly-2025.10.1.dist-info/RECORD,,
+torchx_nightly-2025.10.2.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.10.2.dist-info/METADATA,sha256=X5eR-tfdt5wWYtmM_S-i7NgJm49DxolTkcP4dbs7KfY,5068
+torchx_nightly-2025.10.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.10.2.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.10.2.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.10.2.dist-info/RECORD,,