torchx-nightly 2025.10.1__py3-none-any.whl → 2025.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic.
- torchx/schedulers/aws_batch_scheduler.py +44 -1
- torchx/schedulers/docker_scheduler.py +3 -0
- torchx/schedulers/kubernetes_scheduler.py +1 -0
- torchx/schedulers/slurm_scheduler.py +11 -2
- {torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/RECORD +10 -10
- {torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/top_level.txt +0 -0
torchx/schedulers/aws_batch_scheduler.py
@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"
 
 ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"
 
+ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
+
 DEFAULT_ROLE_NAME = "node"
 
 TAG_TORCHX_VER = "torchx.pytorch.org/version"
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
 TAG_TORCHX_USER = "torchx.pytorch.org/user"
 
 
+def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
+    """
+    Parse ulimit string in format: name:softLimit:hardLimit
+    Multiple ulimits separated by commas.
+    """
+    if not ulimits_list:
+        return []
+
+    ulimits = []
+    for ulimit_str in ulimits_list:
+        if not ulimit_str.strip():
+            continue
+
+        parts = ulimit_str.strip().split(":")
+        if len(parts) != 3:
+            raise ValueError(
+                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
+            )
+
+        name, soft_limit, hard_limit = parts
+        ulimits.append(
+            {
+                "name": name,
+                "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
+                "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
+            }
+        )
+
+    return ulimits
+
+
 if TYPE_CHECKING:
     from docker import DockerClient
 
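For context (not part of the package diff), a minimal usage sketch of the new parse_ulimits helper; it assumes a torchx-nightly build that already contains this change:

# Usage sketch for the parse_ulimits helper introduced above (assumes
# torchx-nightly >= 2025.10.3 so the function is importable).
from torchx.schedulers.aws_batch_scheduler import parse_ulimits

# Each entry uses the "name:softLimit:hardLimit" format; "-1" means unlimited.
limits = parse_ulimits(["nofile:65536:65536", "core:-1:-1"])
print(limits)
# Per the implementation above, this prints:
# [{'name': 'nofile', 'softLimit': 65536, 'hardLimit': 65536},
#  {'name': 'core', 'softLimit': -1, 'hardLimit': -1}]

# Entries that do not match name:softLimit:hardLimit raise ValueError, e.g.:
# parse_ulimits(["nofile:65536"])  # ValueError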
@@ -177,7 +210,8 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
-) -> Dict[str, Any]:
+    ulimits: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
     role.mounts += get_device_mounts(role.resource.devices)
 
     mount_points = []
@@ -239,6 +273,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
@@ -361,6 +396,7 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[list[str]]
 
 
 class AWSBatchScheduler(
@@ -506,6 +542,7 @@ class AWSBatchScheduler(
             role = values.apply(role)
             role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
             role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
+            role.env[ENV_TORCHX_IMAGE] = role.image
 
             nodes.append(
                 _role_to_node_properties(
@@ -514,6 +551,7 @@ class AWSBatchScheduler(
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or []),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +637,11 @@ class AWSBatchScheduler(
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=List[str],
+            help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)"
+        )
         return opts
 
     def _get_job_id(self, app_id: str) -> Optional[str]:
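For context (not part of the package diff), a sketch of how the new ulimits run option can be supplied through the scheduler's run config; "example-queue" is a placeholder and the snippet only exercises option resolution, not an actual AWS submission:

# Sketch: resolving the new "ulimits" run option for the AWS Batch scheduler
# (assumes torchx-nightly >= 2025.10.3; no AWS calls are made here).
from torchx.schedulers.aws_batch_scheduler import create_scheduler

scheduler = create_scheduler("demo-session")
opts = scheduler.run_opts()

# "queue" is the required Batch job queue; "ulimits" takes
# "name:softLimit:hardLimit" strings, as registered above.
cfg = opts.resolve({"queue": "example-queue", "ulimits": ["nofile:65536:65536"]})
print(cfg["ulimits"])  # ['nofile:65536:65536']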
torchx/schedulers/docker_scheduler.py
@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
 LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
 LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"
 
+ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
+
 NETWORK = "torchx"
 
 
@@ -279,6 +281,7 @@ class DockerScheduler(
 
                 # configure distributed host envs
                 env["TORCHX_RANK0_HOST"] = rank0_name
+                env[ENV_TORCHX_IMAGE] = replica_role.image
 
                 c = DockerContainer(
                     image=replica_role.image,
torchx/schedulers/kubernetes_scheduler.py
@@ -399,6 +399,7 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
             pod = role_to_pod(name, replica_role, service_account)
             pod.metadata.labels.update(
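The Docker and Kubernetes hunks above (together with the AWS Batch change) inject the replica's container image into a TORCHX_IMAGE environment variable. A small illustrative sketch of reading it from inside a running replica; the fallback string is a placeholder:

# Illustrative: read the TORCHX_IMAGE value set by the schedulers in this release.
import os

image = os.environ.get("TORCHX_IMAGE", "<unset>")
print(f"this replica was launched from image: {image}")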
torchx/schedulers/slurm_scheduler.py
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
 
 
+def get_appstate_from_job(job: dict[str, object]) -> AppState:
+    # Prior to slurm-23.11, job_state was a string and not a list
+    job_state = job.get("job_state", None)
+    if isinstance(job_state, list):
+        return appstate_from_slurm_state(job_state[0])
+    else:
+        return appstate_from_slurm_state(str(job_state))
+
+
 def version() -> Tuple[int, int]:
     """
     Uses ``sinfo --version`` to get the slurm version. If the command fails, it
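For context (not part of the package diff), a minimal sketch of the behavior get_appstate_from_job normalizes across Slurm versions; it assumes a torchx-nightly build containing this change:

# Sketch: job_state is a list on slurm >= 23.11 and a plain string on older
# releases; both shapes should map to the same AppState.
from torchx.schedulers.slurm_scheduler import get_appstate_from_job
from torchx.specs import AppState

assert get_appstate_from_job({"job_state": ["RUNNING"]}) == AppState.RUNNING
assert get_appstate_from_job({"job_state": "RUNNING"}) == AppState.RUNNING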
@@ -666,7 +675,7 @@ class SlurmScheduler(
 
             entrypoint = job["command"]
             image = job["current_working_directory"]
-            state =
+            state = get_appstate_from_job(job)
 
             job_resources = job["job_resources"]
 
@@ -881,7 +890,7 @@ class SlurmScheduler(
             out.append(
                 ListAppResponse(
                     app_id=str(job["job_id"]),
-                    state=
+                    state=get_appstate_from_job(job),
                     name=job["name"],
                 )
             )
{torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/RECORD
@@ -58,16 +58,16 @@ torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=_Wx6-X3FNh8RJR82UGgUwKg7V_VQYsAkrveDoSSk2xU,2195
 torchx/schedulers/api.py,sha256=lfxNhrEO6eYYqVuQzzj9sTXrZShuZkyYxJ1jPE-Lvpo,14561
-torchx/schedulers/aws_batch_scheduler.py,sha256
+torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
 torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
-torchx/schedulers/docker_scheduler.py,sha256=
+torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
 torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
 torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
-torchx/schedulers/kubernetes_scheduler.py,sha256=
+torchx/schedulers/kubernetes_scheduler.py,sha256=Wb6XDzwcvp3-NqBhKrjtgDC4L6GVOmcyP6fuoPFByBE,28288
 torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
 torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
-torchx/schedulers/slurm_scheduler.py,sha256=
+torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/specs/__init__.py,sha256=RNjj4cV64AXP-2XowHLJJpzub1zYuyS17-2SU-dCcN0,6632
 torchx/specs/api.py,sha256=ZJEqBnEFG2jMMfQuIrBFHiX-Thr_wz2mAMiYeGf-fWo,42311
@@ -102,9 +102,9 @@ torchx/workspace/__init__.py,sha256=cZsKVvUWwDYcGhe6SCXQGBQfbk_yTnKEImOkI6xmu30,
 torchx/workspace/api.py,sha256=MGBQauBoH7wZdvXHXOx7JqefCF41rK0AHWF68IUwr4k,11276
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
-torchx_nightly-2025.10.
+torchx_nightly-2025.10.3.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.10.3.dist-info/METADATA,sha256=L9OzHKF3z3Yu3LqJxU0-joIiJuE2rffQR_5jXzWg1aU,5068
+torchx_nightly-2025.10.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.10.3.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.10.3.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.10.3.dist-info/RECORD,,
{torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/LICENSE: file without changes
{torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/WHEEL: file without changes
{torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/entry_points.txt: file without changes
{torchx_nightly-2025.10.1.dist-info → torchx_nightly-2025.10.3.dist-info}/top_level.txt: file without changes