torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +102 -81
- torchx/runner/config.py +3 -1
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +66 -25
- torchx/schedulers/aws_batch_scheduler.py +47 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +4 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +355 -36
- torchx/schedulers/local_scheduler.py +2 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +102 -27
- torchx/specs/__init__.py +40 -9
- torchx/specs/api.py +222 -12
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/specs/overlays.py +106 -0
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
|
@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"
|
|
|
92
92
|
|
|
93
93
|
ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"
|
|
94
94
|
|
|
95
|
+
ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
|
|
96
|
+
|
|
95
97
|
DEFAULT_ROLE_NAME = "node"
|
|
96
98
|
|
|
97
99
|
TAG_TORCHX_VER = "torchx.pytorch.org/version"
|
|
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
|
|
|
99
101
|
TAG_TORCHX_USER = "torchx.pytorch.org/user"
|
|
100
102
|
|
|
101
103
|
|
|
104
|
+
def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
    """
    Parse ulimit specifications into a list of ulimit dictionaries.

    Each entry of ``ulimits_list`` must have the form
    ``name:softLimit:hardLimit`` (for example ``nofile:1024:4096``).
    Whitespace around the entry and around each field is ignored, and
    blank entries are skipped.

    Args:
        ulimits_list: ulimit specifications, one ``name:softLimit:hardLimit``
            string per element. ``None`` or an empty list yields ``[]``.

    Returns:
        One dict per non-blank entry with the keys ``name`` (str),
        ``softLimit`` (int) and ``hardLimit`` (int), in input order.

    Raises:
        ValueError: if an entry does not have exactly three ``:``-separated
            fields, if its name is empty, or if a limit is not an integer.
    """
    if not ulimits_list:
        return []

    ulimits: List[Dict[str, Any]] = []
    for ulimit_str in ulimits_list:
        spec = ulimit_str.strip()
        if not spec:
            continue

        parts = [part.strip() for part in spec.split(":")]
        if len(parts) != 3:
            raise ValueError(
                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
            )

        name, soft_limit, hard_limit = parts
        if not name:
            raise ValueError(
                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
            )

        try:
            # int() already maps "-1" to -1, so no special case is needed.
            soft = int(soft_limit)
            hard = int(hard_limit)
        except ValueError as e:
            # Re-raise with the offending entry so the user can locate it.
            raise ValueError(
                f"ulimit softLimit and hardLimit must be integers, got: {ulimit_str}"
            ) from e

        ulimits.append({"name": name, "softLimit": soft, "hardLimit": hard})

    return ulimits
|
|
133
|
+
|
|
134
|
+
|
|
102
135
|
if TYPE_CHECKING:
|
|
103
136
|
from docker import DockerClient
|
|
104
137
|
|
|
@@ -177,7 +210,8 @@ def _role_to_node_properties(
|
|
|
177
210
|
privileged: bool = False,
|
|
178
211
|
job_role_arn: Optional[str] = None,
|
|
179
212
|
execution_role_arn: Optional[str] = None,
|
|
180
|
-
|
|
213
|
+
ulimits: Optional[List[Dict[str, Any]]] = None,
|
|
214
|
+
) -> Dict[str, Any]:
|
|
181
215
|
role.mounts += get_device_mounts(role.resource.devices)
|
|
182
216
|
|
|
183
217
|
mount_points = []
|
|
@@ -239,6 +273,7 @@ def _role_to_node_properties(
|
|
|
239
273
|
"environment": [{"name": k, "value": v} for k, v in role.env.items()],
|
|
240
274
|
"privileged": privileged,
|
|
241
275
|
"resourceRequirements": resource_requirements_from_resource(role.resource),
|
|
276
|
+
**({"ulimits": ulimits} if ulimits else {}),
|
|
242
277
|
"linuxParameters": {
|
|
243
278
|
# To support PyTorch dataloaders we need to set /dev/shm to larger
|
|
244
279
|
# than the 64M default.
|
|
@@ -255,7 +290,7 @@ def _role_to_node_properties(
|
|
|
255
290
|
container["jobRoleArn"] = job_role_arn
|
|
256
291
|
if execution_role_arn:
|
|
257
292
|
container["executionRoleArn"] = execution_role_arn
|
|
258
|
-
if role.num_replicas >
|
|
293
|
+
if role.num_replicas > 0:
|
|
259
294
|
instance_type = instance_type_from_resource(role.resource)
|
|
260
295
|
if instance_type is not None:
|
|
261
296
|
container["instanceType"] = instance_type
|
|
@@ -346,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
|
|
|
346
381
|
|
|
347
382
|
|
|
348
383
|
@_thread_local_cache
|
|
349
|
-
def _local_session() -> "boto3.session.Session":
|
|
384
|
+
def _local_session() -> "boto3.session.Session": # noqa: F821
|
|
350
385
|
import boto3.session
|
|
351
386
|
|
|
352
387
|
return boto3.session.Session()
|
|
@@ -361,11 +396,10 @@ class AWSBatchOpts(TypedDict, total=False):
|
|
|
361
396
|
priority: int
|
|
362
397
|
job_role_arn: Optional[str]
|
|
363
398
|
execution_role_arn: Optional[str]
|
|
399
|
+
ulimits: Optional[list[str]]
|
|
364
400
|
|
|
365
401
|
|
|
366
|
-
class AWSBatchScheduler(
|
|
367
|
-
DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
|
|
368
|
-
):
|
|
402
|
+
class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
|
|
369
403
|
"""
|
|
370
404
|
AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
|
|
371
405
|
|
|
@@ -506,6 +540,7 @@ class AWSBatchScheduler(
|
|
|
506
540
|
role = values.apply(role)
|
|
507
541
|
role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
|
|
508
542
|
role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
|
|
543
|
+
role.env[ENV_TORCHX_IMAGE] = role.image
|
|
509
544
|
|
|
510
545
|
nodes.append(
|
|
511
546
|
_role_to_node_properties(
|
|
@@ -514,6 +549,7 @@ class AWSBatchScheduler(
|
|
|
514
549
|
privileged=cfg["privileged"],
|
|
515
550
|
job_role_arn=cfg.get("job_role_arn"),
|
|
516
551
|
execution_role_arn=cfg.get("execution_role_arn"),
|
|
552
|
+
ulimits=parse_ulimits(cfg.get("ulimits") or []),
|
|
517
553
|
)
|
|
518
554
|
)
|
|
519
555
|
node_idx += role.num_replicas
|
|
@@ -599,6 +635,11 @@ class AWSBatchScheduler(
|
|
|
599
635
|
type_=str,
|
|
600
636
|
help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
|
|
601
637
|
)
|
|
638
|
+
opts.add(
|
|
639
|
+
"ulimits",
|
|
640
|
+
type_=List[str],
|
|
641
|
+
help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
|
|
642
|
+
)
|
|
602
643
|
return opts
|
|
603
644
|
|
|
604
645
|
def _get_job_id(self, app_id: str) -> Optional[str]:
|
|
@@ -157,7 +157,7 @@ def _merge_ordered(
|
|
|
157
157
|
|
|
158
158
|
class AWSSageMakerScheduler(
|
|
159
159
|
DockerWorkspaceMixin,
|
|
160
|
-
Scheduler[AWSSageMakerOpts
|
|
160
|
+
Scheduler[AWSSageMakerOpts],
|
|
161
161
|
):
|
|
162
162
|
"""
|
|
163
163
|
AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
|
|
@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
|
|
|
84
84
|
LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
|
|
85
85
|
LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"
|
|
86
86
|
|
|
87
|
+
ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
|
|
88
|
+
|
|
87
89
|
NETWORK = "torchx"
|
|
88
90
|
|
|
89
91
|
|
|
@@ -127,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
|
|
|
127
129
|
privileged: bool
|
|
128
130
|
|
|
129
131
|
|
|
130
|
-
class DockerScheduler(
|
|
131
|
-
DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
|
|
132
|
-
):
|
|
132
|
+
class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
|
|
133
133
|
"""
|
|
134
134
|
DockerScheduler is a TorchX scheduling interface to Docker.
|
|
135
135
|
|
|
@@ -279,6 +279,7 @@ class DockerScheduler(
|
|
|
279
279
|
|
|
280
280
|
# configure distributed host envs
|
|
281
281
|
env["TORCHX_RANK0_HOST"] = rank0_name
|
|
282
|
+
env[ENV_TORCHX_IMAGE] = replica_role.image
|
|
282
283
|
|
|
283
284
|
c = DockerContainer(
|
|
284
285
|
image=replica_role.image,
|
torchx/schedulers/ids.py
CHANGED
|
@@ -8,9 +8,9 @@
|
|
|
8
8
|
# pyre-strict
|
|
9
9
|
|
|
10
10
|
import os
|
|
11
|
-
import random
|
|
12
11
|
import struct
|
|
13
12
|
|
|
13
|
+
|
|
14
14
|
START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
|
|
15
15
|
END_CANDIDATES: str = START_CANDIDATES + "012345679"
|
|
16
16
|
|
|
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
|
|
|
19
19
|
"""
|
|
20
20
|
Appends a unique 64-bit string to the input argument.
|
|
21
21
|
|
|
22
|
+
Note that the unique string pulls entropy from `/dev/urandom` hence is not
|
|
23
|
+
affected by `random.seed()`
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
name: the name string to unique-ify
|
|
27
|
+
string_length: max length of the unique 64-bit string to append to the ``name``.
|
|
28
|
+
Default is 0, which applies no limit: the suffix is the full randomly generated 64-bit ID (typically 12-14 characters long).
|
|
29
|
+
|
|
22
30
|
Returns:
|
|
23
|
-
string in format
|
|
31
|
+
string in format ``{name}-{unique_suffix}``
|
|
24
32
|
"""
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
if string_length == 0
|
|
28
|
-
else f"{name}-{get_len_random_id(string_length)}"
|
|
29
|
-
)
|
|
33
|
+
max_length = None if string_length == 0 else string_length
|
|
34
|
+
return f"{name}-{random_id(max_length)}"
|
|
30
35
|
|
|
31
36
|
|
|
32
37
|
def random_uint64() -> int:
|
|
@@ -36,13 +41,24 @@ def random_uint64() -> int:
|
|
|
36
41
|
return struct.unpack("!Q", os.urandom(8))[0]
|
|
37
42
|
|
|
38
43
|
|
|
39
|
-
def random_id() -> str:
|
|
44
|
+
def random_id(max_length: int | None = None) -> str:
|
|
40
45
|
"""
|
|
41
46
|
Generates an alphanumeric string ID that matches the requirements from
|
|
42
47
|
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
|
|
48
|
+
|
|
49
|
+
Note that the unique string pulls entropy from `/dev/urandom` hence is not
|
|
50
|
+
affected by `random.seed()`
|
|
51
|
+
|
|
52
|
+
If ``max_length`` is provided, the returned ID will be at most that many characters long.
|
|
53
|
+
|
|
43
54
|
"""
|
|
55
|
+
# If a max_length is provided and is non-positive, return empty string
|
|
56
|
+
if max_length is not None and max_length <= 0:
|
|
57
|
+
return ""
|
|
58
|
+
|
|
44
59
|
out = ""
|
|
45
60
|
v = random_uint64()
|
|
61
|
+
|
|
46
62
|
while v > 0:
|
|
47
63
|
if out == "":
|
|
48
64
|
candidates = START_CANDIDATES
|
|
@@ -52,21 +68,9 @@ def random_id() -> str:
|
|
|
52
68
|
char = v % len(candidates)
|
|
53
69
|
v = v // len(candidates)
|
|
54
70
|
out += candidates[char]
|
|
55
|
-
return out
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def get_len_random_id(string_length: int) -> str:
|
|
59
|
-
"""
|
|
60
|
-
Generates an alphanumeric string ID that matches the requirements from
|
|
61
|
-
https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
|
|
62
|
-
"""
|
|
63
|
-
out = ""
|
|
64
|
-
for i in range(string_length):
|
|
65
|
-
if out == "":
|
|
66
|
-
candidates = START_CANDIDATES
|
|
67
|
-
else:
|
|
68
|
-
candidates = END_CANDIDATES
|
|
69
71
|
|
|
70
|
-
|
|
72
|
+
if max_length is not None and len(out) >= max_length:
|
|
73
|
+
break
|
|
71
74
|
|
|
75
|
+
# NOTE: statistically the length of `out` is typically between 12-14 characters long
|
|
72
76
|
return out
|
|
@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
|
|
|
796
796
|
network: Optional[str]
|
|
797
797
|
|
|
798
798
|
|
|
799
|
-
class KubernetesMCADScheduler(
|
|
800
|
-
DockerWorkspaceMixin,
|
|
801
|
-
Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
|
|
802
|
-
):
|
|
799
|
+
class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
|
|
803
800
|
"""
|
|
804
801
|
KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
|
|
805
802
|
|