torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  2. torchx/cli/cmd_delete.py +30 -0
  3. torchx/cli/cmd_list.py +1 -2
  4. torchx/cli/cmd_run.py +202 -28
  5. torchx/cli/cmd_tracker.py +1 -1
  6. torchx/cli/main.py +2 -0
  7. torchx/components/__init__.py +1 -8
  8. torchx/components/dist.py +9 -3
  9. torchx/components/integration_tests/component_provider.py +2 -2
  10. torchx/components/utils.py +1 -1
  11. torchx/distributed/__init__.py +1 -1
  12. torchx/runner/api.py +102 -81
  13. torchx/runner/config.py +3 -1
  14. torchx/runner/events/__init__.py +20 -10
  15. torchx/runner/events/api.py +1 -1
  16. torchx/schedulers/__init__.py +7 -10
  17. torchx/schedulers/api.py +66 -25
  18. torchx/schedulers/aws_batch_scheduler.py +47 -6
  19. torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
  20. torchx/schedulers/docker_scheduler.py +4 -3
  21. torchx/schedulers/ids.py +27 -23
  22. torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
  23. torchx/schedulers/kubernetes_scheduler.py +355 -36
  24. torchx/schedulers/local_scheduler.py +2 -1
  25. torchx/schedulers/lsf_scheduler.py +1 -1
  26. torchx/schedulers/slurm_scheduler.py +102 -27
  27. torchx/specs/__init__.py +40 -9
  28. torchx/specs/api.py +222 -12
  29. torchx/specs/builders.py +109 -28
  30. torchx/specs/file_linter.py +117 -53
  31. torchx/specs/finder.py +25 -37
  32. torchx/specs/named_resources_aws.py +13 -2
  33. torchx/specs/overlays.py +106 -0
  34. torchx/tracker/__init__.py +2 -2
  35. torchx/tracker/api.py +1 -1
  36. torchx/util/entrypoints.py +1 -6
  37. torchx/util/strings.py +1 -1
  38. torchx/util/types.py +12 -1
  39. torchx/version.py +2 -2
  40. torchx/workspace/api.py +102 -5
  41. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
  42. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
  43. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
  44. torchx/examples/pipelines/__init__.py +0 -0
  45. torchx/examples/pipelines/kfp/__init__.py +0 -0
  46. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
  47. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
  48. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
  49. torchx/pipelines/kfp/__init__.py +0 -30
  50. torchx/pipelines/kfp/adapter.py +0 -274
  51. torchx/pipelines/kfp/version.py +0 -19
  52. torchx/schedulers/gcp_batch_scheduler.py +0 -497
  53. torchx/schedulers/ray/ray_common.py +0 -22
  54. torchx/schedulers/ray/ray_driver.py +0 -307
  55. torchx/schedulers/ray_scheduler.py +0 -454
  56. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
  57. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
  58. {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/schedulers/aws_batch_scheduler.py CHANGED
@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"
92
92
 
93
93
  ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"
94
94
 
95
+ ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
96
+
95
97
  DEFAULT_ROLE_NAME = "node"
96
98
 
97
99
  TAG_TORCHX_VER = "torchx.pytorch.org/version"
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
99
101
  TAG_TORCHX_USER = "torchx.pytorch.org/user"
100
102
 
101
103
 
104
+ def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
105
+ """
106
+ Parse a list of ulimit strings, each in the format: name:softLimit:hardLimit
107
+ Multiple ulimits separated by commas.
108
+ """
109
+ if not ulimits_list:
110
+ return []
111
+
112
+ ulimits = []
113
+ for ulimit_str in ulimits_list:
114
+ if not ulimit_str.strip():
115
+ continue
116
+
117
+ parts = ulimit_str.strip().split(":")
118
+ if len(parts) != 3:
119
+ raise ValueError(
120
+ f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
121
+ )
122
+
123
+ name, soft_limit, hard_limit = parts
124
+ ulimits.append(
125
+ {
126
+ "name": name,
127
+ "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
128
+ "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
129
+ }
130
+ )
131
+
132
+ return ulimits
133
+
134
+
102
135
  if TYPE_CHECKING:
103
136
  from docker import DockerClient
104
137
 
@@ -177,7 +210,8 @@ def _role_to_node_properties(
177
210
  privileged: bool = False,
178
211
  job_role_arn: Optional[str] = None,
179
212
  execution_role_arn: Optional[str] = None,
180
- ) -> Dict[str, object]:
213
+ ulimits: Optional[List[Dict[str, Any]]] = None,
214
+ ) -> Dict[str, Any]:
181
215
  role.mounts += get_device_mounts(role.resource.devices)
182
216
 
183
217
  mount_points = []
@@ -239,6 +273,7 @@ def _role_to_node_properties(
239
273
  "environment": [{"name": k, "value": v} for k, v in role.env.items()],
240
274
  "privileged": privileged,
241
275
  "resourceRequirements": resource_requirements_from_resource(role.resource),
276
+ **({"ulimits": ulimits} if ulimits else {}),
242
277
  "linuxParameters": {
243
278
  # To support PyTorch dataloaders we need to set /dev/shm to larger
244
279
  # than the 64M default.
@@ -255,7 +290,7 @@ def _role_to_node_properties(
255
290
  container["jobRoleArn"] = job_role_arn
256
291
  if execution_role_arn:
257
292
  container["executionRoleArn"] = execution_role_arn
258
- if role.num_replicas > 1:
293
+ if role.num_replicas > 0:
259
294
  instance_type = instance_type_from_resource(role.resource)
260
295
  if instance_type is not None:
261
296
  container["instanceType"] = instance_type
@@ -346,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
346
381
 
347
382
 
348
383
  @_thread_local_cache
349
- def _local_session() -> "boto3.session.Session":
384
+ def _local_session() -> "boto3.session.Session": # noqa: F821
350
385
  import boto3.session
351
386
 
352
387
  return boto3.session.Session()
@@ -361,11 +396,10 @@ class AWSBatchOpts(TypedDict, total=False):
361
396
  priority: int
362
397
  job_role_arn: Optional[str]
363
398
  execution_role_arn: Optional[str]
399
+ ulimits: Optional[list[str]]
364
400
 
365
401
 
366
- class AWSBatchScheduler(
367
- DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
368
- ):
402
+ class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
369
403
  """
370
404
  AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
371
405
 
@@ -506,6 +540,7 @@ class AWSBatchScheduler(
506
540
  role = values.apply(role)
507
541
  role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
508
542
  role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
543
+ role.env[ENV_TORCHX_IMAGE] = role.image
509
544
 
510
545
  nodes.append(
511
546
  _role_to_node_properties(
@@ -514,6 +549,7 @@ class AWSBatchScheduler(
514
549
  privileged=cfg["privileged"],
515
550
  job_role_arn=cfg.get("job_role_arn"),
516
551
  execution_role_arn=cfg.get("execution_role_arn"),
552
+ ulimits=parse_ulimits(cfg.get("ulimits") or []),
517
553
  )
518
554
  )
519
555
  node_idx += role.num_replicas
@@ -599,6 +635,11 @@ class AWSBatchScheduler(
599
635
  type_=str,
600
636
  help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
601
637
  )
638
+ opts.add(
639
+ "ulimits",
640
+ type_=List[str],
641
+ help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
642
+ )
602
643
  return opts
603
644
 
604
645
  def _get_job_id(self, app_id: str) -> Optional[str]:
torchx/schedulers/aws_sagemaker_scheduler.py CHANGED
@@ -157,7 +157,7 @@ def _merge_ordered(
157
157
 
158
158
  class AWSSageMakerScheduler(
159
159
  DockerWorkspaceMixin,
160
- Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
160
+ Scheduler[AWSSageMakerOpts],
161
161
  ):
162
162
  """
163
163
  AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
torchx/schedulers/docker_scheduler.py CHANGED
@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
84
84
  LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
85
85
  LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"
86
86
 
87
+ ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
88
+
87
89
  NETWORK = "torchx"
88
90
 
89
91
 
@@ -127,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
127
129
  privileged: bool
128
130
 
129
131
 
130
- class DockerScheduler(
131
- DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
132
- ):
132
+ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
133
133
  """
134
134
  DockerScheduler is a TorchX scheduling interface to Docker.
135
135
 
@@ -279,6 +279,7 @@ class DockerScheduler(
279
279
 
280
280
  # configure distributed host envs
281
281
  env["TORCHX_RANK0_HOST"] = rank0_name
282
+ env[ENV_TORCHX_IMAGE] = replica_role.image
282
283
 
283
284
  c = DockerContainer(
284
285
  image=replica_role.image,
torchx/schedulers/ids.py CHANGED
@@ -8,9 +8,9 @@
8
8
  # pyre-strict
9
9
 
10
10
  import os
11
- import random
12
11
  import struct
13
12
 
13
+
14
14
  START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
15
15
  END_CANDIDATES: str = START_CANDIDATES + "012345679"
16
16
 
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
19
19
  """
20
20
  Appends a unique 64-bit string to the input argument.
21
21
 
22
+ Note that the unique string pulls entropy from `/dev/urandom` hence is not
23
+ affected by `random.seed()`
24
+
25
+ Args:
26
+ name: the name string to unique-ify
27
+ string_length: max length of the unique 64-bit string to append to the ``name``.
28
+ Default is 0, which applies no limit: the suffix keeps the full length of a randomly generated 64-bit string (typically 11-14 characters long).
29
+
22
30
  Returns:
23
- string in format $name-$unique_suffix
31
+ string in format ``{name}-{unique_suffix}``
24
32
  """
25
- return (
26
- f"{name}-{random_id()}"
27
- if string_length == 0
28
- else f"{name}-{get_len_random_id(string_length)}"
29
- )
33
+ max_length = None if string_length == 0 else string_length
34
+ return f"{name}-{random_id(max_length)}"
30
35
 
31
36
 
32
37
  def random_uint64() -> int:
@@ -36,13 +41,24 @@ def random_uint64() -> int:
36
41
  return struct.unpack("!Q", os.urandom(8))[0]
37
42
 
38
43
 
39
- def random_id() -> str:
44
+ def random_id(max_length: int | None = None) -> str:
40
45
  """
41
46
  Generates an alphanumeric string ID that matches the requirements from
42
47
  https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
48
+
49
+ Note that the unique string pulls entropy from `/dev/urandom` hence is not
50
+ affected by `random.seed()`
51
+
52
+ If ``max_length`` is provided, the returned ID will be at most that many characters long.
53
+
43
54
  """
55
+ # If a max_length is provided and is non-positive, return empty string
56
+ if max_length is not None and max_length <= 0:
57
+ return ""
58
+
44
59
  out = ""
45
60
  v = random_uint64()
61
+
46
62
  while v > 0:
47
63
  if out == "":
48
64
  candidates = START_CANDIDATES
@@ -52,21 +68,9 @@ def random_id() -> str:
52
68
  char = v % len(candidates)
53
69
  v = v // len(candidates)
54
70
  out += candidates[char]
55
- return out
56
-
57
-
58
- def get_len_random_id(string_length: int) -> str:
59
- """
60
- Generates an alphanumeric string ID that matches the requirements from
61
- https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
62
- """
63
- out = ""
64
- for i in range(string_length):
65
- if out == "":
66
- candidates = START_CANDIDATES
67
- else:
68
- candidates = END_CANDIDATES
69
71
 
70
- out += random.choice(candidates)
72
+ if max_length is not None and len(out) >= max_length:
73
+ break
71
74
 
75
+ # NOTE: when not capped by `max_length`, `out` is typically 12-14 characters long
72
76
  return out
torchx/schedulers/kubernetes_mcad_scheduler.py CHANGED
@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
796
796
  network: Optional[str]
797
797
 
798
798
 
799
- class KubernetesMCADScheduler(
800
- DockerWorkspaceMixin,
801
- Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
802
- ):
799
+ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
803
800
  """
804
801
  KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
805
802