torchx-nightly 2023.10.21__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

Files changed (110) hide show
  1. torchx/__init__.py +2 -0
  2. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  3. torchx/apps/serve/serve.py +2 -0
  4. torchx/apps/utils/booth_main.py +2 -0
  5. torchx/apps/utils/copy_main.py +2 -0
  6. torchx/apps/utils/process_monitor.py +2 -0
  7. torchx/cli/__init__.py +2 -0
  8. torchx/cli/argparse_util.py +38 -3
  9. torchx/cli/cmd_base.py +2 -0
  10. torchx/cli/cmd_cancel.py +2 -0
  11. torchx/cli/cmd_configure.py +2 -0
  12. torchx/cli/cmd_delete.py +30 -0
  13. torchx/cli/cmd_describe.py +2 -0
  14. torchx/cli/cmd_list.py +8 -4
  15. torchx/cli/cmd_log.py +6 -24
  16. torchx/cli/cmd_run.py +269 -45
  17. torchx/cli/cmd_runopts.py +2 -0
  18. torchx/cli/cmd_status.py +12 -1
  19. torchx/cli/cmd_tracker.py +3 -1
  20. torchx/cli/colors.py +2 -0
  21. torchx/cli/main.py +4 -0
  22. torchx/components/__init__.py +3 -8
  23. torchx/components/component_test_base.py +2 -0
  24. torchx/components/dist.py +18 -7
  25. torchx/components/integration_tests/component_provider.py +4 -2
  26. torchx/components/integration_tests/integ_tests.py +2 -0
  27. torchx/components/serve.py +2 -0
  28. torchx/components/structured_arg.py +7 -6
  29. torchx/components/utils.py +15 -4
  30. torchx/distributed/__init__.py +2 -4
  31. torchx/examples/apps/datapreproc/datapreproc.py +2 -0
  32. torchx/examples/apps/lightning/data.py +5 -3
  33. torchx/examples/apps/lightning/model.py +7 -6
  34. torchx/examples/apps/lightning/profiler.py +7 -4
  35. torchx/examples/apps/lightning/train.py +11 -2
  36. torchx/examples/torchx_out_of_sync_training.py +11 -0
  37. torchx/notebook.py +2 -0
  38. torchx/runner/__init__.py +2 -0
  39. torchx/runner/api.py +167 -60
  40. torchx/runner/config.py +43 -10
  41. torchx/runner/events/__init__.py +57 -13
  42. torchx/runner/events/api.py +14 -3
  43. torchx/runner/events/handlers.py +2 -0
  44. torchx/runtime/tracking/__init__.py +2 -0
  45. torchx/runtime/tracking/api.py +2 -0
  46. torchx/schedulers/__init__.py +16 -15
  47. torchx/schedulers/api.py +70 -14
  48. torchx/schedulers/aws_batch_scheduler.py +79 -5
  49. torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
  50. torchx/schedulers/devices.py +17 -4
  51. torchx/schedulers/docker_scheduler.py +43 -11
  52. torchx/schedulers/ids.py +29 -23
  53. torchx/schedulers/kubernetes_mcad_scheduler.py +10 -8
  54. torchx/schedulers/kubernetes_scheduler.py +383 -38
  55. torchx/schedulers/local_scheduler.py +100 -27
  56. torchx/schedulers/lsf_scheduler.py +5 -4
  57. torchx/schedulers/slurm_scheduler.py +336 -20
  58. torchx/schedulers/streams.py +2 -0
  59. torchx/specs/__init__.py +89 -12
  60. torchx/specs/api.py +431 -32
  61. torchx/specs/builders.py +176 -38
  62. torchx/specs/file_linter.py +143 -57
  63. torchx/specs/finder.py +68 -28
  64. torchx/specs/named_resources_aws.py +254 -22
  65. torchx/specs/named_resources_generic.py +2 -0
  66. torchx/specs/overlays.py +106 -0
  67. torchx/specs/test/components/__init__.py +2 -0
  68. torchx/specs/test/components/a/__init__.py +2 -0
  69. torchx/specs/test/components/a/b/__init__.py +2 -0
  70. torchx/specs/test/components/a/b/c.py +2 -0
  71. torchx/specs/test/components/c/__init__.py +2 -0
  72. torchx/specs/test/components/c/d.py +2 -0
  73. torchx/tracker/__init__.py +12 -6
  74. torchx/tracker/api.py +15 -18
  75. torchx/tracker/backend/fsspec.py +2 -0
  76. torchx/util/cuda.py +2 -0
  77. torchx/util/datetime.py +2 -0
  78. torchx/util/entrypoints.py +39 -15
  79. torchx/util/io.py +2 -0
  80. torchx/util/log_tee_helpers.py +210 -0
  81. torchx/util/modules.py +65 -0
  82. torchx/util/session.py +42 -0
  83. torchx/util/shlex.py +2 -0
  84. torchx/util/strings.py +3 -1
  85. torchx/util/types.py +90 -29
  86. torchx/version.py +4 -2
  87. torchx/workspace/__init__.py +2 -0
  88. torchx/workspace/api.py +136 -6
  89. torchx/workspace/dir_workspace.py +2 -0
  90. torchx/workspace/docker_workspace.py +30 -2
  91. torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
  92. torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
  93. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
  94. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
  95. torchx/examples/pipelines/__init__.py +0 -0
  96. torchx/examples/pipelines/kfp/__init__.py +0 -0
  97. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
  98. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
  99. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
  100. torchx/pipelines/kfp/__init__.py +0 -28
  101. torchx/pipelines/kfp/adapter.py +0 -271
  102. torchx/pipelines/kfp/version.py +0 -17
  103. torchx/schedulers/gcp_batch_scheduler.py +0 -487
  104. torchx/schedulers/ray/ray_common.py +0 -22
  105. torchx/schedulers/ray/ray_driver.py +0 -307
  106. torchx/schedulers/ray_scheduler.py +0 -453
  107. torchx_nightly-2023.10.21.dist-info/METADATA +0 -174
  108. torchx_nightly-2023.10.21.dist-info/RECORD +0 -118
  109. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
  110. {torchx_nightly-2023.10.21.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
@@ -4,18 +4,20 @@
4
4
  # This source code is licensed under the BSD-style license found in the
5
5
  # LICENSE file in the root directory of this source tree.
6
6
 
7
+ # pyre-strict
8
+
7
9
  import fnmatch
8
10
  import logging
9
11
  import os.path
12
+ import re
10
13
  import tempfile
11
14
  from dataclasses import dataclass
12
15
  from datetime import datetime
13
- from typing import Any, Dict, Iterable, List, Optional, TYPE_CHECKING, Union
16
+ from typing import Any, Dict, Iterable, List, Optional, TYPE_CHECKING, TypedDict, Union
14
17
 
15
18
  import torchx
16
19
  import yaml
17
20
  from torchx.schedulers.api import (
18
- AppDryRunInfo,
19
21
  DescribeAppResponse,
20
22
  filter_regex,
21
23
  ListAppResponse,
@@ -27,6 +29,7 @@ from torchx.schedulers.devices import get_device_mounts
27
29
  from torchx.schedulers.ids import make_unique
28
30
  from torchx.specs.api import (
29
31
  AppDef,
32
+ AppDryRunInfo,
30
33
  AppState,
31
34
  BindMount,
32
35
  DeviceMount,
@@ -39,7 +42,6 @@ from torchx.specs.api import (
39
42
  VolumeMount,
40
43
  )
41
44
  from torchx.workspace.docker_workspace import DockerWorkspaceMixin
42
- from typing_extensions import TypedDict
43
45
 
44
46
 
45
47
  if TYPE_CHECKING:
@@ -82,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
82
84
  LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
83
85
  LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"
84
86
 
87
+ ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
88
+
85
89
  NETWORK = "torchx"
86
90
 
87
91
 
@@ -121,6 +125,8 @@ def ensure_network(client: Optional["DockerClient"] = None) -> None:
121
125
 
122
126
  class DockerOpts(TypedDict, total=False):
123
127
  copy_env: Optional[List[str]]
128
+ env: Optional[Dict[str, str]]
129
+ privileged: bool
124
130
 
125
131
 
126
132
  class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
@@ -215,9 +221,14 @@ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
215
221
  for k in keys:
216
222
  default_env[k] = os.environ[k]
217
223
 
224
+ env = cfg.get("env")
225
+ if env:
226
+ default_env.update(env)
227
+
218
228
  app_id = make_unique(app.name)
219
229
  req = DockerJob(app_id=app_id, containers=[])
220
- rank0_name = f"{app_id}-{app.roles[0].name}-0"
230
+ # trim app_id and role name in case name is longer than 64 letters
231
+ rank0_name = f"{app_id[-30:]}-{app.roles[0].name[:30]}-0"
221
232
  for role in app.roles:
222
233
  mounts = []
223
234
  devices = []
@@ -256,14 +267,19 @@ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
256
267
  rank0_env="TORCHX_RANK0_HOST",
257
268
  )
258
269
  replica_role = values.apply(role)
259
- name = f"{app_id}-{role.name}-{replica_id}"
260
-
270
+ # trim app_id and role name in case name is longer than 64 letters. Assume replica_id is less than 10_000. Remove invalid prefixes (https://github.com/moby/moby/blob/master/daemon/names/names.go#L6).
271
+ name = re.sub(
272
+ r"^[^a-zA-Z0-9]+",
273
+ "",
274
+ f"{app_id[-30:]}-{role.name[:30]}-{replica_id}",
275
+ )
261
276
  env = default_env.copy()
262
277
  if replica_role.env:
263
278
  env.update(replica_role.env)
264
279
 
265
280
  # configure distributed host envs
266
281
  env["TORCHX_RANK0_HOST"] = rank0_name
282
+ env[ENV_TORCHX_IMAGE] = replica_role.image
267
283
 
268
284
  c = DockerContainer(
269
285
  image=replica_role.image,
@@ -278,6 +294,7 @@ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
278
294
  LABEL_REPLICA_ID: str(replica_id),
279
295
  },
280
296
  "hostname": name,
297
+ "privileged": cfg.get("privileged", False),
281
298
  "network": NETWORK,
282
299
  "mounts": mounts,
283
300
  "devices": devices,
@@ -292,9 +309,9 @@ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
292
309
  if resource.memMB >= 0:
293
310
  # To support PyTorch dataloaders we need to set /dev/shm to
294
311
  # larger than the 64M default.
295
- c.kwargs["mem_limit"] = c.kwargs[
296
- "shm_size"
297
- ] = f"{int(resource.memMB)}m"
312
+ c.kwargs["mem_limit"] = c.kwargs["shm_size"] = (
313
+ f"{int(resource.memMB)}m"
314
+ )
298
315
  if resource.cpu >= 0:
299
316
  c.kwargs["nano_cpus"] = int(resource.cpu * 1e9)
300
317
  if resource.gpu > 0:
@@ -305,14 +322,14 @@ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
305
322
  c.kwargs["device_requests"] = [
306
323
  DeviceRequest(
307
324
  count=resource.gpu,
308
- capabilities=[["compute"]],
325
+ capabilities=[["compute", "utility"]],
309
326
  )
310
327
  ]
311
328
  req.containers.append(c)
312
329
 
313
330
  return AppDryRunInfo(req, repr)
314
331
 
315
- def _validate(self, app: AppDef, scheduler: str) -> None:
332
+ def _validate(self, app: AppDef, scheduler: str, cfg: DockerOpts) -> None:
316
333
  # Skip validation step
317
334
  pass
318
335
 
@@ -357,6 +374,21 @@ class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
357
374
  default=None,
358
375
  help="list of glob patterns of environment variables to copy if not set in AppDef. Ex: FOO_*",
359
376
  )
377
+ opts.add(
378
+ "env",
379
+ type_=Dict[str, str],
380
+ default=None,
381
+ help="""environment variables to be passed to the run. The separator sign can be either comma or semicolon
382
+ (e.g. ENV1:v1,ENV2:v2,ENV3:v3 or ENV1:V1;ENV2:V2). Environment variables from env will be applied on top
383
+ of the ones from copy_env""",
384
+ )
385
+ opts.add(
386
+ "privileged",
387
+ type_=bool,
388
+ default=False,
389
+ help="If true runs the container with elevated permissions."
390
+ " Equivalent to running with `docker run --privileged`.",
391
+ )
360
392
  return opts
361
393
 
362
394
  def _get_app_state(self, container: "Container") -> AppState:
torchx/schedulers/ids.py CHANGED
@@ -5,10 +5,12 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ # pyre-strict
9
+
8
10
  import os
9
- import random
10
11
  import struct
11
12
 
13
+
12
14
  START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
13
15
  END_CANDIDATES: str = START_CANDIDATES + "012345679"
14
16
 
@@ -17,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
17
19
  """
18
20
  Appends a unique 64-bit string to the input argument.
19
21
 
22
+ Note that the unique string pulls entropy from `/dev/urandom` hence is not
23
+ affected by `random.seed()`
24
+
25
+ Args:
26
+ name: the name string to unique-ify
27
+ string_length: max length of the unique 64-bit string to append to the ``name``.
28
+ Default is 0, which uses the full length of the randomly generated 64-bit string (typically 11-14 characters long).
29
+
20
30
  Returns:
21
- string in format $name-$unique_suffix
31
+ string in format ``{name}-{unique_suffix}``
22
32
  """
23
- return (
24
- f"{name}-{random_id()}"
25
- if string_length == 0
26
- else f"{name}-{get_len_random_id(string_length)}"
27
- )
33
+ max_length = None if string_length == 0 else string_length
34
+ return f"{name}-{random_id(max_length)}"
28
35
 
29
36
 
30
37
  def random_uint64() -> int:
@@ -34,13 +41,24 @@ def random_uint64() -> int:
34
41
  return struct.unpack("!Q", os.urandom(8))[0]
35
42
 
36
43
 
37
- def random_id() -> str:
44
+ def random_id(max_length: int | None = None) -> str:
38
45
  """
39
46
  Generates an alphanumeric string ID that matches the requirements from
40
47
  https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
48
+
49
+ Note that the unique string pulls entropy from `/dev/urandom` hence is not
50
+ affected by `random.seed()`
51
+
52
+ If ``max_length`` is provided, the returned ID will be at most that many characters long.
53
+
41
54
  """
55
+ # If a max_length is provided and is non-positive, return empty string
56
+ if max_length is not None and max_length <= 0:
57
+ return ""
58
+
42
59
  out = ""
43
60
  v = random_uint64()
61
+
44
62
  while v > 0:
45
63
  if out == "":
46
64
  candidates = START_CANDIDATES
@@ -50,21 +68,9 @@ def random_id() -> str:
50
68
  char = v % len(candidates)
51
69
  v = v // len(candidates)
52
70
  out += candidates[char]
53
- return out
54
-
55
-
56
- def get_len_random_id(string_length: int) -> str:
57
- """
58
- Generates an alphanumeric string ID that matches the requirements from
59
- https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
60
- """
61
- out = ""
62
- for i in range(string_length):
63
- if out == "":
64
- candidates = START_CANDIDATES
65
- else:
66
- candidates = END_CANDIDATES
67
71
 
68
- out += random.choice(candidates)
72
+ if max_length is not None and len(out) >= max_length:
73
+ break
69
74
 
75
+ # NOTE: statistically the length of `out` is typically between 12-14 characters long
70
76
  return out
@@ -5,6 +5,8 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ # pyre-strict
9
+
8
10
  """
9
11
 
10
12
  This contains the TorchX Kubernetes_MCAD scheduler which can be used to run TorchX
@@ -15,8 +17,8 @@ Prerequisites
15
17
 
16
18
  TorchX Kubernetes_MCAD scheduler depends on AppWrapper + MCAD.
17
19
 
18
- Install MCAD:
19
- See deploying Multi-Cluster-Application-Dispatcher guide
20
+ Install MCAD:
21
+ See deploying Multi-Cluster-Application-Dispatcher guide
20
22
  https://github.com/project-codeflare/multi-cluster-app-dispatcher/blob/main/doc/deploy/deployment.md
21
23
 
22
24
  This implementation requires MCAD v1.34.1 or higher.
@@ -44,12 +46,12 @@ from typing import (
44
46
  Optional,
45
47
  Tuple,
46
48
  TYPE_CHECKING,
49
+ TypedDict,
47
50
  )
48
51
 
49
52
  import torchx
50
53
  import yaml
51
54
  from torchx.schedulers.api import (
52
- AppDryRunInfo,
53
55
  DescribeAppResponse,
54
56
  filter_regex,
55
57
  ListAppResponse,
@@ -60,6 +62,7 @@ from torchx.schedulers.api import (
60
62
  from torchx.schedulers.ids import make_unique
61
63
  from torchx.specs.api import (
62
64
  AppDef,
65
+ AppDryRunInfo,
63
66
  AppState,
64
67
  BindMount,
65
68
  CfgVal,
@@ -76,7 +79,6 @@ from torchx.specs.api import (
76
79
  )
77
80
 
78
81
  from torchx.workspace.docker_workspace import DockerWorkspaceMixin
79
- from typing_extensions import TypedDict
80
82
 
81
83
  if TYPE_CHECKING:
82
84
  from docker import DockerClient
@@ -436,7 +438,7 @@ def mcad_svc(
436
438
  target_port=int(service_port),
437
439
  )
438
440
  ],
439
- selector={"appwrapper.workload.codeflare.dev": svc_name},
441
+ selector={LABEL_UNIQUE_NAME: svc_name},
440
442
  session_affinity="None",
441
443
  type="ClusterIP",
442
444
  ),
@@ -598,7 +600,7 @@ def app_to_resource(
598
600
 
599
601
  """
600
602
  Create Service:
601
- The selector will have the key 'appwrapper.workload.codeflare.dev', and the value will be
603
+ The selector will have the key 'appwrapper.workload.codeflare.dev', and the value will be
602
604
  the appwrapper name
603
605
  """
604
606
 
@@ -990,7 +992,7 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts
990
992
  if image_secret is not None and service_account is not None:
991
993
  msg = """Service Account and Image Secret names are both provided.
992
994
  Depending on the Service Account configuration, an ImagePullSecret may be defined in your Service Account.
993
- If this is the case, check service account and image secret configurations to understand the expected behavior for
995
+ If this is the case, check service account and image secret configurations to understand the expected behavior for
994
996
  patched image push access."""
995
997
  warnings.warn(msg)
996
998
  namespace = cfg.get("namespace")
@@ -1031,7 +1033,7 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts
1031
1033
  info._cfg = cfg
1032
1034
  return info
1033
1035
 
1034
- def _validate(self, app: AppDef, scheduler: str) -> None:
1036
+ def _validate(self, app: AppDef, scheduler: str, cfg: KubernetesMCADOpts) -> None:
1035
1037
  # Skip validation step
1036
1038
  pass
1037
1039