torchx-nightly 2025.10.16-py3-none-any.whl → 2025.11.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of torchx-nightly has been flagged as potentially problematic.

torchx/_version.py ADDED
@@ -0,0 +1,8 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ BASE_VERSION = "0.8.0dev0"
torchx/runner/api.py CHANGED
@@ -420,52 +420,44 @@ class Runner:
  scheduler,
  runcfg=json.dumps(cfg) if cfg else None,
  workspace=str(workspace),
- ):
+ ) as ctx:
  sched = self._scheduler(scheduler)
  resolved_cfg = sched.run_opts().resolve(cfg)
 
  sched._pre_build_validate(app, scheduler, resolved_cfg)
 
  if isinstance(sched, WorkspaceMixin):
- for i, role in enumerate(app.roles):
- role_workspace = role.workspace
-
- if i == 0 and workspace:
- # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
- # later, torchx added support for the workspace attr in Role
- # for BC, give precedence to the workspace argument over the workspace attr for role[0]
- if role_workspace:
- logger.info(
- f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
- " To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically." # noqa: B950
- )
- role_workspace = workspace
-
- if role_workspace:
- old_img = role.image
+ if workspace:
+ # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+ # later, torchx added support for the workspace attr in Role
+ # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+ if app.roles[0].workspace:
  logger.info(
- f"Checking for changes in workspace `{role_workspace}` for role[{i}]={role.name}..."
- )
- # TODO kiuk@ once we deprecate the `workspace` argument in runner APIs we can simplify the signature of
- # build_workspace_and_update_role2() to just taking the role and resolved_cfg
- sched.build_workspace_and_update_role2(
- role, role_workspace, resolved_cfg
+ "Overriding role[%d] (%s) workspace to `%s`. "
+ "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+ 0,
+ app.roles[0].name,
+ str(app.roles[0].workspace),
  )
+ app.roles[0].workspace = (
+ Workspace.from_str(workspace)
+ if isinstance(workspace, str)
+ else workspace
+ )
 
- if old_img != role.image:
- logger.info(
- f"Built new image `{role.image}` based on original image `{old_img}`"
- f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
- )
- else:
- logger.info(
- f"Reusing original image `{old_img}` for role[{i}]={role.name}."
- " Either a patch was built or no changes to workspace was detected."
- )
+ sched.build_workspaces(app.roles, resolved_cfg)
 
  sched._validate(app, scheduler, resolved_cfg)
  dryrun_info = sched.submit_dryrun(app, resolved_cfg)
  dryrun_info._scheduler = scheduler
+
+ event = ctx._torchx_event
+ event.scheduler = scheduler
+ event.runcfg = json.dumps(cfg) if cfg else None
+ event.app_id = app.name
+ event.app_image = none_throws(dryrun_info._app).roles[0].image
+ event.app_metadata = app.metadata
+
  return dryrun_info
 
  def scheduler_run_opts(self, scheduler: str) -> runopts:
torchx/schedulers/api.py CHANGED
@@ -131,7 +131,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
  self,
  app: A,
  cfg: T,
- workspace: Optional[Union[Workspace, str]] = None,
+ workspace: str | Workspace | None = None,
  ) -> str:
  """
  Submits the application to be run by the scheduler.
@@ -145,7 +145,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
  resolved_cfg = self.run_opts().resolve(cfg)
  if workspace:
  assert isinstance(self, WorkspaceMixin)
- self.build_workspace_and_update_role2(app.roles[0], workspace, resolved_cfg)
+
+ if isinstance(workspace, str):
+ workspace = Workspace.from_str(workspace)
+
+ app.roles[0].workspace = workspace
+ self.build_workspaces(app.roles, resolved_cfg)
 
  # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
  dryrun_info = self.submit_dryrun(app, resolved_cfg)
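
The scheduler-side change mirrors the runner-side change above: a plain-string workspace is normalized into a ``Workspace``, attached to ``role[0]``, and then ``build_workspaces`` runs across all roles. A condensed, illustrative sketch of the new flow (the ``submit_sketch`` wrapper is hypothetical, not the actual source; ``Workspace``, ``build_workspaces``, ``submit_dryrun``, and ``schedule`` are the torchx APIs shown in the diff):

.. code:: python

    from torchx.specs import Workspace

    def submit_sketch(scheduler, app, cfg, workspace=None):
        # illustrative restatement of the submit path in this release
        resolved_cfg = scheduler.run_opts().resolve(cfg)
        if workspace:
            if isinstance(workspace, str):
                # str workspaces are parsed into the richer Workspace type
                workspace = Workspace.from_str(workspace)
            # the top-level workspace argument takes precedence for role[0]
            app.roles[0].workspace = workspace
            # builds images for every role that has a workspace (mutates roles)
            scheduler.build_workspaces(app.roles, resolved_cfg)
        dryrun_info = scheduler.submit_dryrun(app, resolved_cfg)
        return scheduler.schedule(dryrun_info)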
torchx/schedulers/kubernetes_scheduler.py CHANGED
@@ -27,10 +27,81 @@ Install Volcano:
  See the
  `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
  for more information.
+
+ Pod Overlay
+ ===========
+
+ You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+ the ``kubernetes`` metadata on your role. The value can be:
+
+ - A dict with the overlay structure
+ - A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+ Merge semantics:
+ - **dict**: recursive merge (upsert)
+ - **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+ - **primitives**: replace
+
+ .. code:: python
+
+ from torchx.specs import Role
+
+ # Dict overlay - lists append, tuples replace
+ role = Role(
+ name="trainer",
+ image="my-image:latest",
+ entrypoint="train.py",
+ metadata={
+ "kubernetes": {
+ "spec": {
+ "nodeSelector": {"gpu": "true"},
+ "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}], # appends
+ "volumes": ({"name": "my-volume", "emptyDir": {}},) # replaces
+ }
+ }
+ }
+ )
+
+ # File URI overlay
+ role = Role(
+ name="trainer",
+ image="my-image:latest",
+ entrypoint="train.py",
+ metadata={
+ "kubernetes": "file:///path/to/pod_overlay.yaml"
+ }
+ )
+
+ CLI usage with builtin components:
+
+ .. code:: bash
+
+ $ torchx run --scheduler kubernetes dist.ddp \\
+ --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+ --script train.py
+
+ Example ``pod_overlay.yaml``:
+
+ .. code:: yaml
+
+ spec:
+ nodeSelector:
+ node.kubernetes.io/instance-type: p4d.24xlarge
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ volumes: !!python/tuple
+ - name: my-volume
+ emptyDir: {}
+
+ The overlay is deep-merged with the generated pod, preserving existing fields
+ and adding or overriding specified ones.
  """
 
  import json
  import logging
+ import re
  import warnings
  from dataclasses import dataclass
  from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
  Tuple,
  TYPE_CHECKING,
  TypedDict,
+ Union,
  )
 
  import torchx
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
  RESERVED_MILLICPU = 100
  RESERVED_MEMMB = 1024
 
+
+ def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+ """Apply overlay dict to V1Pod object, merging nested fields.
+
+ Merge semantics:
+ - dict: upsert (recursive merge)
+ - list: append by default, replace if tuple
+ - primitives: replace
+ """
+ from kubernetes import client
+
+ api = client.ApiClient()
+ pod_dict = api.sanitize_for_serialization(pod)
+
+ def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+ for key, value in overlay.items():
+ if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+ deep_merge(base[key], value)
+ elif isinstance(value, tuple):
+ base[key] = list(value)
+ elif (
+ isinstance(value, list) and key in base and isinstance(base[key], list)
+ ):
+ base[key].extend(value)
+ else:
+ base[key] = value
+
+ deep_merge(pod_dict, overlay)
+
+ merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+ pod.spec = merged_pod.spec
+ pod.metadata = merged_pod.metadata
+
+
  RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
  RetryPolicy.REPLICA: [],
  RetryPolicy.APPLICATION: [
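
The merge rules are easiest to verify on plain dicts, independent of the kubernetes client. A self-contained sketch of the same ``deep_merge`` semantics (dict upsert, list append, tuple replace); the sample pod and overlay values are invented:

.. code:: python

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # same rules as the inner helper in _apply_pod_overlay above
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)  # dict: recursive upsert
            elif isinstance(value, tuple):
                base[key] = list(value)  # tuple: replace
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)  # list: append
            else:
                base[key] = value  # primitives (or new keys): set

    pod = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "v0"}]}}
    overlay = {
        "spec": {
            "nodeSelector": {"gpu": "true"},             # new key: inserted
            "tolerations": [{"key": "nvidia.com/gpu"}],  # list: appended
            "volumes": ({"name": "scratch"},),           # tuple: replaces
        }
    }
    deep_merge(pod, overlay)
    assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "nvidia.com/gpu"}]
    assert pod["spec"]["volumes"] == [{"name": "scratch"}]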
@@ -369,7 +475,7 @@ def app_to_resource(
  queue: str,
  service_account: Optional[str],
  priority_class: Optional[str] = None,
- ) -> Dict[str, object]:
+ ) -> Dict[str, Any]:
  """
  app_to_resource creates a volcano job kubernetes resource definition from
  the provided AppDef. The resource definition can be used to launch the
@@ -402,6 +508,17 @@ def app_to_resource(
  replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
  pod = role_to_pod(name, replica_role, service_account)
+ if k8s_metadata := role.metadata.get("kubernetes"):
+ if isinstance(k8s_metadata, str):
+ import fsspec
+
+ with fsspec.open(k8s_metadata, "r") as f:
+ k8s_metadata = yaml.unsafe_load(f)
+ elif not isinstance(k8s_metadata, dict):
+ raise ValueError(
+ f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+ )
+ _apply_pod_overlay(pod, k8s_metadata)
  pod.metadata.labels.update(
  pod_labels(
  app=app,
@@ -444,7 +561,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
  if priority_class is not None:
  job_spec["priorityClassName"] = priority_class
 
- resource: Dict[str, object] = {
+ resource: Dict[str, Any] = {
  "apiVersion": "batch.volcano.sh/v1alpha1",
  "kind": "Job",
  "metadata": {"name": f"{unique_app_id}"},
@@ -456,7 +573,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
  @dataclass
  class KubernetesJob:
  images_to_push: Dict[str, Tuple[str, str]]
- resource: Dict[str, object]
+ resource: Dict[str, Any]
 
  def __str__(self) -> str:
  return yaml.dump(sanitize_for_serialization(self.resource))
@@ -471,6 +588,7 @@ class KubernetesOpts(TypedDict, total=False):
  image_repo: Optional[str]
  service_account: Optional[str]
  priority_class: Optional[str]
+ validate_spec: Optional[bool]
 
 
  class KubernetesScheduler(
@@ -636,7 +754,7 @@ class KubernetesScheduler(
  else:
  raise
 
- return f'{namespace}:{resp["metadata"]["name"]}'
+ return f"{namespace}:{resp['metadata']['name']}"
 
  def _submit_dryrun(
  self, app: AppDef, cfg: KubernetesOpts
@@ -659,6 +777,36 @@ class KubernetesScheduler(
  ), "priority_class must be a str"
 
  resource = app_to_resource(app, queue, service_account, priority_class)
+
+ if cfg.get("validate_spec"):
+ try:
+ self._custom_objects_api().create_namespaced_custom_object(
+ group="batch.volcano.sh",
+ version="v1alpha1",
+ namespace=cfg.get("namespace") or "default",
+ plural="jobs",
+ body=resource,
+ dry_run="All",
+ )
+ except Exception as e:
+ from kubernetes.client.rest import ApiException
+
+ if isinstance(e, ApiException):
+ raise ValueError(f"Invalid job spec: {e.reason}") from e
+ raise
+
+ job_name = resource["metadata"]["name"]
+ for task in resource["spec"]["tasks"]:
+ task_name = task["name"]
+ replicas = task.get("replicas", 1)
+ max_index = replicas - 1
+ pod_name = f"{job_name}-{task_name}-{max_index}"
+ if len(pod_name) > 63:
+ raise ValueError(
+ f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+ f"Shorten app.name or role names"
+ )
+
  req = KubernetesJob(
  resource=resource,
  images_to_push=images_to_push,
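
The 63-character cap corresponds to the Kubernetes DNS label limit (pod hostnames must fit in 63 characters), and since volcano derives pod names as ``{job}-{task}-{index}``, checking the highest replica index suffices. A hypothetical standalone version of the same check (function name and sample values are invented):

.. code:: python

    def check_pod_name_lengths(job_name: str, tasks: list) -> None:
        # mirrors the validation above; tasks are dicts with "name" and "replicas"
        for task in tasks:
            max_index = task.get("replicas", 1) - 1  # highest index -> longest name
            pod_name = f"{job_name}-{task['name']}-{max_index}"
            if len(pod_name) > 63:
                raise ValueError(
                    f"Pod name '{pod_name}' ({len(pod_name)} chars) "
                    "exceeds the 63 character limit"
                )

    check_pod_name_lengths("trainer-app-abc123", [{"name": "worker", "replicas": 32}])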
@@ -703,19 +851,32 @@ class KubernetesScheduler(
  type_=str,
  help="The name of the PriorityClass to set on the job specs",
  )
+ opts.add(
+ "validate_spec",
+ type_=bool,
+ help="Validate job spec using Kubernetes API dry-run before submission",
+ default=True,
+ )
  return opts
 
  def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+ from kubernetes.client.rest import ApiException
+
  namespace, name = app_id.split(":")
  roles = {}
  roles_statuses = {}
- resp = self._custom_objects_api().get_namespaced_custom_object_status(
- group="batch.volcano.sh",
- version="v1alpha1",
- namespace=namespace,
- plural="jobs",
- name=name,
- )
+ try:
+ resp = self._custom_objects_api().get_namespaced_custom_object_status(
+ group="batch.volcano.sh",
+ version="v1alpha1",
+ namespace=namespace,
+ plural="jobs",
+ name=name,
+ )
+ except ApiException as e:
+ if e.status == 404:
+ return None
+ raise
  status = resp.get("status")
  if status:
  state_str = status["state"]["phase"]
@@ -824,13 +985,34 @@ def create_scheduler(
  def pod_labels(
  app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
  ) -> Dict[str, str]:
+
+ def clean(label_value: str) -> str:
+ # cleans the provided `label_value` to make it compliant
+ # to pod label specs as described in
+ # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+ #
+ # Valid label value:
+ # must be 63 characters or less (can be empty),
+ # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+ # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+ # Replace invalid characters (allow: alphanum, -, _, .) with "."
+ label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+ # Replace leading non-alphanumeric with "."
+ label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+ # Replace trailing non-alphanumeric with "."
+ label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+ # Trim to 63 characters
+ return label_value[:63]
+
  return {
- LABEL_VERSION: torchx.__version__,
- LABEL_APP_NAME: app.name,
+ LABEL_VERSION: clean(torchx.__version__),
+ LABEL_APP_NAME: clean(app.name),
  LABEL_ROLE_INDEX: str(role_idx),
- LABEL_ROLE_NAME: role.name,
+ LABEL_ROLE_NAME: clean(role.name),
  LABEL_REPLICA_ID: str(replica_id),
- LABEL_KUBE_APP_NAME: app.name,
+ LABEL_KUBE_APP_NAME: clean(app.name),
  LABEL_ORGANIZATION: "torchx.pytorch.org",
- LABEL_UNIQUE_NAME: app_id,
+ LABEL_UNIQUE_NAME: clean(app_id),
  }
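
The sanitizer is duplicated below so it can be tried standalone; the regexes are the same as in the ``clean`` helper above, and the sample inputs are invented:

.. code:: python

    import re

    def clean(label_value: str) -> str:
        # keep only [A-Za-z0-9-_.], mapping everything else to ".",
        # then collapse leading/trailing non-alphanumeric runs and cap at 63 chars
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
        return label_value[:63]

    assert clean("my app/v2") == "my.app.v2"
    assert clean("2025.11.17+cu121") == "2025.11.17.cu121"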
torchx/specs/__init__.py CHANGED
@@ -14,7 +14,7 @@ scheduler or pipeline adapter.
  import difflib
 
  import os
- from typing import Callable, Dict, Mapping, Optional
+ from typing import Callable, Dict, Iterator, Mapping, Optional
 
  from torchx.specs.api import (
  ALL,
@@ -113,8 +113,22 @@ class _NamedResourcesLibrary:
  def __contains__(self, key: str) -> bool:
  return key in _named_resource_factories
 
- def __iter__(self) -> None:
- raise NotImplementedError("named resources doesn't support iterating")
+ def __iter__(self) -> Iterator[str]:
+ """Iterates through the names of the registered named_resources.
+
+ Usage:
+
+ .. doctest::
+
+ from torchx import specs
+
+ for resource_name in specs.named_resources:
+ resource = specs.resource(h=resource_name)
+ assert isinstance(resource, specs.Resource)
+
+ """
+ for key in _named_resource_factories:
+ yield key
 
 
  named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
torchx/specs/api.py CHANGED
@@ -14,10 +14,12 @@ import logging as logger
  import os
  import pathlib
  import re
+ import shutil
  import typing
+ import warnings
  from dataclasses import asdict, dataclass, field
  from datetime import datetime
- from enum import Enum
+ from enum import Enum, IntEnum
  from json import JSONDecodeError
  from string import Template
  from typing import (
@@ -380,6 +382,16 @@ class Workspace:
  """False if no projects mapping. Lets us use workspace object in an if-statement"""
  return bool(self.projects)
 
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, Workspace):
+ return False
+ return self.projects == other.projects
+
+ def __hash__(self) -> int:
+ # makes it possible to use Workspace as the key in the workspace build cache
+ # see WorkspaceMixin.caching_build_workspace_and_update_role
+ return hash(frozenset(self.projects.items()))
+
  def is_unmapped_single_project(self) -> bool:
  """
  Returns ``True`` if this workspace only has 1 project
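
``__eq__`` and ``__hash__`` make ``Workspace`` usable as a dict key, which is what the build cache in ``WorkspaceMixin.caching_build_workspace_and_update_role`` relies on. A small illustration (the paths and image digest are made up):

.. code:: python

    from torchx.specs import Workspace

    ws1 = Workspace(projects={"~/workspace/torch": "torch"})
    ws2 = Workspace(projects={"~/workspace/torch": "torch"})

    build_cache = {ws1: "sha256:abc123"}  # hypothetical built-image digest
    assert ws1 == ws2
    assert build_cache[ws2] == "sha256:abc123"  # equal projects hit the same entry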
@@ -387,6 +399,39 @@ class Workspace:
  """
  return len(self.projects) == 1 and not next(iter(self.projects.values()))
 
+ def merge_into(self, outdir: str | pathlib.Path) -> None:
+ """
+ Copies each project dir of this workspace into the specified ``outdir``.
+ Each project dir is copied into ``{outdir}/{target}`` where ``target`` is
+ the target mapping of the project dir.
+
+ For example:
+
+ .. code-block:: python
+
+ from os.path import expanduser
+
+ workspace = Workspace(
+ projects={
+ expanduser("~/workspace/torch"): "torch",
+ expanduser("~/workspace/my_project"): "",
+ }
+ )
+ workspace.merge_into(expanduser("~/tmp"))
+
+ Copies:
+
+ * ``~/workspace/torch/**`` into ``~/tmp/torch/**``
+ * ``~/workspace/my_project/**`` into ``~/tmp/**``
+
+ """
+
+ for src, dst in self.projects.items():
+ dst_path = pathlib.Path(outdir) / dst
+ if pathlib.Path(src).is_file():
+ shutil.copy2(src, dst_path)
+ else: # src is dir
+ shutil.copytree(src, dst_path, dirs_exist_ok=True)
+
  @staticmethod
  def from_str(workspace: str | None) -> "Workspace":
  import yaml
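
A runnable sketch of ``merge_into`` using throwaway directories (assumes only the ``Workspace(projects=...)`` constructor shown in the docstring above):

.. code:: python

    import pathlib
    import tempfile

    from torchx.specs import Workspace

    with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as out:
        (pathlib.Path(src) / "main.py").write_text("print('hello')\n")
        ws = Workspace(projects={src: "app"})  # map src dir -> {outdir}/app
        ws.merge_into(out)
        assert (pathlib.Path(out) / "app" / "main.py").exists()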
@@ -891,14 +936,12 @@ class runopt:
  Represents the metadata about the specific run option
  """
 
- class alias(str):
- pass
-
  default: CfgVal
  opt_type: Type[CfgVal]
  is_required: bool
  help: str
- aliases: list[alias] | None = None
+ aliases: list[str] | None = None
+ deprecated_aliases: list[str] | None = None
 
  @property
  def is_type_list_of_str(self) -> bool:
@@ -990,7 +1033,7 @@ class runopts:
 
  def __init__(self) -> None:
  self._opts: Dict[str, runopt] = {}
- self._alias_to_key: dict[runopt.alias, str] = {}
+ self._alias_to_key: dict[str, str] = {}
 
  def __iter__(self) -> Iterator[Tuple[str, runopt]]:
  return self._opts.items().__iter__()
@@ -1044,12 +1087,24 @@ class runopts:
  val = resolved_cfg.get(cfg_key)
  resolved_name = None
  aliases = runopt.aliases or []
+ deprecated_aliases = runopt.deprecated_aliases or []
  if val is None:
  for alias in aliases:
  val = resolved_cfg.get(alias)
  if alias in cfg or val is not None:
  resolved_name = alias
  break
+ for alias in deprecated_aliases:
+ val = resolved_cfg.get(alias)
+ if val is not None:
+ resolved_name = alias
+ use_instead = self._alias_to_key.get(alias)
+ warnings.warn(
+ f"Run option `{alias}` is deprecated, use `{use_instead}` instead",
+ UserWarning,
+ stacklevel=2,
+ )
+ break
  else:
  resolved_name = cfg_key
  for alias in aliases:
@@ -1172,49 +1227,23 @@ class runopts:
  cfg[key] = val
  return cfg
 
- def _get_primary_key_and_aliases(
- self,
- cfg_key: list[str] | str,
- ) -> tuple[str, list[runopt.alias]]:
- """
- Returns the primary key and aliases for the given cfg_key.
- """
- if isinstance(cfg_key, str):
- return cfg_key, []
-
- if len(cfg_key) == 0:
- raise ValueError("cfg_key must be a non-empty list")
- primary_key = None
- aliases = list[runopt.alias]()
- for name in cfg_key:
- if isinstance(name, runopt.alias):
- aliases.append(name)
- else:
- if primary_key is not None:
- raise ValueError(
- f" Given more than one primary key: {primary_key}, {name}. Please use runopt.alias type for aliases. "
- )
- primary_key = name
- if primary_key is None or primary_key == "":
- raise ValueError(
- "Missing cfg_key. Please provide one other than the aliases."
- )
- return primary_key, aliases
-
  def add(
  self,
- cfg_key: str | list[str],
+ cfg_key: str,
  type_: Type[CfgVal],
  help: str,
  default: CfgVal = None,
  required: bool = False,
+ aliases: Optional[list[str]] = None,
+ deprecated_aliases: Optional[list[str]] = None,
  ) -> None:
  """
  Adds the ``config`` option with the given help string and ``default``
  value (if any). If the ``default`` is not specified then this option
  is a required option.
  """
- primary_key, aliases = self._get_primary_key_and_aliases(cfg_key)
+ aliases = aliases or []
+ deprecated_aliases = deprecated_aliases or []
  if required and default is not None:
  raise ValueError(
  f"Required option: {cfg_key} must not specify default value. Given: {default}"
@@ -1225,10 +1254,20 @@ class runopts:
  f"Option: {cfg_key}, must be of type: {type_}."
  f" Given: {default} ({type(default).__name__})"
  )
- opt = runopt(default, type_, required, help, aliases)
+
+ opt = runopt(
+ default,
+ type_,
+ required,
+ help,
+ list(set(aliases)),
+ list(set(deprecated_aliases)),
+ )
  for alias in aliases:
- self._alias_to_key[alias] = primary_key
- self._opts[primary_key] = opt
+ self._alias_to_key[alias] = cfg_key
+ for deprecated_alias in deprecated_aliases:
+ self._alias_to_key[deprecated_alias] = cfg_key
+ self._opts[cfg_key] = opt
 
  def update(self, other: "runopts") -> None:
  self._opts.update(other._opts)
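
With the new keyword-only parameters, aliases and deprecated aliases are declared explicitly instead of via the removed ``runopt.alias`` marker class. A sketch of the intended usage (the option names here are invented, and the behavior assumes ``resolve()`` maps deprecated alias values onto the primary key as the resolve logic above suggests):

.. code:: python

    import warnings

    from torchx.specs.api import runopts

    opts = runopts()
    opts.add(
        "image_repo",
        type_=str,
        help="repository to push workspace images to",
        deprecated_aliases=["repo"],  # old spelling still resolves, with a warning
    )

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        cfg = opts.resolve({"repo": "example.com/torchx"})

    assert cfg["image_repo"] == "example.com/torchx"
    assert any("deprecated" in str(w.message) for w in caught)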
torchx/version.py CHANGED
@@ -1,4 +1,3 @@
- #!/usr/bin/env python3
  # Copyright (c) Meta Platforms, Inc. and affiliates.
  # All rights reserved.
  #
@@ -7,6 +6,7 @@
 
  # pyre-strict
 
+ from torchx._version import BASE_VERSION
  from torchx.util.entrypoints import load
 
  # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
  # 0.1.0bN # Beta release
  # 0.1.0rcN # Release Candidate
  # 0.1.0 # Final release
- __version__ = "0.8.0dev0"
+ __version__: str = BASE_VERSION
 
 
  # Use the github container registry images corresponding to the current package
torchx/workspace/api.py CHANGED
@@ -8,26 +8,17 @@
 
  import abc
  import fnmatch
+ import logging
  import posixpath
- import shutil
  import tempfile
  import warnings
  from dataclasses import dataclass
- from pathlib import Path
- from typing import (
- Any,
- Dict,
- Generic,
- Iterable,
- Mapping,
- Tuple,
- TYPE_CHECKING,
- TypeVar,
- Union,
- )
+ from typing import Any, Dict, Generic, Iterable, Mapping, Tuple, TYPE_CHECKING, TypeVar
 
  from torchx.specs import AppDef, CfgVal, Role, runopts, Workspace
 
+ logger: logging.Logger = logging.getLogger(__name__)
+
  if TYPE_CHECKING:
  from fsspec import AbstractFileSystem
 
@@ -113,45 +104,72 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
  """
  return runopts()
 
- def build_workspace_and_update_role2(
+ def build_workspaces(self, roles: list[Role], cfg: Mapping[str, CfgVal]) -> None:
+ """
+ NOTE: this method MUTATES the passed roles!
+
+ Builds the workspaces (if any) for each role and updates the role to reflect the built workspace.
+ Typically ``role.image`` is updated with the newly built image that reflects the local workspace.
+ Some workspace implementations may add extra environment variables to make it easier for other
+ parts of the program to access the workspace. For example a ``WORKSPACE_DIR`` env var may be added
+ to ``role.env`` that scripts can use to refer to the workspace directory in the container.
+ """
+
+ build_cache: dict[object, object] = {}
+
+ for i, role in enumerate(roles):
+ if role.workspace:
+ old_img = role.image
+ self.caching_build_workspace_and_update_role(role, cfg, build_cache)
+
+ if old_img != role.image:
+ logger.info(
+ "role[%d]=%s updated with new image to include workspace changes",
+ i,
+ role.name,
+ )
+
+ def caching_build_workspace_and_update_role(
  self,
  role: Role,
- workspace: Union[Workspace, str],
  cfg: Mapping[str, CfgVal],
+ build_cache: dict[object, object],
  ) -> None:
  """
- Same as :py:meth:`build_workspace_and_update_role` but operates
- on :py:class:`Workspace` (supports multi-project workspaces)
- as well as ``str`` (for backwards compatibility).
+ Same as :py:meth:`build_workspace_and_update_role` but takes
+ a ``build_cache`` that can be used to cache pointers to build artifacts
+ between building the workspaces for each role.
 
- If ``workspace`` is a ``str`` this method simply calls
+ This is useful when an appdef has multiple roles where the image and workspace
+ of the roles are the same but other attributes such as entrypoint or args are different.
+
+ NOTE: ``build_cache``'s lifetime is within :py:meth:`build_workspaces`
+ NOTE: the workspace implementation decides what to cache
+
+ Workspace subclasses should prefer implementing this method over
  :py:meth:`build_workspace_and_update_role`.
 
- If ``workspace`` is :py:class:`Workspace` then the default
- impl copies all the projects into a tmp directory and passes the tmp dir to
- :py:meth:`build_workspace_and_update_role`
+ The default implementation of this method simply calls the (deprecated) non-caching
+ :py:meth:`build_workspace_and_update_role` and deals with multi-dir workspaces by
+ merging them into a single tmpdir before passing it down.
 
- Subclasses can override this method to customize multi-project
- workspace building logic.
  """
- if isinstance(workspace, Workspace):
- if not workspace.is_unmapped_single_project():
- with tempfile.TemporaryDirectory(suffix="torchx_workspace_") as outdir:
- for src, dst in workspace.projects.items():
- dst_path = Path(outdir) / dst
- if Path(src).is_file():
- shutil.copy2(src, dst_path)
- else: # src is dir
- shutil.copytree(src, dst_path, dirs_exist_ok=True)
-
- self.build_workspace_and_update_role(role, outdir, cfg)
- return
- else: # single project workspace with no target mapping (treat like a str workspace)
- workspace = str(workspace)
-
- self.build_workspace_and_update_role(role, workspace, cfg)
 
- @abc.abstractmethod
+ workspace = role.workspace
+
+ if not workspace:
+ return
+
+ if workspace.is_unmapped_single_project():
+ # single-dir workspace with no target map; no need to copy to a tmp dir
+ self.build_workspace_and_update_role(role, str(workspace), cfg)
+ else:
+ # multi-dirs or single-dir with a target map;
+ # copy all dirs to a tmp dir and treat the tmp dir as a single-dir workspace
+ with tempfile.TemporaryDirectory(suffix="torchx_workspace_") as outdir:
+ workspace.merge_into(outdir)
+ self.build_workspace_and_update_role(role, outdir, cfg)
+
 
  def build_workspace_and_update_role(
  self,
@@ -159,6 +177,9 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
  cfg: Mapping[str, CfgVal],
  ) -> None:
  """
+ .. note:: DEPRECATED: Workspace subclasses should implement
+ :py:meth:`caching_build_workspace_and_update_role` over this method.
+
  Builds the specified ``workspace`` with respect to ``img``
  and updates the ``role`` to reflect the built workspace artifacts.
  In the simplest case, this method builds a new image and updates
@@ -167,7 +188,7 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
 
  Note: this method mutates the passed ``role``.
  """
- ...
+ raise NotImplementedError("implement `caching_build_workspace_and_update_role`")
 
  def dryrun_push_images(self, app: AppDef, cfg: Mapping[str, CfgVal]) -> T:
  """
torchx_nightly-2025.11.17.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: torchx-nightly
- Version: 2025.10.16
+ Version: 2025.11.17
  Summary: TorchX SDK and Components
  Home-page: https://github.com/meta-pytorch/torchx
  Author: TorchX Devs
@@ -23,8 +23,10 @@ Requires-Dist: docker
  Requires-Dist: filelock
  Requires-Dist: fsspec>=2023.10.0
  Requires-Dist: tabulate
- Provides-Extra: aws_batch
+ Provides-Extra: aws-batch
  Requires-Dist: boto3; extra == "aws-batch"
+ Provides-Extra: kubernetes
+ Requires-Dist: kubernetes>=11; extra == "kubernetes"
  Provides-Extra: dev
  Requires-Dist: aiobotocore==2.20.0; extra == "dev"
  Requires-Dist: ax-platform[mysql]==0.2.3; extra == "dev"
@@ -47,18 +49,29 @@ Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
  Requires-Dist: tensorboard==2.14.0; extra == "dev"
  Requires-Dist: sagemaker==2.230.0; extra == "dev"
  Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
- Requires-Dist: torch>=2.7.0; extra == "dev"
+ Requires-Dist: torch; extra == "dev"
  Requires-Dist: torchmetrics==1.6.3; extra == "dev"
  Requires-Dist: torchserve>=0.10.0; extra == "dev"
- Requires-Dist: torchtext==0.18.0; extra == "dev"
- Requires-Dist: torchvision==0.23.0; extra == "dev"
+ Requires-Dist: torchtext; extra == "dev"
+ Requires-Dist: torchvision; extra == "dev"
  Requires-Dist: typing-extensions; extra == "dev"
  Requires-Dist: ts==0.5.1; extra == "dev"
  Requires-Dist: wheel; extra == "dev"
  Requires-Dist: lintrunner; extra == "dev"
  Requires-Dist: lintrunner-adapters; extra == "dev"
- Provides-Extra: kubernetes
- Requires-Dist: kubernetes>=11; extra == "kubernetes"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
 
  [![PyPI](https://img.shields.io/pypi/v/torchx)](https://pypi.org/project/torchx/)
  [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://github.com/meta-pytorch/torchx/blob/main/LICENSE)
torchx_nightly-2025.11.17.dist-info/RECORD CHANGED
@@ -1,6 +1,7 @@
  torchx/__init__.py,sha256=QFDTdJacncWYWHL-2QyWdY5MUck3jVfSPRRGdvedcKc,355
+ torchx/_version.py,sha256=TzDuXIviDldFbXAhGe33redQcoP33jIsVR_hMyqSgdc,250
  torchx/notebook.py,sha256=Rc6XUMzSq7NXtsYdtVluE6T89LpEhcba-3ANxuaLCCU,1008
- torchx/version.py,sha256=d28ccaZP21nlF8jEmSLjJiidyquMJo02tDpeVD36inc,951
+ torchx/version.py,sha256=YcE66UkBxYHMQMtjVts4jF3l6Qeaj1gK_LzxU77l8Bo,975
  torchx/apps/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
  torchx/apps/serve/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
  torchx/apps/serve/serve.py,sha256=u_h8agld1TwIPq5GRosHL3uxhkljNfS65McLB77O0OE,4386
@@ -48,7 +49,7 @@ torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIo
  torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
  torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
  torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
- torchx/runner/api.py,sha256=jxtgOl7nNOqpzG-sjUJngXhIOachqaVfKu9rF8YqHWI,31271
+ torchx/runner/api.py,sha256=xQpgiUz9jCX4zZriubbWk4tTJRe7MxNJQK64g0o7KQ8,30438
  torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
  torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
  torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -57,20 +58,20 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
  torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
  torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
  torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
- torchx/schedulers/api.py,sha256=5Amli1httEl82XebAqd8vl3dM8zMKwYfRgfd0mEq3is,14538
+ torchx/schedulers/api.py,sha256=smoUv1ocfqsBRmesXbz9i1F86zBOixZ8QHxYmI_MzgQ,14649
  torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
  torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
  torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
  torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
  torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
  torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
- torchx/schedulers/kubernetes_scheduler.py,sha256=Wb6XDzwcvp3-NqBhKrjtgDC4L6GVOmcyP6fuoPFByBE,28288
+ torchx/schedulers/kubernetes_scheduler.py,sha256=86ny9XXt9tdeV6Y7AlVFQ6vhxlviOdNeZUz4gOzU3cc,34478
  torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
  torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
  torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
  torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
- torchx/specs/__init__.py,sha256=SXS4r_roOkbbAL-p7EY5fl5ou-AG7S9Ck-zKtRBdHOk,6760
- torchx/specs/api.py,sha256=ICKsTWxEats9IwWXUm-D1NJy4jyONMV2zdrWfUrpKNg,47827
+ torchx/specs/__init__.py,sha256=TaC0AveTebkCMo5hmdY1wGpo09vFDqzWnsT166ionTw,7108
+ torchx/specs/api.py,sha256=OrLX4gGa97qtjUbl3x_YnOKCdP0rQkVEruPIbNjo7fk,49230
  torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
  torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
  torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
@@ -99,12 +100,12 @@ torchx/util/shlex.py,sha256=eXEKu8KC3zIcd8tEy9_s8Ds5oma8BORr-0VGWNpG2dk,463
  torchx/util/strings.py,sha256=7Ef1loz2IYMrzeJ6Lewywi5cBIc3X3g7lSPbT1Tn_z4,664
  torchx/util/types.py,sha256=E9dxAWQnsJkIDuHtg-poeOJ4etucSI_xP_Z5kNJX8uI,9229
  torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,798
- torchx/workspace/api.py,sha256=h2SaC-pYPBLuo3XtkXJ0APMoro-C-ry7KucI7r3EUf4,8753
+ torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
  torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
- torchx_nightly-2025.10.16.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
- torchx_nightly-2025.10.16.dist-info/METADATA,sha256=LdONpXnVGtW8end6ZL0EIZ1W4TwP6sJx1TypIYVg8z8,5069
- torchx_nightly-2025.10.16.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- torchx_nightly-2025.10.16.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
- torchx_nightly-2025.10.16.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
- torchx_nightly-2025.10.16.dist-info/RECORD,,
+ torchx_nightly-2025.11.17.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+ torchx_nightly-2025.11.17.dist-info/METADATA,sha256=iim6P-wiEztRPHgcWaQCa9_f0GsU-GyxHBILL2cyVJg,5324
+ torchx_nightly-2025.11.17.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+ torchx_nightly-2025.11.17.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+ torchx_nightly-2025.11.17.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+ torchx_nightly-2025.11.17.dist-info/RECORD,,
torchx_nightly-2025.11.17.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.45.1)
+ Generator: setuptools (79.0.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 