torchx-nightly 2025.9.28__py3-none-any.whl → 2025.11.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

torchx/specs/api.py CHANGED
@@ -11,11 +11,15 @@ import copy
11
11
  import inspect
12
12
  import json
13
13
  import logging as logger
14
+ import os
15
+ import pathlib
14
16
  import re
17
+ import shutil
15
18
  import typing
19
+ import warnings
16
20
  from dataclasses import asdict, dataclass, field
17
21
  from datetime import datetime
18
- from enum import Enum
22
+ from enum import Enum, IntEnum
19
23
  from json import JSONDecodeError
20
24
  from string import Template
21
25
  from typing import (
@@ -66,6 +70,32 @@ YELLOW_BOLD = "\033[1;33m"
66
70
  RESET = "\033[0m"
67
71
 
68
72
 
73
+ def TORCHX_HOME(*subdir_paths: str) -> pathlib.Path:
74
+ """
75
+ Path to the "dot-directory" for torchx.
76
+ Defaults to `~/.torchx` and is overridable via the `TORCHX_HOME` environment variable.
77
+
78
+ Usage:
79
+
80
+ .. doc-test::
81
+
82
+ from pathlib import Path
83
+ from torchx.specs import TORCHX_HOME
84
+
85
+ assert TORCHX_HOME() == Path.home() / ".torchx"
86
+ assert TORCHX_HOME("conda-pack-out") == Path.home() / ".torchx" / "conda-pack-out"
87
+
88
+ """
89
+
90
+ default_dir = str(pathlib.Path.home() / ".torchx")
91
+ torchx_home = pathlib.Path(os.getenv("TORCHX_HOME", default_dir))
92
+
93
+ torchx_home = torchx_home / os.path.sep.join(subdir_paths)
94
+ torchx_home.mkdir(parents=True, exist_ok=True)
95
+
96
+ return torchx_home
97
+
98
+
69
99
  # ========================================
70
100
  # ==== Distributed AppDef API =======
71
101
  # ========================================
@@ -322,6 +352,121 @@ class DeviceMount:
322
352
  permissions: str = "rwm"
323
353
 
324
354
 
355
+ @dataclass
356
+ class Workspace:
357
+ """
358
+ Specifies a local "workspace" (a set of directories). Workspaces are ad-hoc built
359
+ into a (usually ephemeral) image. This effectively mirrors the local code changes
360
+ at job submission time.
361
+
362
+ For example:
363
+
364
+ 1. ``projects={"~/github/torch": "torch"}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/torch/**``
365
+ 2. ``projects={"~/github/torch": ""}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/**``
366
+
367
+ The exact location of ``$REMOTE_WORKSPACE_ROOT`` is implementation dependent and varies between
368
+ different implementations of :py:class:`~torchx.workspace.api.WorkspaceMixin`.
369
+ Check the scheduler documentation for details on which workspace it supports.
370
+
371
+ Note: ``projects`` maps the location of the local project to a sub-directory in the remote workspace root directory.
372
+ Typically the local project location is a directory path (e.g. ``/home/foo/github/torch``).
373
+
374
+
375
+ Attributes:
376
+ projects: mapping of local project to the sub-dir in the remote workspace dir.
377
+ """
378
+
379
+ projects: dict[str, str]
380
+
381
+ def __bool__(self) -> bool:
382
+ """False if no projects mapping. Lets us use workspace object in an if-statement"""
383
+ return bool(self.projects)
384
+
385
+ def __eq__(self, other: object) -> bool:
386
+ if not isinstance(other, Workspace):
387
+ return False
388
+ return self.projects == other.projects
389
+
390
+ def __hash__(self) -> int:
391
+ # makes it possible to use Workspace as the key in the workspace build cache
392
+ # see WorkspaceMixin.caching_build_workspace_and_update_role
393
+ return hash(frozenset(self.projects.items()))
394
+
395
+ def is_unmapped_single_project(self) -> bool:
396
+ """
397
+ Returns ``True`` if this workspace only has 1 project
398
+ and its target mapping is an empty string.
399
+ """
400
+ return len(self.projects) == 1 and not next(iter(self.projects.values()))
401
+
402
+ def merge_into(self, outdir: str | pathlib.Path) -> None:
403
+ """
404
+ Copies each project dir of this workspace into the specified ``outdir``.
405
+ Each project dir is copied into ``{outdir}/{target}`` where ``target`` is
406
+ the target mapping of the project dir.
407
+
408
+ For example:
409
+
410
+ .. code-block:: python
411
+ from os.path import expanduser
412
+
413
+ workspace = Workspace(
414
+ projects={
415
+ expanduser("~/workspace/torch"): "torch",
416
+ expanduser("~/workspace/my_project"): ""
417
+ }
418
+ )
419
+ workspace.merge_into(expanduser("~/tmp"))
420
+
421
+ Copies:
422
+
423
+ * ``~/workspace/torch/**`` into ``~/tmp/torch/**``
424
+ * ``~/workspace/my_project/**`` into ``~/tmp/**``
425
+
426
+ """
427
+
428
+ for src, dst in self.projects.items():
429
+ dst_path = pathlib.Path(outdir) / dst
430
+ if pathlib.Path(src).is_file():
431
+ shutil.copy2(src, dst_path)
432
+ else: # src is dir
433
+ shutil.copytree(src, dst_path, dirs_exist_ok=True)
434
+
435
+ @staticmethod
436
+ def from_str(workspace: str | None) -> "Workspace":
437
+ import yaml
438
+
439
+ if not workspace:
440
+ return Workspace({})
441
+
442
+ projects = yaml.safe_load(workspace)
443
+ if isinstance(projects, str): # single project workspace
444
+ projects = {projects: ""}
445
+ else: # multi-project workspace
446
+ # Replace None mappings with "" (empty string)
447
+ projects = {k: ("" if v is None else v) for k, v in projects.items()}
448
+
449
+ return Workspace(projects)
450
+
451
+ def __str__(self) -> str:
452
+ """
453
+ Returns a string representation of the Workspace by concatenating
454
+ the project mappings using ';' as a delimiter and ':' between key and value.
455
+ If this is a single-project workspace with no target mapping, then simply
456
+ returns the src (local project dir)
457
+
458
+ NOTE: meant to be used for logging purposes not serde.
459
+ Therefore not symmetric with :py:func:`Workspace.from_str`.
460
+
461
+ """
462
+ if self.is_unmapped_single_project():
463
+ return next(iter(self.projects))
464
+ else:
465
+ return ";".join(
466
+ k if not v else f"{k}:{v}" for k, v in self.projects.items()
467
+ )
468
+
469
+
325
470
  @dataclass
326
471
  class Role:
327
472
  """
@@ -374,12 +519,15 @@ class Role:
374
519
  metadata: Free form information that is associated with the role, for example
375
520
  scheduler specific data. The key should follow the pattern: ``$scheduler.$key``
376
521
  mounts: a list of mounts on the machine
522
+ workspace: local project directories to be mirrored on the remote job.
523
+ NOTE: The workspace argument provided to the :py:class:`~torchx.runner.api.Runner` APIs
524
+ only takes effect on ``appdef.role[0]`` and overrides this attribute.
525
+
377
526
  """
378
527
 
379
528
  name: str
380
529
  image: str
381
530
  min_replicas: Optional[int] = None
382
- base_image: Optional[str] = None # DEPRECATED DO NOT SET, WILL BE REMOVED SOON
383
531
  entrypoint: str = MISSING
384
532
  args: List[str] = field(default_factory=list)
385
533
  env: Dict[str, str] = field(default_factory=dict)
@@ -389,9 +537,10 @@ class Role:
389
537
  resource: Resource = field(default_factory=_null_resource)
390
538
  port_map: Dict[str, int] = field(default_factory=dict)
391
539
  metadata: Dict[str, Any] = field(default_factory=dict)
392
- mounts: List[Union[BindMount, VolumeMount, DeviceMount]] = field(
393
- default_factory=list
394
- )
540
+ mounts: List[BindMount | VolumeMount | DeviceMount] = field(default_factory=list)
541
+ workspace: Workspace | None = None
542
+
543
+ # DEPRECATED DO NOT SET, WILL BE REMOVED SOON
395
544
  overrides: Dict[str, Any] = field(default_factory=dict)
396
545
 
397
546
  # pyre-ignore
@@ -791,6 +940,8 @@ class runopt:
791
940
  opt_type: Type[CfgVal]
792
941
  is_required: bool
793
942
  help: str
943
+ aliases: list[str] | None = None
944
+ deprecated_aliases: list[str] | None = None
794
945
 
795
946
  @property
796
947
  def is_type_list_of_str(self) -> bool:
@@ -826,7 +977,7 @@ class runopt:
826
977
 
827
978
  NOTE: dict parsing uses ":" as the kv separator (rather than the standard "=") because "=" is used
828
979
  at the top-level cfg to parse runopts (notice the plural) from the CLI. Originally torchx only supported
829
- primitives and list[str] as CfgVal but dict[str,str] was added in https://github.com/pytorch/torchx/pull/855
980
+ primitives and list[str] as CfgVal but dict[str,str] was added in https://github.com/meta-pytorch/torchx/pull/855
830
981
  """
831
982
 
832
983
  if self.opt_type is None:
@@ -882,6 +1033,7 @@ class runopts:
882
1033
 
883
1034
  def __init__(self) -> None:
884
1035
  self._opts: Dict[str, runopt] = {}
1036
+ self._alias_to_key: dict[str, str] = {}
885
1037
 
886
1038
  def __iter__(self) -> Iterator[Tuple[str, runopt]]:
887
1039
  return self._opts.items().__iter__()
@@ -909,9 +1061,16 @@ class runopts:
909
1061
 
910
1062
  def get(self, name: str) -> Optional[runopt]:
911
1063
  """
912
- Returns option if any was registered, or None otherwise
1064
+ Returns option if any was registered, or None otherwise.
1065
+ First searches for the option by ``name``, then falls-back to matching ``name`` with any
1066
+ registered aliases.
1067
+
913
1068
  """
914
- return self._opts.get(name, None)
1069
+ if name in self._opts:
1070
+ return self._opts[name]
1071
+ if name in self._alias_to_key:
1072
+ return self._opts[self._alias_to_key[name]]
1073
+ return None
915
1074
 
916
1075
  def resolve(self, cfg: Mapping[str, CfgVal]) -> Dict[str, CfgVal]:
917
1076
  """
@@ -926,6 +1085,36 @@ class runopts:
926
1085
 
927
1086
  for cfg_key, runopt in self._opts.items():
928
1087
  val = resolved_cfg.get(cfg_key)
1088
+ resolved_name = None
1089
+ aliases = runopt.aliases or []
1090
+ deprecated_aliases = runopt.deprecated_aliases or []
1091
+ if val is None:
1092
+ for alias in aliases:
1093
+ val = resolved_cfg.get(alias)
1094
+ if alias in cfg or val is not None:
1095
+ resolved_name = alias
1096
+ break
1097
+ for alias in deprecated_aliases:
1098
+ val = resolved_cfg.get(alias)
1099
+ if val is not None:
1100
+ resolved_name = alias
1101
+ use_instead = self._alias_to_key.get(alias)
1102
+ warnings.warn(
1103
+ f"Run option `{alias}` is deprecated, use `{use_instead}` instead",
1104
+ UserWarning,
1105
+ stacklevel=2,
1106
+ )
1107
+ break
1108
+ else:
1109
+ resolved_name = cfg_key
1110
+ for alias in aliases:
1111
+ duplicate_val = resolved_cfg.get(alias)
1112
+ if alias in cfg or duplicate_val is not None:
1113
+ raise InvalidRunConfigException(
1114
+ f"Duplicate opt name. runopt: `{resolved_name}`, is an alias of runopt: `{alias}`",
1115
+ resolved_name,
1116
+ cfg,
1117
+ )
929
1118
 
930
1119
  # check required opt
931
1120
  if runopt.is_required and val is None:
@@ -945,7 +1134,7 @@ class runopts:
945
1134
  )
946
1135
 
947
1136
  # not required and not set, set to default
948
- if val is None:
1137
+ if val is None and resolved_name is None:
949
1138
  resolved_cfg[cfg_key] = runopt.default
950
1139
  return resolved_cfg
951
1140
 
@@ -1045,12 +1234,16 @@ class runopts:
1045
1234
  help: str,
1046
1235
  default: CfgVal = None,
1047
1236
  required: bool = False,
1237
+ aliases: Optional[list[str]] = None,
1238
+ deprecated_aliases: Optional[list[str]] = None,
1048
1239
  ) -> None:
1049
1240
  """
1050
1241
  Adds the ``config`` option with the given help string and ``default``
1051
1242
  value (if any). If the ``default`` is not specified then this option
1052
1243
  is a required option.
1053
1244
  """
1245
+ aliases = aliases or []
1246
+ deprecated_aliases = deprecated_aliases or []
1054
1247
  if required and default is not None:
1055
1248
  raise ValueError(
1056
1249
  f"Required option: {cfg_key} must not specify default value. Given: {default}"
@@ -1062,7 +1255,19 @@ class runopts:
1062
1255
  f" Given: {default} ({type(default).__name__})"
1063
1256
  )
1064
1257
 
1065
- self._opts[cfg_key] = runopt(default, type_, required, help)
1258
+ opt = runopt(
1259
+ default,
1260
+ type_,
1261
+ required,
1262
+ help,
1263
+ list(set(aliases)),
1264
+ list(set(deprecated_aliases)),
1265
+ )
1266
+ for alias in aliases:
1267
+ self._alias_to_key[alias] = cfg_key
1268
+ for deprecated_alias in deprecated_aliases:
1269
+ self._alias_to_key[deprecated_alias] = cfg_key
1270
+ self._opts[cfg_key] = opt
1066
1271
 
1067
1272
  def update(self, other: "runopts") -> None:
1068
1273
  self._opts.update(other._opts)
@@ -75,7 +75,7 @@ def get_fn_docstring(fn: Callable[..., object]) -> Tuple[str, Dict[str, str]]:
75
75
  if the description
76
76
  """
77
77
  default_fn_desc = f"""{fn.__name__} TIP: improve this help string by adding a docstring
78
- to your component (see: https://pytorch.org/torchx/latest/component_best_practices.html)"""
78
+ to your component (see: https://meta-pytorch.org/torchx/latest/component_best_practices.html)"""
79
79
  args_description = _get_default_arguments_descriptions(fn)
80
80
  func_description = inspect.getdoc(fn)
81
81
  if not func_description:
torchx/specs/finder.py CHANGED
@@ -452,7 +452,7 @@ def get_component(
452
452
  raise ComponentNotFoundException(
453
453
  f"Component `{name}` not found. Please make sure it is one of the "
454
454
  "builtins: `torchx builtins`. Or registered via `[torchx.components]` "
455
- "entry point (see: https://pytorch.org/torchx/latest/configure.html)"
455
+ "entry point (see: https://meta-pytorch.org/torchx/latest/configure.html)"
456
456
  )
457
457
 
458
458
  component = components[name]
@@ -16,7 +16,7 @@ the equivalent resource in mem, cpu and gpu numbers.
16
16
 
17
17
  .. note::
18
18
  These resource definitions may change in future. It is expected for each user to
19
- manage their own resources. Follow https://pytorch.org/torchx/latest/specs.html#torchx.specs.get_named_resources
19
+ manage their own resources. Follow https://meta-pytorch.org/torchx/latest/specs.html#torchx.specs.get_named_resources
20
20
  to set up named resources.
21
21
 
22
22
  Usage:
@@ -47,7 +47,7 @@ NEURON_DEVICE = "aws.amazon.com/neurondevice"
47
47
  MEM_TAX = 0.96
48
48
 
49
49
  # determines instance type for non-homogeneous CEs
50
- # see https://github.com/pytorch/torchx/issues/780
50
+ # see https://github.com/meta-pytorch/torchx/issues/780
51
51
  K8S_ITYPE = "node.kubernetes.io/instance-type"
52
52
  GiB: int = int(1024 * MEM_TAX)
53
53
 
@@ -120,6 +120,16 @@ def aws_p5_48xlarge() -> Resource:
120
120
  )
121
121
 
122
122
 
123
+ def aws_p5e_48xlarge() -> Resource:
124
+ return Resource(
125
+ cpu=192,
126
+ gpu=8,
127
+ memMB=2048 * GiB,
128
+ capabilities={K8S_ITYPE: "p5e.48xlarge"},
129
+ devices={EFA_DEVICE: 32},
130
+ )
131
+
132
+
123
133
  def aws_p5en_48xlarge() -> Resource:
124
134
  return Resource(
125
135
  cpu=192,
@@ -419,6 +429,7 @@ NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
419
429
  "aws_p4d.24xlarge": aws_p4d_24xlarge,
420
430
  "aws_p4de.24xlarge": aws_p4de_24xlarge,
421
431
  "aws_p5.48xlarge": aws_p5_48xlarge,
432
+ "aws_p5e.48xlarge": aws_p5e_48xlarge,
422
433
  "aws_p5en.48xlarge": aws_p5en_48xlarge,
423
434
  "aws_g4dn.xlarge": aws_g4dn_xlarge,
424
435
  "aws_g4dn.2xlarge": aws_g4dn_2xlarge,
@@ -32,7 +32,7 @@ implementation.
32
32
 
33
33
  Example usage
34
34
  -------------
35
- Sample `code <https://github.com/pytorch/torchx/blob/main/torchx/examples/apps/tracker/main.py>`__ using tracker API.
35
+ Sample `code <https://github.com/meta-pytorch/torchx/blob/main/torchx/examples/apps/tracker/main.py>`__ using tracker API.
36
36
 
37
37
 
38
38
  Tracker Setup
@@ -111,7 +111,7 @@ Use :py:meth:`~torchx.tracker.app_run_from_env`:
111
111
  Reference :py:class:`~torchx.tracker.api.TrackerBase` implementation
112
112
  --------------------------------------------------------------------
113
113
  :py:class:`~torchx.tracker.backend.fsspec.FsspecTracker` provides reference implementation of a tracker backend.
114
- GitHub example `directory <https://github.com/pytorch/torchx/blob/main/torchx/examples/apps/tracker/>`__ provides example on how to
114
+ GitHub example `directory <https://github.com/meta-pytorch/torchx/blob/main/torchx/examples/apps/tracker/>`__ provides example on how to
115
115
  configure and use it in user application.
116
116
 
117
117
 
torchx/tracker/api.py CHANGED
@@ -191,7 +191,7 @@ def build_trackers(
191
191
  factory = entrypoint_factories.get(factory_name) or load_module(factory_name)
192
192
  if not factory:
193
193
  logger.warning(
194
- f"No tracker factory `{factory_name}` found in entry_points or modules. See https://pytorch.org/torchx/main/tracker.html#module-torchx.tracker"
194
+ f"No tracker factory `{factory_name}` found in entry_points or modules. See https://meta-pytorch.org/torchx/main/tracker.html#module-torchx.tracker"
195
195
  )
196
196
  continue
197
197
  if config:
@@ -69,9 +69,7 @@ def _defer_load_ep(ep: EntryPoint) -> object:
69
69
  return run
70
70
 
71
71
 
72
- def load_group(
73
- group: str, default: Optional[Dict[str, Any]] = None, skip_defaults: bool = False
74
- ):
72
+ def load_group(group: str, default: Optional[Dict[str, Any]] = None):
75
73
  """
76
74
  Loads all the entry points specified by ``group`` and returns
77
75
  the entry points as a map of ``name (str) -> deferred_load_fn``.
@@ -90,7 +88,6 @@ def load_group(
90
88
  1. ``load_group("foo")["bar"]("baz")`` -> equivalent to calling ``this.is.a_fn("baz")``
91
89
  1. ``load_group("food")`` -> ``None``
92
90
  1. ``load_group("food", default={"hello": this.is.c_fn})["hello"]("world")`` -> equivalent to calling ``this.is.c_fn("world")``
93
- 1. ``load_group("food", default={"hello": this.is.c_fn}, skip_defaults=True)`` -> ``None``
94
91
 
95
92
 
96
93
  If the entrypoint is a module (versus a function as shown above), then calling the ``deferred_load_fn``
@@ -115,8 +112,6 @@ def load_group(
115
112
  entrypoints = metadata.entry_points().get(group, ())
116
113
 
117
114
  if len(entrypoints) == 0:
118
- if skip_defaults:
119
- return None
120
115
  return default
121
116
 
122
117
  eps = {}
torchx/version.py CHANGED
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  # Copyright (c) Meta Platforms, Inc. and affiliates.
3
2
  # All rights reserved.
4
3
  #
@@ -7,6 +6,7 @@
7
6
 
8
7
  # pyre-strict
9
8
 
9
+ from torchx._version import BASE_VERSION
10
10
  from torchx.util.entrypoints import load
11
11
 
12
12
  # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
18
18
  # 0.1.0bN # Beta release
19
19
  # 0.1.0rcN # Release Candidate
20
20
  # 0.1.0 # Final release
21
- __version__ = "0.8.0dev0"
21
+ __version__: str = BASE_VERSION
22
22
 
23
23
 
24
24
  # Use the github container registry images corresponding to the current package
@@ -22,4 +22,4 @@ Example workspace paths:
22
22
  * ``memory://foo-bar/`` an in-memory workspace for notebook/programmatic usage
23
23
  """
24
24
 
25
- from torchx.workspace.api import walk_workspace, Workspace, WorkspaceMixin # noqa: F401
25
+ from torchx.workspace.api import walk_workspace, WorkspaceMixin # noqa: F401