torchx-nightly 2025.9.28__py3-none-any.whl → 2025.11.17__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- torchx/_version.py +8 -0
- torchx/cli/cmd_run.py +10 -5
- torchx/cli/cmd_tracker.py +1 -1
- torchx/components/__init__.py +1 -1
- torchx/components/dist.py +9 -3
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +30 -22
- torchx/runner/config.py +2 -0
- torchx/schedulers/__init__.py +8 -9
- torchx/schedulers/api.py +9 -4
- torchx/schedulers/aws_batch_scheduler.py +44 -1
- torchx/schedulers/docker_scheduler.py +3 -0
- torchx/schedulers/kubernetes_scheduler.py +200 -17
- torchx/schedulers/slurm_scheduler.py +11 -2
- torchx/specs/__init__.py +30 -7
- torchx/specs/api.py +215 -10
- torchx/specs/file_linter.py +1 -1
- torchx/specs/finder.py +1 -1
- torchx/specs/named_resources_aws.py +13 -2
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/version.py +2 -2
- torchx/workspace/__init__.py +1 -1
- torchx/workspace/api.py +65 -110
- {torchx_nightly-2025.9.28.dist-info → torchx_nightly-2025.11.17.dist-info}/METADATA +34 -21
- {torchx_nightly-2025.9.28.dist-info → torchx_nightly-2025.11.17.dist-info}/RECORD +32 -31
- {torchx_nightly-2025.9.28.dist-info → torchx_nightly-2025.11.17.dist-info}/WHEEL +1 -1
- {torchx_nightly-2025.9.28.dist-info → torchx_nightly-2025.11.17.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.9.28.dist-info → torchx_nightly-2025.11.17.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.9.28.dist-info → torchx_nightly-2025.11.17.dist-info}/top_level.txt +0 -0
torchx/_version.py
ADDED
torchx/cli/cmd_run.py
CHANGED
@@ -26,7 +26,7 @@ from torchx.cli.cmd_log import get_logs
 from torchx.runner import config, get_runner, Runner
 from torchx.runner.config import load_sections
 from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
-from torchx.specs import CfgVal
+from torchx.specs import CfgVal, Workspace
 from torchx.specs.finder import (
     _Component,
     ComponentNotFoundException,
@@ -36,7 +36,6 @@ from torchx.specs.finder import (
 )
 from torchx.util.log_tee_helpers import tee_logs
 from torchx.util.types import none_throws
-from torchx.workspace import Workspace


 MISSING_COMPONENT_ERROR_MSG = (
@@ -344,7 +343,7 @@ class CmdRun(SubCommand):
             "Invalid scheduler configuration: %s\n"
             "To configure scheduler options, either:\n"
             " 1. Use the `-cfg` command-line argument, e.g., `-cfg key1=value1,key2=value2`\n"
-            " 2. Set up a `.torchxconfig` file. For more details, visit: https://pytorch.org/torchx/main/runner.config.html\n"
+            " 2. Set up a `.torchxconfig` file. For more details, visit: https://meta-pytorch.org/torchx/main/runner.config.html\n"
             "Run `torchx runopts %s` to check all available configuration options for the "
             "`%s` scheduler."
         )
@@ -379,12 +378,16 @@ class CmdRun(SubCommand):
         if not args.stdin:
             return None
         if self._stdin_data_json is None:
-            self._stdin_data_json = self.torchx_json_from_stdin()
+            self._stdin_data_json = self.torchx_json_from_stdin(args)
         return self._stdin_data_json

-    def torchx_json_from_stdin(self) -> Dict[str, Any]:
+    def torchx_json_from_stdin(
+        self, args: Optional[argparse.Namespace] = None
+    ) -> Dict[str, Any]:
         try:
             stdin_data_json = json.load(sys.stdin)
+            if args and args.dryrun:
+                stdin_data_json["dryrun"] = True
             if not isinstance(stdin_data_json, dict):
                 logger.error(
                     "Invalid JSON input for `torchx run` command. Expected a dictionary."
@@ -413,6 +416,8 @@ class CmdRun(SubCommand):
                 continue
             if action.dest == "help":  # Skip help
                 continue
+            if action.dest == "dryrun":  # Skip dryrun
+                continue

             current_value = getattr(args, action.dest, None)
             default_value = action.default
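A minimal sketch of the new stdin handling, using only names that appear in the hunks above (this is a standalone replica, not the torchx source): when `--dryrun` is set, the flag is mirrored into the JSON payload read from stdin, and the `dryrun` dest is skipped when the remaining CLI args are re-serialized.

    import argparse
    import json
    import sys
    from typing import Any, Dict, Optional

    def torchx_json_from_stdin(args: Optional[argparse.Namespace] = None) -> Dict[str, Any]:
        # e.g. echo '{"component": "utils.echo"}' | torchx run --stdin --dryrun
        data = json.load(sys.stdin)
        if not isinstance(data, dict):
            raise ValueError("expected a JSON dictionary on stdin")
        if args is not None and getattr(args, "dryrun", False):
            data["dryrun"] = True  # mirror the --dryrun CLI flag into the payload
        return data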
torchx/cli/cmd_tracker.py
CHANGED
@@ -45,7 +45,7 @@ class CmdTracker(SubCommand):
         else:
             raise RuntimeError(
                 "No trackers configured."
-                " See: https://pytorch.org/torchx/latest/runtime/tracking.html"
+                " See: https://meta-pytorch.org/torchx/latest/runtime/tracking.html"
             )

     def add_list_job_arguments(self, subparser: argparse.ArgumentParser) -> None:
torchx/components/__init__.py
CHANGED
@@ -181,7 +181,7 @@ To validate that you've defined your component correctly you can either:

 1. (easiest) Dryrun your component's ``--help`` with the cli: ``torchx run --dryrun ~/component.py:train --help``
 2. Use the component :ref:`linter<specs:Component Linter>`
-   (see `dist_test.py <https://github.com/pytorch/torchx/blob/main/torchx/components/test/dist_test.py>`_ as an example)
+   (see `dist_test.py <https://github.com/meta-pytorch/torchx/blob/main/torchx/components/test/dist_test.py>`_ as an example)


 Running as a Job
torchx/components/dist.py
CHANGED
@@ -92,6 +92,7 @@ def spmd(
     h: str = "gpu.small",
     j: str = "1x1",
     env: Optional[Dict[str, str]] = None,
+    metadata: Optional[Dict[str, str]] = None,
     max_retries: int = 0,
     mounts: Optional[List[str]] = None,
     debug: bool = False,
@@ -131,6 +132,7 @@ def spmd(
         h: the type of host to run on (e.g. aws_p4d.24xlarge). Must be one of the registered named resources
         j: {nnodes}x{nproc_per_node}. For GPU hosts omitting nproc_per_node will infer it from the GPU count on the host
         env: environment variables to be passed to the run (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
+        metadata: metadata to be passed to the scheduler (e.g. KEY1=v1,KEY2=v2,KEY3=v3)
         max_retries: the number of scheduler retries allowed
         mounts: (for docker based runs only) mounts to mount into the worker environment/container
             (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]).
@@ -150,6 +152,7 @@ def spmd(
         h=h,
         j=str(StructuredJArgument.parse_from(h, j)),
         env=env,
+        metadata=metadata,
         max_retries=max_retries,
         mounts=mounts,
         debug=debug,
@@ -168,6 +171,7 @@ def ddp(
     memMB: int = 1024,
     j: str = "1x2",
     env: Optional[Dict[str, str]] = None,
+    metadata: Optional[Dict[str, str]] = None,
     max_retries: int = 0,
     rdzv_port: int = 29500,
     rdzv_backend: str = "c10d",
@@ -186,7 +190,7 @@ def ddp(

     Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
     ``h`` takes precedence if specified for setting resource requirements.
-    See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
+    See `registering named resources <https://meta-pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

     Args:
         script_args: arguments to the main module
@@ -201,6 +205,7 @@ def ddp(
         h: a registered named resource (if specified takes precedence over cpu, gpu, memMB)
         j: [{min_nnodes}:]{nnodes}x{nproc_per_node}, for gpu hosts, nproc_per_node must not exceed num gpus
         env: environment varibles to be passed to the run (e.g. ENV1=v1,ENV2=v2,ENV3=v3)
+        metadata: metadata to be passed to the scheduler (e.g. KEY1=v1,KEY2=v2,KEY3=v3)
         max_retries: the number of scheduler retries allowed
         rdzv_port: the port on rank0's host to use for hosting the c10d store used for rendezvous.
             Only takes effect when running multi-node. When running single node, this parameter
@@ -237,8 +242,8 @@ def ddp(
     # use $$ in the prefix to escape the '$' literal (rather than a string Template substitution argument)
     rdzv_endpoint = _noquote(f"$${{{macros.rank0_env}:=localhost}}:{rdzv_port}")

-
-
+    env = env or {}
+    metadata = metadata or {}

     argname = StructuredNameArgument.parse_from(
         name=name,
@@ -299,6 +304,7 @@ def ddp(
                 mounts=specs.parse_mounts(mounts) if mounts else [],
             )
         ],
+        metadata=metadata,
     )
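A hedged usage sketch for the new `metadata` parameter, using only names shown in the hunks above (the full `ddp` signature has more parameters than the diff displays):

    from torchx.components.dist import ddp

    # metadata is stored on the resulting AppDef and forwarded to the scheduler
    appdef = ddp(
        script="train.py",
        j="2x4",  # 2 nodes x 4 procs per node
        env={"LOGLEVEL": "INFO"},
        metadata={"team": "ml-infra", "experiment": "baseline"},
    )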
torchx/components/utils.py
CHANGED
@@ -154,7 +154,7 @@ def python(

     Note: (cpu, gpu, memMB) parameters are mutually exclusive with ``h`` (named resource) where
     ``h`` takes precedence if specified for setting resource requirements.
-    See `registering named resources <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
+    See `registering named resources <https://meta-pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.

     Args:
         args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
torchx/distributed/__init__.py
CHANGED
@@ -48,7 +48,7 @@ def local_rank() -> int:
         " but the `LOCAL_RANK` environment variable is not set. Will trivially return 0 for local_rank.\n"
         " It is recommended to use torchrun/torchx to run your script or set the `LOCAL_RANK` manually.\n"
         " For additional details see:\n"
-        " 1) https://pytorch.org/torchx/latest/components/distributed.html\n"
+        " 1) https://meta-pytorch.org/torchx/latest/components/distributed.html\n"
         " 2) https://pytorch.org/docs/stable/elastic/run.html\n"
         "=============================================================================================="
     )
torchx/runner/api.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -43,6 +42,7 @@ from torchx.specs import (
     parse_app_handle,
     runopts,
     UnknownAppException,
+    Workspace,
 )
 from torchx.specs.finder import get_component
 from torchx.tracker.api import (
@@ -54,7 +54,7 @@ from torchx.tracker.api import (
 from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID

 from torchx.util.types import none_throws
-from torchx.workspace […]
+from torchx.workspace import WorkspaceMixin

 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -420,36 +420,44 @@ class Runner:
             scheduler,
             runcfg=json.dumps(cfg) if cfg else None,
             workspace=str(workspace),
-        ):
+        ) as ctx:
             sched = self._scheduler(scheduler)
             resolved_cfg = sched.run_opts().resolve(cfg)

             sched._pre_build_validate(app, scheduler, resolved_cfg)

-            if […]
-            [… 16 removed lines; content lost in extraction …]
-                        f"Reusing original image `{old_img}` for role[0]={role.name}."
-                        " Either a patch was built or no changes to workspace was detected."
+            if isinstance(sched, WorkspaceMixin):
+                if workspace:
+                    # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+                    # later, torchx added support for the workspace attr in Role
+                    # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+                    if app.roles[0].workspace:
+                        logger.info(
+                            "Overriding role[%d] (%s) workspace to `%s`"
+                            "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+                            0,
+                            role.name,
+                            str(app.roles[0].workspace),
+                        )
+                    app.roles[0].workspace = (
+                        Workspace.from_str(workspace)
+                        if isinstance(workspace, str)
+                        else workspace
                     )

+                sched.build_workspaces(app.roles, resolved_cfg)
+
             sched._validate(app, scheduler, resolved_cfg)
             dryrun_info = sched.submit_dryrun(app, resolved_cfg)
             dryrun_info._scheduler = scheduler
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app.name
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return dryrun_info

     def scheduler_run_opts(self, scheduler: str) -> runopts:
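The net effect of the dryrun changes, as a sketch (assuming `Runner.dryrun` keeps its public signature; `Workspace.from_str` appears in the hunks above): an explicit `workspace` argument takes precedence over `app.roles[0].workspace`, and a string workspace is normalized via `Workspace.from_str`.

    from torchx.specs import Workspace

    ws = Workspace.from_str("/path/to/project")  # str form is normalized to a Workspace
    # runner.dryrun(app, "kubernetes", cfg=cfg, workspace=ws)    # overrides roles[0].workspace
    # runner.dryrun(app, "kubernetes", cfg=cfg, workspace=None)  # keeps roles[0].workspace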
torchx/runner/config.py
CHANGED
@@ -494,6 +494,8 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:

     config = os.getenv(ENV_TORCHXCONFIG)
     if config is not None:
+        if not config:
+            return []
         configfile = Path(config)
         if not configfile.is_file():
             raise FileNotFoundError(
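Behavior sketch, assuming `ENV_TORCHXCONFIG` names the `TORCHXCONFIG` environment variable (an assumption; the constant's value is not shown in the diff): setting it to an empty string now short-circuits config discovery instead of failing on the empty path.

    import os

    os.environ["TORCHXCONFIG"] = ""  # assumption: ENV_TORCHXCONFIG == "TORCHXCONFIG"
    # find_configs() now returns [] -- previously Path("") failed the is_file()
    # check and raised FileNotFoundError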
torchx/schedulers/__init__.py
CHANGED
@@ -49,15 +49,14 @@ def get_scheduler_factories(
     The first scheduler in the dictionary is used as the default scheduler.
     """

-    [… 8 removed lines; content lost in extraction …]
-    )
+    if skip_defaults:
+        default_schedulers = {}
+    else:
+        default_schedulers: dict[str, SchedulerFactory] = {}
+        for scheduler, path in DEFAULT_SCHEDULER_MODULES.items():
+            default_schedulers[scheduler] = _defer_load_scheduler(path)
+
+    return load_group(group, default=default_schedulers)


 def get_default_scheduler_name() -> str:
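A short usage sketch of the rewritten factory loading (assuming `get_scheduler_factories` keeps its `group` and `skip_defaults` parameters, both of which are used in the hunk above): with `skip_defaults=True` only entry-point registered schedulers are returned; otherwise each built-in scheduler module is wrapped in a deferred loader so nothing is imported until first use.

    from torchx.schedulers import get_scheduler_factories

    all_factories = get_scheduler_factories()                  # built-ins + entry points
    custom_only = get_scheduler_factories(skip_defaults=True)  # entry points only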
torchx/schedulers/api.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -22,8 +21,9 @@ from torchx.specs import (
     Role,
     RoleStatus,
     runopts,
+    Workspace,
 )
-from torchx.workspace […]
+from torchx.workspace import WorkspaceMixin


 DAYS_IN_2_WEEKS = 14
@@ -131,7 +131,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         self,
         app: A,
         cfg: T,
-        workspace: […]
+        workspace: str | Workspace | None = None,
     ) -> str:
         """
         Submits the application to be run by the scheduler.
@@ -145,7 +145,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         resolved_cfg = self.run_opts().resolve(cfg)
         if workspace:
             assert isinstance(self, WorkspaceMixin)
-            [… removed line; content lost in extraction …]
+
+            if isinstance(workspace, str):
+                workspace = Workspace.from_str(workspace)
+
+            app.roles[0].workspace = workspace
+            self.build_workspaces(app.roles, resolved_cfg)

         # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self.submit_dryrun(app, resolved_cfg)
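A sketch of the normalization `submit()` now performs (names taken from the hunk above; the helper name here is hypothetical):

    from typing import Optional, Union

    from torchx.specs import Workspace

    def normalize(workspace: Union[str, Workspace, None]) -> Optional[Workspace]:
        # mirrors the new submit() logic: strings are parsed, Workspace passes through
        if isinstance(workspace, str):
            return Workspace.from_str(workspace)
        return workspace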
torchx/schedulers/aws_batch_scheduler.py
CHANGED
@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"

 ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"

+ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
+
 DEFAULT_ROLE_NAME = "node"

 TAG_TORCHX_VER = "torchx.pytorch.org/version"
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
 TAG_TORCHX_USER = "torchx.pytorch.org/user"


+def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
+    """
+    Parse ulimit string in format: name:softLimit:hardLimit
+    Multiple ulimits separated by commas.
+    """
+    if not ulimits_list:
+        return []
+
+    ulimits = []
+    for ulimit_str in ulimits_list:
+        if not ulimit_str.strip():
+            continue
+
+        parts = ulimit_str.strip().split(":")
+        if len(parts) != 3:
+            raise ValueError(
+                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
+            )
+
+        name, soft_limit, hard_limit = parts
+        ulimits.append(
+            {
+                "name": name,
+                "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
+                "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
+            }
+        )
+
+    return ulimits
+
+
 if TYPE_CHECKING:
     from docker import DockerClient

@@ -177,7 +210,8 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
-) -> Dict[str, Any]:
+    ulimits: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
     role.mounts += get_device_mounts(role.resource.devices)

     mount_points = []
@@ -239,6 +273,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
@@ -361,6 +396,7 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[list[str]]


 class AWSBatchScheduler(
@@ -506,6 +542,7 @@ class AWSBatchScheduler(
             role = values.apply(role)
             role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
             role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
+            role.env[ENV_TORCHX_IMAGE] = role.image

             nodes.append(
                 _role_to_node_properties(
@@ -514,6 +551,7 @@ class AWSBatchScheduler(
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or []),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +637,11 @@ class AWSBatchScheduler(
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=List[str],
+            help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
+        )
         return opts

     def _get_job_id(self, app_id: str) -> Optional[str]:
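Usage sketch for the new `ulimits` run option, with the format taken from the hunks above (the `-cfg` spelling on the CLI is an assumption based on torchx's standard scheduler-config flag):

    from torchx.schedulers.aws_batch_scheduler import parse_ulimits

    parse_ulimits(["nofile:65536:65536", "core:-1:-1"])
    # -> [{"name": "nofile", "softLimit": 65536, "hardLimit": 65536},
    #     {"name": "core",   "softLimit": -1,    "hardLimit": -1}]

    # on the CLI (assumption):
    #   torchx run -s aws_batch -cfg queue=my-queue,ulimits=nofile:65536:65536 ...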
torchx/schedulers/docker_scheduler.py
CHANGED
@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
 LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
 LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"

+ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
+
 NETWORK = "torchx"


@@ -279,6 +281,7 @@ class DockerScheduler(

         # configure distributed host envs
         env["TORCHX_RANK0_HOST"] = rank0_name
+        env[ENV_TORCHX_IMAGE] = replica_role.image

         c = DockerContainer(
             image=replica_role.image,
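Both schedulers above now export the role's container image to the job through `TORCHX_IMAGE`; application code can introspect it at runtime, as in this sketch:

    import os

    image = os.environ.get("TORCHX_IMAGE")  # set by the AWS Batch and Docker schedulers above
    if image:
        print(f"running inside image: {image}")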