torchx-nightly 2025.7.9__py3-none-any.whl → 2025.11.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +92 -81
- torchx/runner/config.py +11 -9
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +20 -15
- torchx/schedulers/aws_batch_scheduler.py +45 -2
- torchx/schedulers/docker_scheduler.py +3 -0
- torchx/schedulers/kubernetes_scheduler.py +200 -17
- torchx/schedulers/local_scheduler.py +1 -0
- torchx/schedulers/slurm_scheduler.py +160 -26
- torchx/specs/__init__.py +23 -6
- torchx/specs/api.py +279 -33
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/METADATA +34 -48
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/RECORD +39 -51
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/top_level.txt +0 -0
torchx/runner/api.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -25,6 +24,7 @@ from typing import (
     Type,
     TYPE_CHECKING,
     TypeVar,
+    Union,
 )

 from torchx.runner.events import log_event
@@ -42,6 +42,7 @@ from torchx.specs import (
     parse_app_handle,
     runopts,
     UnknownAppException,
+    Workspace,
 )
 from torchx.specs.finder import get_component
 from torchx.tracker.api import (
@@ -53,7 +54,7 @@ from torchx.tracker.api import (
 from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID

 from torchx.util.types import none_throws
-from torchx.workspace
+from torchx.workspace import WorkspaceMixin

 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -129,9 +130,9 @@ class Runner:
     def _get_scheduler_params_from_env(self) -> Dict[str, str]:
         scheduler_params = {}
         for key, value in os.environ.items():
-
-            if
-            scheduler_params[
+            key = key.lower()
+            if key.startswith("torchx_"):
+                scheduler_params[key.removeprefix("torchx_")] = value
         return scheduler_params

     def __enter__(self) -> "Self":
@@ -164,25 +165,13 @@ class Runner:
         for scheduler in self._scheduler_instances.values():
             scheduler.close()

-    def build_standalone_workspace(
-        self,
-        workspace_builder: WorkspaceBuilder[S, T],
-        sync: bool = True,
-    ) -> PkgInfo[S]:
-        """
-        Build a standalone workspace for the given role.
-        This method is used to build a workspace for a role independent of the scheduler and
-        also enables asynchronous workspace building using the Role overrides.
-        """
-        return workspace_builder.build_workspace(sync)
-
     def run_component(
         self,
         component: str,
-        component_args:
+        component_args: Union[list[str], dict[str, Any]],
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppHandle:
         """
@@ -217,7 +206,7 @@ class Runner:
            ComponentNotFoundException: if the ``component_path`` is failed to resolve.
         """

-        with log_event("run_component"
+        with log_event("run_component") as ctx:
             dryrun_info = self.dryrun_component(
                 component,
                 component_args,
@@ -228,7 +217,8 @@ class Runner:
             )
             handle = self.schedule(dryrun_info)
             app = none_throws(dryrun_info._app)
-
+
+            ctx._torchx_event.workspace = str(workspace)
             ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
             ctx._torchx_event.app_image = app.roles[0].image
             ctx._torchx_event.app_id = parse_app_handle(handle)[2]
@@ -238,10 +228,10 @@ class Runner:
     def dryrun_component(
         self,
         component: str,
-        component_args:
+        component_args: Union[list[str], dict[str, Any]],
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppDryRunInfo:
         """
@@ -249,10 +239,13 @@ class Runner:
        component, but just returns what "would" have run.
         """
         component_def = get_component(component)
+        args_from_cli = component_args if isinstance(component_args, list) else []
+        args_from_json = component_args if isinstance(component_args, dict) else {}
         app = materialize_appdef(
             component_def.fn,
-
+            args_from_cli,
             self._component_defaults.get(component, None),
+            args_from_json,
         )
         return self.dryrun(
             app,
@@ -267,7 +260,7 @@ class Runner:
         app: AppDef,
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppHandle:
         """
@@ -280,9 +273,7 @@ class Runner:
            An application handle that is used to call other action APIs on the app.
         """

-        with log_event(
-            api="run", runcfg=json.dumps(cfg) if cfg else None, workspace=workspace
-        ) as ctx:
+        with log_event(api="run") as ctx:
             dryrun_info = self.dryrun(
                 app,
                 scheduler,
@@ -291,10 +282,15 @@ class Runner:
                 parent_run_id=parent_run_id,
             )
             handle = self.schedule(dryrun_info)
-
-
-
-
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.workspace = str(workspace)
+            event.app_id = parse_app_handle(handle)[2]
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return handle

     def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:
@@ -328,21 +324,22 @@ class Runner:

         """
         scheduler = none_throws(dryrun_info._scheduler)
-        app_image = none_throws(dryrun_info._app).roles[0].image
         cfg = dryrun_info._cfg
-        with log_event(
-            "schedule",
-            scheduler,
-            app_image=app_image,
-            runcfg=json.dumps(cfg) if cfg else None,
-        ) as ctx:
+        with log_event("schedule") as ctx:
             sched = self._scheduler(scheduler)
             app_id = sched.schedule(dryrun_info)
             app_handle = make_app_handle(scheduler, self._name, app_id)
+
             app = none_throws(dryrun_info._app)
             self._apps[app_handle] = app
-
-            ctx._torchx_event
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app_id
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return app_handle

     def name(self) -> str:
@@ -353,7 +350,7 @@ class Runner:
         app: AppDef,
         scheduler: str,
         cfg: Optional[Mapping[str, CfgVal]] = None,
-        workspace: Optional[str] = None,
+        workspace: Optional[Union[Workspace, str]] = None,
         parent_run_id: Optional[str] = None,
     ) -> AppDryRunInfo:
         """
@@ -422,52 +419,45 @@ class Runner:
             "dryrun",
             scheduler,
             runcfg=json.dumps(cfg) if cfg else None,
-            workspace=workspace,
-        ):
+            workspace=str(workspace),
+        ) as ctx:
             sched = self._scheduler(scheduler)
             resolved_cfg = sched.run_opts().resolve(cfg)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
-            ctx._torchx_event.app_image = role.image
-            ctx._torchx_event.workspace = workspace
-
-            if old_img != role.image:
-                logger.info(
-                    f"Built new image `{role.image}` based on original image `{old_img}`"
-                    f" and changes in workspace `{workspace}` for role[0]={role.name}."
-                )
-            else:
-                logger.info(
-                    f"Reusing original image `{old_img}` for role[0]={role.name}."
-                    " Either a patch was built or no changes to workspace was detected."
+            sched._pre_build_validate(app, scheduler, resolved_cfg)
+
+            if isinstance(sched, WorkspaceMixin):
+                if workspace:
+                    # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+                    # later, torchx added support for the workspace attr in Role
+                    # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+                    if app.roles[0].workspace:
+                        logger.info(
+                            "Overriding role[%d] (%s) workspace to `%s`"
+                            "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+                            0,
+                            role.name,
+                            str(app.roles[0].workspace),
+                        )
+                    app.roles[0].workspace = (
+                        Workspace.from_str(workspace)
+                        if isinstance(workspace, str)
+                        else workspace
                    )

-
-
-
-        ):
-            sched._validate(app, scheduler, resolved_cfg)
+                sched.build_workspaces(app.roles, resolved_cfg)
+
+            sched._validate(app, scheduler, resolved_cfg)
             dryrun_info = sched.submit_dryrun(app, resolved_cfg)
             dryrun_info._scheduler = scheduler
+
+            event = ctx._torchx_event
+            event.scheduler = scheduler
+            event.runcfg = json.dumps(cfg) if cfg else None
+            event.app_id = app.name
+            event.app_image = none_throws(dryrun_info._app).roles[0].image
+            event.app_metadata = app.metadata
+
             return dryrun_info

     def scheduler_run_opts(self, scheduler: str) -> runopts:
@@ -486,6 +476,27 @@ class Runner:
         """
         return self._scheduler(scheduler).run_opts()

+    def cfg_from_str(self, scheduler: str, *cfg_literal: str) -> Mapping[str, CfgVal]:
+        """
+        Convenience function around the scheduler's ``runopts.cfg_from_str()`` method.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx.runner import get_runner
+
+            runner = get_runner()
+            cfg = runner.cfg_from_str("local_cwd", "log_dir=/tmp/foobar", "prepend_cwd=True")
+            assert cfg == {"log_dir": "/tmp/foobar", "prepend_cwd": True, "auto_set_cuda_visible_devices": False}
+        """
+
+        opts = self._scheduler(scheduler).run_opts()
+        cfg = {}
+        for cfg_str in cfg_literal:
+            cfg.update(opts.cfg_from_str(cfg_str))
+        return cfg
+
     def scheduler_backends(self) -> List[str]:
         """
        Returns a list of all supported scheduler backends.
torchx/runner/config.py
CHANGED
@@ -73,7 +73,7 @@ CLI Usage

 #. In addition, it is possible to specify a different config other than .torchxconfig to
    load at runtime. Requirements are that the config path is specified by enviornment
-   variable
+   variable TORCHXCONFIG. It also disables hierarchy loading configs from multiple
    directories as the cases otherwise.

 #. User level .torchxconfig
@@ -278,14 +278,14 @@ def dump(
             continue

         # serialize list elements with `;` delimiter (consistent with torchx cli)
-        if opt.
+        if opt.is_type_list_of_str:
             # deal with empty or None default lists
             if opt.default:
                 # pyre-ignore[6] opt.default type checked already as List[str]
                 val = ";".join(opt.default)
             else:
                 val = _NONE
-        elif opt.
+        elif opt.is_type_dict_of_str:
             # deal with empty or None default lists
             if opt.default:
                 # pyre-ignore[16] opt.default type checked already as Dict[str, str]
@@ -494,6 +494,8 @@ def find_configs(dirs: Optional[Iterable[str]] = None) -> List[str]:

     config = os.getenv(ENV_TORCHXCONFIG)
     if config is not None:
+        if not config:
+            return []
         configfile = Path(config)
         if not configfile.is_file():
             raise FileNotFoundError(
@@ -536,26 +538,26 @@ def load(scheduler: str, f: TextIO, cfg: Dict[str, CfgVal]) -> None:
                     # this also handles empty or None lists
                     cfg[name] = None
                 else:
-
+                    opt = runopts.get(name)

-                    if
+                    if opt is None:
                         log.warning(
                             f"`{name} = {value}` was declared in the [{section}] section "
                             f" of the config file but is not a runopt of `{scheduler}` scheduler."
                             f" Remove the entry from the config file to no longer see this warning"
                         )
                     else:
-                        if
+                        if opt.opt_type is bool:
                             # need to handle bool specially since str -> bool is based on
                             # str emptiness not value (e.g. bool("False") == True)
                             cfg[name] = config.getboolean(section, name)
-                        elif
+                        elif opt.is_type_list_of_str:
                             cfg[name] = value.split(";")
-                        elif
+                        elif opt.is_type_dict_of_str:
                             cfg[name] = {
                                 s.split(":", 1)[0]: s.split(":", 1)[1]
                                 for s in value.replace(",", ";").split(";")
                             }
                         else:
                             # pyre-ignore[29]
-                            cfg[name] =
+                            cfg[name] = opt.opt_type(value)
torchx/runner/events/__init__.py
CHANGED
@@ -33,8 +33,9 @@ from torchx.util.session import get_session_id_or_create_new

 from .api import SourceType, TorchxEvent  # noqa F401

-
-
+_events_logger: Optional[logging.Logger] = None
+
+log: logging.Logger = logging.getLogger(__name__)


 def _get_or_create_logger(destination: str = "null") -> logging.Logger:
@@ -51,19 +52,28 @@ def _get_or_create_logger(destination: str = "null") -> logging.Logger:
        a new logger if None provided.
     """
     global _events_logger
+
     if _events_logger:
         return _events_logger
-
-
-
-
-
-
+    else:
+        logging_handler = get_logging_handler(destination)
+        logging_handler.setLevel(logging.DEBUG)
+        _events_logger = logging.getLogger(f"torchx-events-{destination}")
+        # Do not propagate message to the root logger
+        _events_logger.propagate = False
+        _events_logger.addHandler(logging_handler)
+
+    assert _events_logger  # make type-checker happy
+    return _events_logger


 def record(event: TorchxEvent, destination: str = "null") -> None:
-
+    try:
+        serialized_event = event.serialize()
+    except Exception:
+        log.exception("failed to serialize event, will not record event")
+    else:
+        _get_or_create_logger(destination).info(serialized_event)


 class log_event:
torchx/runner/events/api.py
CHANGED
@@ -29,7 +29,7 @@ class TorchxEvent:
        scheduler: Scheduler that is used to execute request
        api: Api name
        app_id: Unique id that is set by the underlying scheduler
-
+       app_image: Image/container bundle that is used to execute request.
        app_metadata: metadata to the app (treatment of metadata is scheduler dependent)
        runcfg: Run config that was used to schedule app.
        source: Type of source the event is generated.
torchx/schedulers/__init__.py
CHANGED
@@ -21,8 +21,6 @@ DEFAULT_SCHEDULER_MODULES: Mapping[str, str] = {
     "kubernetes_mcad": "torchx.schedulers.kubernetes_mcad_scheduler",
     "aws_batch": "torchx.schedulers.aws_batch_scheduler",
     "aws_sagemaker": "torchx.schedulers.aws_sagemaker_scheduler",
-    "gcp_batch": "torchx.schedulers.gcp_batch_scheduler",
-    "ray": "torchx.schedulers.ray_scheduler",
     "lsf": "torchx.schedulers.lsf_scheduler",
 }

@@ -51,15 +49,14 @@ def get_scheduler_factories(
    The first scheduler in the dictionary is used as the default scheduler.
    """

-
-
-
+    if skip_defaults:
+        default_schedulers = {}
+    else:
+        default_schedulers: dict[str, SchedulerFactory] = {}
+        for scheduler, path in DEFAULT_SCHEDULER_MODULES.items():
+            default_schedulers[scheduler] = _defer_load_scheduler(path)

-    return load_group(
-        group,
-        default=default_schedulers,
-        skip_defaults=skip_defaults,
-    )
+    return load_group(group, default=default_schedulers)


 def get_default_scheduler_name() -> str:
torchx/schedulers/api.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -12,7 +11,7 @@ import re
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Generic, Iterable, List, Optional, TypeVar
+from typing import Generic, Iterable, List, Optional, TypeVar, Union

 from torchx.specs import (
     AppDef,
@@ -22,8 +21,9 @@ from torchx.specs import (
     Role,
     RoleStatus,
     runopts,
+    Workspace,
 )
-from torchx.workspace
+from torchx.workspace import WorkspaceMixin


 DAYS_IN_2_WEEKS = 14
@@ -131,7 +131,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         self,
         app: A,
         cfg: T,
-        workspace:
+        workspace: str | Workspace | None = None,
     ) -> str:
         """
        Submits the application to be run by the scheduler.
@@ -144,10 +144,14 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         # pyre-fixme: Generic cfg type passed to resolve
         resolved_cfg = self.run_opts().resolve(cfg)
         if workspace:
-
-
-
-
+            assert isinstance(self, WorkspaceMixin)
+
+            if isinstance(workspace, str):
+                workspace = Workspace.from_str(workspace)
+
+            app.roles[0].workspace = workspace
+            self.build_workspaces(app.roles, resolved_cfg)
+
         # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self.submit_dryrun(app, resolved_cfg)
         return self.schedule(dryrun_info)
@@ -356,13 +360,14 @@ class Scheduler(abc.ABC, Generic[T, A, D]):

        Raises error if application is not compatible with scheduler
        """
-        if isinstance(app, AppDef):
-
-
-
-
-
-
+        if not isinstance(app, AppDef):
+            return
+
+        for role in app.roles:
+            if role.resource == NULL_RESOURCE:
+                raise ValueError(
+                    f"No resource for role: {role.image}. Did you forget to attach resource to the role"
+                )


 def filter_regex(regex: str, data: Iterable[str]) -> Iterable[str]:
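
A short sketch (not part of the diff) of the normalization Scheduler.run() now performs before building workspaces: a string workspace is converted with Workspace.from_str() and attached to role[0] before build_workspaces() runs. The path below is a hypothetical example value.

    from torchx.specs import Workspace

    workspace = "file:///path/to/project"  # value as passed by a caller
    if isinstance(workspace, str):
        workspace = Workspace.from_str(workspace)
    # schedulers that mix in WorkspaceMixin then set app.roles[0].workspace = workspace
    # and call self.build_workspaces(app.roles, resolved_cfg)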
torchx/schedulers/aws_batch_scheduler.py
CHANGED
@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"

 ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"

+ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
+
 DEFAULT_ROLE_NAME = "node"

 TAG_TORCHX_VER = "torchx.pytorch.org/version"
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
 TAG_TORCHX_USER = "torchx.pytorch.org/user"


+def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
+    """
+    Parse ulimit string in format: name:softLimit:hardLimit
+    Multiple ulimits separated by commas.
+    """
+    if not ulimits_list:
+        return []
+
+    ulimits = []
+    for ulimit_str in ulimits_list:
+        if not ulimit_str.strip():
+            continue
+
+        parts = ulimit_str.strip().split(":")
+        if len(parts) != 3:
+            raise ValueError(
+                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
+            )
+
+        name, soft_limit, hard_limit = parts
+        ulimits.append(
+            {
+                "name": name,
+                "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
+                "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
+            }
+        )
+
+    return ulimits
+
+
 if TYPE_CHECKING:
     from docker import DockerClient

@@ -177,7 +210,8 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
-
+    ulimits: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
     role.mounts += get_device_mounts(role.resource.devices)

     mount_points = []
@@ -239,6 +273,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
@@ -255,7 +290,7 @@ def _role_to_node_properties(
         container["jobRoleArn"] = job_role_arn
     if execution_role_arn:
         container["executionRoleArn"] = execution_role_arn
-    if role.num_replicas >
+    if role.num_replicas > 0:
         instance_type = instance_type_from_resource(role.resource)
         if instance_type is not None:
             container["instanceType"] = instance_type
@@ -361,6 +396,7 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[list[str]]


 class AWSBatchScheduler(
@@ -506,6 +542,7 @@ class AWSBatchScheduler(
             role = values.apply(role)
             role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
             role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
+            role.env[ENV_TORCHX_IMAGE] = role.image

             nodes.append(
                 _role_to_node_properties(
@@ -514,6 +551,7 @@ class AWSBatchScheduler(
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or []),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +637,11 @@ class AWSBatchScheduler(
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=List[str],
+            help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
+        )
         return opts

     def _get_job_id(self, app_id: str) -> Optional[str]:
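
A hedged example (not part of the diff) of the new ulimits runopt for the AWS Batch scheduler and what parse_ulimits() produces for the container properties; the queue name is hypothetical:

    from torchx.schedulers.aws_batch_scheduler import parse_ulimits

    cfg = {"queue": "my-queue", "ulimits": ["nofile:65535:65535", "memlock:-1:-1"]}
    print(parse_ulimits(cfg.get("ulimits") or []))
    # [{'name': 'nofile', 'softLimit': 65535, 'hardLimit': 65535},
    #  {'name': 'memlock', 'softLimit': -1, 'hardLimit': -1}]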
torchx/schedulers/docker_scheduler.py
CHANGED
@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
 LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
 LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"

+ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
+
 NETWORK = "torchx"


@@ -279,6 +281,7 @@ class DockerScheduler(

             # configure distributed host envs
             env["TORCHX_RANK0_HOST"] = rank0_name
+            env[ENV_TORCHX_IMAGE] = replica_role.image

             c = DockerContainer(
                 image=replica_role.image,