PyPI - torchx-nightly - Versions diffs - 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl - Mend

torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
torchx/cli/cmd_delete.py +30 -0
torchx/cli/cmd_list.py +1 -2
torchx/cli/cmd_run.py +202 -28
torchx/cli/cmd_tracker.py +1 -1
torchx/cli/main.py +2 -0
torchx/components/__init__.py +1 -8
torchx/components/dist.py +9 -3
torchx/components/integration_tests/component_provider.py +2 -2
torchx/components/utils.py +1 -1
torchx/distributed/__init__.py +1 -1
torchx/runner/api.py +102 -81
torchx/runner/config.py +3 -1
torchx/runner/events/__init__.py +20 -10
torchx/runner/events/api.py +1 -1
torchx/schedulers/__init__.py +7 -10
torchx/schedulers/api.py +66 -25
torchx/schedulers/aws_batch_scheduler.py +47 -6
torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
torchx/schedulers/docker_scheduler.py +4 -3
torchx/schedulers/ids.py +27 -23
torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
torchx/schedulers/kubernetes_scheduler.py +355 -36
torchx/schedulers/local_scheduler.py +2 -1
torchx/schedulers/lsf_scheduler.py +1 -1
torchx/schedulers/slurm_scheduler.py +102 -27
torchx/specs/__init__.py +40 -9
torchx/specs/api.py +222 -12
torchx/specs/builders.py +109 -28
torchx/specs/file_linter.py +117 -53
torchx/specs/finder.py +25 -37
torchx/specs/named_resources_aws.py +13 -2
torchx/specs/overlays.py +106 -0
torchx/tracker/__init__.py +2 -2
torchx/tracker/api.py +1 -1
torchx/util/entrypoints.py +1 -6
torchx/util/strings.py +1 -1
torchx/util/types.py +12 -1
torchx/version.py +2 -2
torchx/workspace/api.py +102 -5
{torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
{torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
{torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
torchx/examples/pipelines/__init__.py +0 -0
torchx/examples/pipelines/kfp/__init__.py +0 -0
torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
torchx/pipelines/kfp/__init__.py +0 -30
torchx/pipelines/kfp/adapter.py +0 -274
torchx/pipelines/kfp/version.py +0 -19
torchx/schedulers/gcp_batch_scheduler.py +0 -497
torchx/schedulers/ray/ray_common.py +0 -22
torchx/schedulers/ray/ray_driver.py +0 -307
torchx/schedulers/ray_scheduler.py +0 -454
{torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
{torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
{torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0

torchx/schedulers/aws_batch_scheduler.py CHANGED Viewed

@@ -92,6 +92,8 @@ ENV_TORCHX_ROLE_IDX = "TORCHX_ROLE_IDX"
 ENV_TORCHX_ROLE_NAME = "TORCHX_ROLE_NAME"
+ENV_TORCHX_IMAGE = "TORCHX_IMAGE"
 DEFAULT_ROLE_NAME = "node"
 TAG_TORCHX_VER = "torchx.pytorch.org/version"
@@ -99,6 +101,37 @@ TAG_TORCHX_APPNAME = "torchx.pytorch.org/app-name"
 TAG_TORCHX_USER = "torchx.pytorch.org/user"
+def parse_ulimits(ulimits_list: list[str]) -> List[Dict[str, Any]]:
+    """
+    Parse a list of ulimit strings, each in the format name:softLimit:hardLimit.
+    Each list entry describes one ulimit; blank entries are skipped.
+    """
+    if not ulimits_list:
+        return []
+    ulimits = []
+    for ulimit_str in ulimits_list:
+        if not ulimit_str.strip():
+            continue
+        parts = ulimit_str.strip().split(":")
+        if len(parts) != 3:
+            raise ValueError(
+                f"ulimit must be in format name:softLimit:hardLimit, got: {ulimit_str}"
+            )
+        name, soft_limit, hard_limit = parts
+        ulimits.append(
+            {
+                "name": name,
+                "softLimit": int(soft_limit) if soft_limit != "-1" else -1,
+                "hardLimit": int(hard_limit) if hard_limit != "-1" else -1,
+            }
+        )
+    return ulimits
 if TYPE_CHECKING:
     from docker import DockerClient
@@ -177,7 +210,8 @@ def _role_to_node_properties(
     privileged: bool = False,
     job_role_arn: Optional[str] = None,
     execution_role_arn: Optional[str] = None,
-) -> Dict[str, object]:
+    ulimits: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
     role.mounts += get_device_mounts(role.resource.devices)
     mount_points = []
@@ -239,6 +273,7 @@ def _role_to_node_properties(
         "environment": [{"name": k, "value": v} for k, v in role.env.items()],
         "privileged": privileged,
         "resourceRequirements": resource_requirements_from_resource(role.resource),
+        **({"ulimits": ulimits} if ulimits else {}),
         "linuxParameters": {
             # To support PyTorch dataloaders we need to set /dev/shm to larger
             # than the 64M default.
@@ -255,7 +290,7 @@ def _role_to_node_properties(
         container["jobRoleArn"] = job_role_arn
     if execution_role_arn:
         container["executionRoleArn"] = execution_role_arn
-    if role.num_replicas > 1:
+    if role.num_replicas > 0:
         instance_type = instance_type_from_resource(role.resource)
         if instance_type is not None:
             container["instanceType"] = instance_type
@@ -346,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
 @_thread_local_cache
-def _local_session() -> "boto3.session.Session":
+def _local_session() -> "boto3.session.Session":  # noqa: F821
     import boto3.session
     return boto3.session.Session()
@@ -361,11 +396,10 @@ class AWSBatchOpts(TypedDict, total=False):
     priority: int
     job_role_arn: Optional[str]
     execution_role_arn: Optional[str]
+    ulimits: Optional[list[str]]
-class AWSBatchScheduler(
-    DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
-):
+class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
     """
     AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
@@ -506,6 +540,7 @@ class AWSBatchScheduler(
             role = values.apply(role)
             role.env[ENV_TORCHX_ROLE_IDX] = str(role_idx)
             role.env[ENV_TORCHX_ROLE_NAME] = str(role.name)
+            role.env[ENV_TORCHX_IMAGE] = role.image
             nodes.append(
                 _role_to_node_properties(
@@ -514,6 +549,7 @@ class AWSBatchScheduler(
                     privileged=cfg["privileged"],
                     job_role_arn=cfg.get("job_role_arn"),
                     execution_role_arn=cfg.get("execution_role_arn"),
+                    ulimits=parse_ulimits(cfg.get("ulimits") or []),
                 )
             )
             node_idx += role.num_replicas
@@ -599,6 +635,11 @@ class AWSBatchScheduler(
             type_=str,
             help="The Amazon Resource Name (ARN) of the IAM role that the ECS agent can assume for AWS permissions.",
         )
+        opts.add(
+            "ulimits",
+            type_=List[str],
+            help="Ulimit settings in format: name:softLimit:hardLimit (multiple separated by commas)",
+        )
         return opts
     def _get_job_id(self, app_id: str) -> Optional[str]:

torchx/schedulers/aws_sagemaker_scheduler.py CHANGED Viewed

@@ -157,7 +157,7 @@ def _merge_ordered(
 class AWSSageMakerScheduler(
     DockerWorkspaceMixin,
-    Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
+    Scheduler[AWSSageMakerOpts],
 ):
     """
     AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.

torchx/schedulers/docker_scheduler.py CHANGED Viewed

@@ -84,6 +84,8 @@ LABEL_APP_ID: str = "torchx.pytorch.org/app-id"
 LABEL_ROLE_NAME: str = "torchx.pytorch.org/role-name"
 LABEL_REPLICA_ID: str = "torchx.pytorch.org/replica-id"
+ENV_TORCHX_IMAGE: str = "TORCHX_IMAGE"
 NETWORK = "torchx"
@@ -127,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
     privileged: bool
-class DockerScheduler(
-    DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
-):
+class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
     """
     DockerScheduler is a TorchX scheduling interface to Docker.
@@ -279,6 +279,7 @@ class DockerScheduler(
                 # configure distributed host envs
                 env["TORCHX_RANK0_HOST"] = rank0_name
+                env[ENV_TORCHX_IMAGE] = replica_role.image
                 c = DockerContainer(
                     image=replica_role.image,

torchx/schedulers/ids.py CHANGED Viewed

@@ -8,9 +8,9 @@
 # pyre-strict
 import os
-import random
 import struct
 START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
 END_CANDIDATES: str = START_CANDIDATES + "012345679"
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
     """
     Appends a unique 64-bit string to the input argument.
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+    Args:
+        name: the name string to unique-ify
+        string_length: max length of the unique 64-bit string to append to the ``name``.
+          Default is 0, which returns the length of a randomly generated 64-bit string (typically 11-14 characters long).
     Returns:
-        string in format $name-$unique_suffix
+        string in format ``{name}-{unique_suffix}``
     """
-    return (
-        f"{name}-{random_id()}"
-        if string_length == 0
-        else f"{name}-{get_len_random_id(string_length)}"
-    )
+    max_length = None if string_length == 0 else string_length
+    return f"{name}-{random_id(max_length)}"
 def random_uint64() -> int:
@@ -36,13 +41,24 @@ def random_uint64() -> int:
     return struct.unpack("!Q", os.urandom(8))[0]
-def random_id() -> str:
+def random_id(max_length: int | None = None) -> str:
     """
     Generates an alphanumeric string ID that matches the requirements from
     https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+    If ``max_length`` is provided, the returned ID will be at most that many characters long.
     """
+    # If a max_length is provided and is non-positive, return empty string
+    if max_length is not None and max_length <= 0:
+        return ""
     out = ""
     v = random_uint64()
     while v > 0:
         if out == "":
             candidates = START_CANDIDATES
@@ -52,21 +68,9 @@ def random_id() -> str:
         char = v % len(candidates)
         v = v // len(candidates)
         out += candidates[char]
-    return out
-def get_len_random_id(string_length: int) -> str:
-    """
-    Generates an alphanumeric string ID that matches the requirements from
-    https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
-    """
-    out = ""
-    for i in range(string_length):
-        if out == "":
-            candidates = START_CANDIDATES
-        else:
-            candidates = END_CANDIDATES
-        out += random.choice(candidates)
+        if max_length is not None and len(out) >= max_length:
+            break
+    # NOTE: statistically the length of `out` is typically between 12-14 characters long
     return out

torchx/schedulers/kubernetes_mcad_scheduler.py CHANGED Viewed

@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
     network: Optional[str]
-class KubernetesMCADScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
-):
+class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
     """
     KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.

torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl

torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl