torchx-nightly 2025.9.28__py3-none-any.whl → 2025.11.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchx-nightly has been flagged and may be problematic.

@@ -27,10 +27,81 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
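
Why the ``!!python/tuple`` tag matters: overlay files are loaded with PyYAML's
unsafe loader (see ``yaml.unsafe_load`` in a later hunk), which turns that tag
into a real Python tuple, and under the merge rules above tuples mean "replace"
rather than "append". A minimal sketch, assuming PyYAML is installed; the
overlay content is illustrative:

.. code:: python

    import textwrap

    import yaml

    overlay_src = textwrap.dedent(
        """\
        spec:
          tolerations:              # plain YAML list -> appended
          - key: nvidia.com/gpu
            operator: Exists
          volumes: !!python/tuple   # tuple tag -> replaces wholesale
          - name: my-volume
            emptyDir: {}
        """
    )

    overlay = yaml.unsafe_load(overlay_src)
    assert isinstance(overlay["spec"]["tolerations"], list)  # will append
    assert isinstance(overlay["spec"]["volumes"], tuple)     # will replace
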
@@ -45,6 +116,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )
 
 import torchx
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
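
The helper's merge rules are easiest to see on plain dicts, outside of any
Kubernetes objects. A self-contained sketch of the same ``deep_merge`` logic
(the sample pod dict is invented for illustration):

.. code:: python

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # Same rules as _apply_pod_overlay above: dicts merge recursively,
        # tuples replace, lists append, primitives overwrite.
        for key, value in overlay.items():
            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
                deep_merge(base[key], value)
            elif isinstance(value, tuple):
                base[key] = list(value)
            elif isinstance(value, list) and key in base and isinstance(base[key], list):
                base[key].extend(value)
            else:
                base[key] = value

    pod = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "old"}]}}
    deep_merge(
        pod,
        {"spec": {"tolerations": [{"key": "b"}], "volumes": ({"name": "new"},)}},
    )
    assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]  # appended
    assert pod["spec"]["volumes"] == [{"name": "new"}]                 # replaced
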
@@ -369,7 +475,7 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -399,8 +505,20 @@
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
+                replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
             pod = role_to_pod(name, replica_role, service_account)
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
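
For the string branch, any fsspec-resolvable URI should work. A quick local
round-trip sketch (temp file path, not a real workflow; assumes fsspec and
PyYAML are installed):

.. code:: python

    import tempfile

    import fsspec
    import yaml

    # Write a small overlay to disk, then load it the way the scheduler does.
    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp:
        yaml.safe_dump({"spec": {"nodeSelector": {"gpu": "true"}}}, tmp)

    with fsspec.open(f"file://{tmp.name}", "r") as f:
        overlay = yaml.unsafe_load(f)

    assert overlay == {"spec": {"nodeSelector": {"gpu": "true"}}}
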
@@ -443,7 +561,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -455,7 +573,7 @@
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -470,6 +588,7 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
 
 
 class KubernetesScheduler(
@@ -485,7 +604,7 @@ class KubernetesScheduler(
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -635,7 +754,7 @@ class KubernetesScheduler(
         else:
             raise
 
-        return f'{namespace}:{resp["metadata"]["name"]}'
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -658,6 +777,36 @@ class KubernetesScheduler(
         ), "priority_class must be a str"
 
         resource = app_to_resource(app, queue, service_account, priority_class)
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
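
The 63-character cap is the Kubernetes DNS-1123 label limit on pod names, and
the check uses the highest replica index since that is the longest name the job
will ever produce. A sketch of the same arithmetic for sizing app and role
names up front (sample names invented):

.. code:: python

    def worst_case_pod_name(job_name: str, task_name: str, replicas: int) -> str:
        # Mirrors the formula in the dryrun check: the highest-index
        # replica has the longest pod name.
        return f"{job_name}-{task_name}-{replicas - 1}"

    name = worst_case_pod_name("trainer-abc123", "worker", replicas=128)
    assert len(name) <= 63, f"{name!r} would be rejected at submit time"

The server-side dry-run itself can presumably be skipped by setting the new
``validate_spec`` run option to ``False``, at the cost of only catching schema
errors at actual submission.
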
@@ -702,19 +851,32 @@ class KubernetesScheduler(
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
        status = resp.get("status")
        if status:
            state_str = status["state"]["phase"]
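
With this change, ``describe`` on a deleted (or never-created) job returns
``None`` instead of surfacing an ``ApiException``; callers can branch on that.
A sketch, with a hypothetical app id and an already-constructed scheduler:

.. code:: python

    # `scheduler` is a KubernetesScheduler instance; the app id format is
    # "<namespace>:<job name>" as returned by submit.
    resp = scheduler.describe("default:my-job")
    if resp is None:
        print("job not found; it may have been deleted")
    else:
        print(resp.state)
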
@@ -823,13 +985,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        #  must be 63 characters or less (can be empty),
+        #  unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        #  could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
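
A few worked inputs for the new ``clean`` helper (inputs chosen for
illustration; the regexes are exactly the ones above):

.. code:: python

    import re

    def clean(label_value: str) -> str:
        # Same three passes as pod_labels.clean above.
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
        return label_value[:63]

    assert clean("my app/v1.2") == "my.app.v1.2"  # invalid chars -> "."
    assert clean("a" * 100) == "a" * 63           # trimmed to 63 chars
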
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
 
 
+def get_appstate_from_job(job: dict[str, object]) -> AppState:
+    # Prior to slurm-23.11, job_state was a string and not a list
+    job_state = job.get("job_state", None)
+    if isinstance(job_state, list):
+        return appstate_from_slurm_state(job_state[0])
+    else:
+        return appstate_from_slurm_state(str(job_state))
+
+
 def version() -> Tuple[int, int]:
     """
     Uses ``sinfo --version`` to get the slurm version. If the command fails, it
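
The two payload shapes this normalizes: slurm >= 23.11 returns ``job_state``
as a list, older versions as a plain string. A sketch (sample payloads
invented; assumes the function is imported from the module in this diff):

.. code:: python

    new_style = {"job_state": ["RUNNING"]}  # slurm >= 23.11
    old_style = {"job_state": "RUNNING"}    # pre-23.11

    assert get_appstate_from_job(new_style) == get_appstate_from_job(old_style)
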
@@ -666,7 +675,7 @@ class SlurmScheduler(
 
         entrypoint = job["command"]
         image = job["current_working_directory"]
-        state = appstate_from_slurm_state(job["job_state"][0])
+        state = get_appstate_from_job(job)
 
         job_resources = job["job_resources"]
 
@@ -881,7 +890,7 @@ class SlurmScheduler(
             out.append(
                 ListAppResponse(
                     app_id=str(job["job_id"]),
-                    state=SLURM_STATES[job["job_state"][0]],
+                    state=get_appstate_from_job(job),
                     name=job["name"],
                 )
             )
torchx/specs/__init__.py CHANGED
@@ -12,7 +12,9 @@ used by components to define the apps which can then be launched via a TorchX
 scheduler or pipeline adapter.
 """
 import difflib
-from typing import Callable, Dict, Mapping, Optional
+
+import os
+from typing import Callable, Dict, Iterator, Mapping, Optional
 
 from torchx.specs.api import (
     ALL,
@@ -41,9 +43,11 @@ from torchx.specs.api import (
     RoleStatus,
     runopt,
     runopts,
+    TORCHX_HOME,
     UnknownAppException,
     UnknownSchedulerException,
     VolumeMount,
+    Workspace,
 )
 from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
@@ -53,6 +57,7 @@ from torchx.util.modules import import_attr
 
 GiB: int = 1024
 
+
 ResourceFactory = Callable[[], Resource]
 
 AWS_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
@@ -61,8 +66,10 @@ AWS_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
 GENERIC_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
     "torchx.specs.named_resources_generic", "NAMED_RESOURCES", default={}
 )
-FB_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
-    "torchx.specs.fb.named_resources", "NAMED_RESOURCES", default={}
+CUSTOM_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
+    os.environ.get("TORCHX_CUSTOM_NAMED_RESOURCES", "torchx.specs.fb.named_resources"),
+    "NAMED_RESOURCES",
+    default={},
 )
 
 
@@ -73,7 +80,7 @@ def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
     for name, resource in {
         **GENERIC_NAMED_RESOURCES,
         **AWS_NAMED_RESOURCES,
-        **FB_NAMED_RESOURCES,
+        **CUSTOM_NAMED_RESOURCES,
         **resource_methods,
     }.items():
         materialized_resources[name] = resource
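
Because the module path is now read from the environment at import time, a
deployment can point named-resource loading at its own table before
``torchx.specs`` is first imported. A sketch with a hypothetical module name:

.. code:: python

    import os

    # Must happen before the first `import torchx.specs`, since the env var
    # is consulted when the module is imported. "my_company.named_resources"
    # is a hypothetical module exposing a NAMED_RESOURCES mapping; if it is
    # missing, the lookup falls back to the default={} above.
    os.environ["TORCHX_CUSTOM_NAMED_RESOURCES"] = "my_company.named_resources"

    from torchx import specs  # noqa: E402
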
@@ -106,8 +113,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories
 
-    def __iter__(self) -> None:
-        raise NotImplementedError("named resources doesn't support iterating")
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx import specs
+
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+
+        """
+        for key in _named_resource_factories:
+            yield key
 
 
 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
@@ -127,7 +148,7 @@ def resource(
 
     If ``h`` is specified then it is used to look up the
     resource specs from the list of registered named resources.
-    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
+    See `registering named resource <https://meta-pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
 
     Otherwise a ``Resource`` object is created from the raw resource specs.
 
@@ -234,4 +255,6 @@ __all__ = [
     "torchx_run_args_from_json",
     "TorchXRunArgs",
     "ALL",
+    "TORCHX_HOME",
+    "Workspace",
 ]