torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (110)
  1. torchx/__init__.py +2 -0
  2. torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
  3. torchx/apps/serve/serve.py +2 -0
  4. torchx/apps/utils/booth_main.py +2 -0
  5. torchx/apps/utils/copy_main.py +2 -0
  6. torchx/apps/utils/process_monitor.py +2 -0
  7. torchx/cli/__init__.py +2 -0
  8. torchx/cli/argparse_util.py +38 -3
  9. torchx/cli/cmd_base.py +2 -0
  10. torchx/cli/cmd_cancel.py +2 -0
  11. torchx/cli/cmd_configure.py +2 -0
  12. torchx/cli/cmd_delete.py +30 -0
  13. torchx/cli/cmd_describe.py +2 -0
  14. torchx/cli/cmd_list.py +8 -4
  15. torchx/cli/cmd_log.py +6 -24
  16. torchx/cli/cmd_run.py +269 -45
  17. torchx/cli/cmd_runopts.py +2 -0
  18. torchx/cli/cmd_status.py +12 -1
  19. torchx/cli/cmd_tracker.py +3 -1
  20. torchx/cli/colors.py +2 -0
  21. torchx/cli/main.py +4 -0
  22. torchx/components/__init__.py +3 -8
  23. torchx/components/component_test_base.py +2 -0
  24. torchx/components/dist.py +18 -7
  25. torchx/components/integration_tests/component_provider.py +4 -2
  26. torchx/components/integration_tests/integ_tests.py +2 -0
  27. torchx/components/serve.py +2 -0
  28. torchx/components/structured_arg.py +4 -3
  29. torchx/components/utils.py +15 -4
  30. torchx/distributed/__init__.py +2 -4
  31. torchx/examples/apps/datapreproc/datapreproc.py +2 -0
  32. torchx/examples/apps/lightning/data.py +5 -3
  33. torchx/examples/apps/lightning/model.py +7 -6
  34. torchx/examples/apps/lightning/profiler.py +7 -4
  35. torchx/examples/apps/lightning/train.py +11 -2
  36. torchx/examples/torchx_out_of_sync_training.py +11 -0
  37. torchx/notebook.py +2 -0
  38. torchx/runner/__init__.py +2 -0
  39. torchx/runner/api.py +167 -60
  40. torchx/runner/config.py +43 -10
  41. torchx/runner/events/__init__.py +57 -13
  42. torchx/runner/events/api.py +14 -3
  43. torchx/runner/events/handlers.py +2 -0
  44. torchx/runtime/tracking/__init__.py +2 -0
  45. torchx/runtime/tracking/api.py +2 -0
  46. torchx/schedulers/__init__.py +16 -15
  47. torchx/schedulers/api.py +70 -14
  48. torchx/schedulers/aws_batch_scheduler.py +75 -6
  49. torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
  50. torchx/schedulers/devices.py +17 -4
  51. torchx/schedulers/docker_scheduler.py +43 -11
  52. torchx/schedulers/ids.py +29 -23
  53. torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
  54. torchx/schedulers/kubernetes_scheduler.py +383 -38
  55. torchx/schedulers/local_scheduler.py +100 -27
  56. torchx/schedulers/lsf_scheduler.py +5 -4
  57. torchx/schedulers/slurm_scheduler.py +336 -20
  58. torchx/schedulers/streams.py +2 -0
  59. torchx/specs/__init__.py +89 -12
  60. torchx/specs/api.py +418 -30
  61. torchx/specs/builders.py +176 -38
  62. torchx/specs/file_linter.py +143 -57
  63. torchx/specs/finder.py +68 -28
  64. torchx/specs/named_resources_aws.py +181 -4
  65. torchx/specs/named_resources_generic.py +2 -0
  66. torchx/specs/overlays.py +106 -0
  67. torchx/specs/test/components/__init__.py +2 -0
  68. torchx/specs/test/components/a/__init__.py +2 -0
  69. torchx/specs/test/components/a/b/__init__.py +2 -0
  70. torchx/specs/test/components/a/b/c.py +2 -0
  71. torchx/specs/test/components/c/__init__.py +2 -0
  72. torchx/specs/test/components/c/d.py +2 -0
  73. torchx/tracker/__init__.py +12 -6
  74. torchx/tracker/api.py +15 -18
  75. torchx/tracker/backend/fsspec.py +2 -0
  76. torchx/util/cuda.py +2 -0
  77. torchx/util/datetime.py +2 -0
  78. torchx/util/entrypoints.py +39 -15
  79. torchx/util/io.py +2 -0
  80. torchx/util/log_tee_helpers.py +210 -0
  81. torchx/util/modules.py +65 -0
  82. torchx/util/session.py +42 -0
  83. torchx/util/shlex.py +2 -0
  84. torchx/util/strings.py +3 -1
  85. torchx/util/types.py +90 -29
  86. torchx/version.py +4 -2
  87. torchx/workspace/__init__.py +2 -0
  88. torchx/workspace/api.py +136 -6
  89. torchx/workspace/dir_workspace.py +2 -0
  90. torchx/workspace/docker_workspace.py +30 -2
  91. torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
  92. torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
  93. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
  94. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
  95. torchx/examples/pipelines/__init__.py +0 -0
  96. torchx/examples/pipelines/kfp/__init__.py +0 -0
  97. torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
  98. torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
  99. torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
  100. torchx/pipelines/kfp/__init__.py +0 -28
  101. torchx/pipelines/kfp/adapter.py +0 -271
  102. torchx/pipelines/kfp/version.py +0 -17
  103. torchx/schedulers/gcp_batch_scheduler.py +0 -487
  104. torchx/schedulers/ray/ray_common.py +0 -22
  105. torchx/schedulers/ray/ray_driver.py +0 -307
  106. torchx/schedulers/ray_scheduler.py +0 -453
  107. torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
  108. torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
  109. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
  110. {torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-strict
+
 """
 
 This contains the TorchX Kubernetes scheduler which can be used to run TorchX
@@ -23,12 +25,83 @@ Install Volcano:
     kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.6.0/installer/volcano-development.yaml
 
 See the
-`Volcano Quickstart <https://github.com/volcano-sh/volcano#user-content-quick-start-guide>`_
+`Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -42,12 +115,13 @@ from typing import (
     Optional,
     Tuple,
     TYPE_CHECKING,
+    TypedDict,
+    Union,
 )
 
 import torchx
 import yaml
 from torchx.schedulers.api import (
-    AppDryRunInfo,
     DescribeAppResponse,
     filter_regex,
     ListAppResponse,
@@ -58,6 +132,7 @@ from torchx.schedulers.api import (
 from torchx.schedulers.ids import make_unique
 from torchx.specs.api import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     BindMount,
     CfgVal,
@@ -73,8 +148,6 @@ from torchx.specs.api import (
 )
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
-from typing_extensions import TypedDict
-
 
 if TYPE_CHECKING:
     from docker import DockerClient
@@ -85,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -95,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
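Note: the merge rules implemented by `_apply_pod_overlay` can be exercised standalone. This sketch copies the `deep_merge` body from the hunk above and runs it on made-up sample data; the expected results follow directly from the dict/list/tuple rules:

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # same rules as _apply_pod_overlay: dict upsert, list append, tuple replace
        for key, value in overlay.items():
            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
                deep_merge(base[key], value)
            elif isinstance(value, tuple):
                base[key] = list(value)
            elif isinstance(value, list) and key in base and isinstance(base[key], list):
                base[key].extend(value)
            else:
                base[key] = value

    base = {"spec": {"nodeSelector": {"zone": "a"}, "tolerations": [{"key": "x"}], "volumes": [{"name": "old"}]}}
    overlay = {"spec": {"nodeSelector": {"gpu": "true"}, "tolerations": [{"key": "y"}], "volumes": ({"name": "new"},)}}
    deep_merge(base, overlay)
    assert base["spec"]["nodeSelector"] == {"zone": "a", "gpu": "true"}  # dict: upsert
    assert base["spec"]["tolerations"] == [{"key": "x"}, {"key": "y"}]   # list: append
    assert base["spec"]["volumes"] == [{"name": "new"}]                  # tuple: replace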
@@ -167,6 +275,17 @@ ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
 
 LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
 
+# role.env translates to static env variables in the yaml
+#   {"FOO": "bar"}  =====>  - name: FOO
+#                             value: bar
+# unless this placeholder is present at the start of the role.env value, in which case
+# the env variable in the yaml is populated dynamically at runtime (the placeholder is stripped out of the value)
+#   {"FOO": "[FIELD_PATH]bar"}  =====>  - name: FOO
+#                                         valueFrom:
+#                                           fieldRef:
+#                                             fieldPath: bar
+PLACEHOLDER_FIELD_PATH = "[FIELD_PATH]"
+
 
 def sanitize_for_serialization(obj: object) -> object:
     from kubernetes import client
@@ -175,13 +294,22 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
         V1EmptyDirVolumeSource,
         V1EnvVar,
+        V1EnvVarSource,
         V1HostPathVolumeSource,
+        V1ObjectFieldSelector,
         V1ObjectMeta,
         V1PersistentVolumeClaimVolumeSource,
         V1Pod,
@@ -203,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
@@ -301,9 +440,20 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
         image=role.image,
         name=name,
         env=[
-            V1EnvVar(
-                name=name,
-                value=value,
+            (
+                V1EnvVar(
+                    name=name,
+                    value_from=V1EnvVarSource(
+                        field_ref=V1ObjectFieldSelector(
+                            field_path=value.strip(PLACEHOLDER_FIELD_PATH)
+                        )
+                    ),
+                )
+                if value.startswith(PLACEHOLDER_FIELD_PATH)
+                else V1EnvVar(
+                    name=name,
+                    value=value,
+                )
             )
             for name, value in role.env.items()
         ],
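Note: a standalone sketch of the `[FIELD_PATH]` dispatch above, rendering both env-var shapes as plain dicts (stdlib only; the names are illustrative). The release strips the marker with `value.strip(PLACEHOLDER_FIELD_PATH)`; a prefix slice is used here:

    PLACEHOLDER_FIELD_PATH = "[FIELD_PATH]"

    def to_env_entry(name: str, value: str) -> dict:
        # values prefixed with the placeholder become fieldRef entries;
        # everything else stays a static value
        if value.startswith(PLACEHOLDER_FIELD_PATH):
            field_path = value[len(PLACEHOLDER_FIELD_PATH):]
            return {"name": name, "valueFrom": {"fieldRef": {"fieldPath": field_path}}}
        return {"name": name, "value": value}

    assert to_env_entry("FOO", "bar") == {"name": "FOO", "value": "bar"}
    assert to_env_entry("POD_IP", "[FIELD_PATH]status.podIP") == {
        "name": "POD_IP",
        "valueFrom": {"fieldRef": {"fieldPath": "status.podIP"}},
    }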
@@ -343,7 +493,10 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -373,8 +526,27 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
-
-            pod = role_to_pod(name, replica_role, service_account)
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
+
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
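Note: the loader above deliberately uses `yaml.unsafe_load` rather than `yaml.safe_load` so that a file-based overlay can opt into replace semantics via the `!!python/tuple` tag, which PyYAML's unsafe loader deserializes to a Python tuple. A minimal check:

    import yaml

    doc = """
    spec:
      volumes: !!python/tuple
      - name: my-volume
        emptyDir: {}
    """
    overlay = yaml.unsafe_load(doc)
    # the tuple is what _apply_pod_overlay treats as "replace, don't append"
    assert isinstance(overlay["spec"]["volumes"], tuple)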
@@ -417,7 +589,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -429,7 +601,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -444,6 +616,10 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
 class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
@@ -456,7 +632,7 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -474,6 +650,16 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -552,9 +738,14 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -606,7 +797,7 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
         else:
             raise
 
-        return f'{namespace}:{resp["metadata"]["name"]}'
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -628,18 +819,92 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
         )
         return AppDryRunInfo(req, repr)
 
-    def _validate(self, app: AppDef, scheduler: str) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: KubernetesOpts) -> None:
         # Skip validation step
         pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -673,19 +938,52 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
        status = resp.get("status")
        if status:
            state_str = status["state"]["phase"]
@@ -694,18 +992,44 @@ class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
             TASK_STATUS_COUNT = "taskStatusCount"
 
             if TASK_STATUS_COUNT in status:
-                for name, status in status[TASK_STATUS_COUNT].items():
-                    role, _, idx = name.rpartition("-")
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")
 
-                    state_str = next(iter(status["phase"].keys()))
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]
 
                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
+
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
             else:
                 app_state = AppState.UNKNOWN
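Note: the hostname built above is the cluster-internal pod DNS name, produced by dashing the pod IP and appending the namespace; it resolves only from inside the cluster. In isolation:

    pod_ip = "10.244.1.5"
    namespace = "default"
    hostname = f"{pod_ip.replace('.', '-')}.{namespace}.pod.cluster.local"
    assert hostname == "10-244-1-5.default.pod.cluster.local"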
@@ -794,13 +1118,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        #   must be 63 characters or less (can be empty),
+        #   unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        #   could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
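Note: for reference, `clean` (body copied from the hunk above) behaves as follows on a couple of made-up inputs:

    import re

    def clean(label_value: str) -> str:
        # copied from pod_labels above
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
        return label_value[:63]

    assert clean("my app:v1") == "my.app.v1"                # invalid chars become "."
    assert clean("a" * 80) == "a" * 63                      # trimmed to 63 chars
    assert clean("1.2.3dev20251224") == "1.2.3dev20251224"  # already compliant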