torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +102 -81
- torchx/runner/config.py +3 -1
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +66 -25
- torchx/schedulers/aws_batch_scheduler.py +47 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +4 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +355 -36
- torchx/schedulers/local_scheduler.py +2 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +102 -27
- torchx/specs/__init__.py +40 -9
- torchx/specs/api.py +222 -12
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/specs/overlays.py +106 -0
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/schedulers/kubernetes_scheduler.py

@@ -27,10 +27,81 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )
 
 import torchx
@@ -77,7 +149,6 @@ from torchx.specs.api import (
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
 
-
 if TYPE_CHECKING:
     from docker import DockerClient
     from kubernetes.client import ApiClient, CustomObjectsApi
@@ -87,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
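Note: the merge rules above (dict: upsert, list: append, tuple: replace) are easiest to see on plain dictionaries. The following sketch is illustrative only and standalone; it mirrors the nested deep_merge helper but is not part of the package, and all values are made up.

    # Standalone illustration of the overlay merge rules.
    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)      # dict: recursive upsert
            elif isinstance(value, tuple):
                base[key] = list(value)           # tuple: replace
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)           # list: append
            else:
                base[key] = value                 # primitives / new keys: replace

    pod = {"spec": {"tolerations": [{"key": "a"}], "nodeSelector": {"gpu": "true"}}}
    overlay = {"spec": {"tolerations": [{"key": "b"}], "volumes": ({"name": "v"},)}}
    deep_merge(pod, overlay)
    # pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "b"}]   (list appended)
    # pod["spec"]["volumes"]     == [{"name": "v"}]                (tuple replaced)
    # pod["spec"]["nodeSelector"] is preserved                     (dict upserted)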
@@ -188,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
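Note: a hypothetical call of the extended signature is sketched below. It requires the kubernetes client at call time; the image, entrypoint, and resource values are placeholders, not defaults from the package.

    # Illustrative only: exercise the new keyword arguments of role_to_pod().
    from torchx.schedulers.kubernetes_scheduler import role_to_pod
    from torchx.specs import Resource, Role

    role = Role(
        name="trainer",
        image="my-image:latest",                     # placeholder image
        entrypoint="train.py",
        resource=Resource(cpu=8, gpu=1, memMB=32 * 1024),
    )
    pod = role_to_pod(
        name="trainer-0",
        role=role,
        service_account=None,
        reserved_millicpu=250,   # reserve 250m instead of the 100m default
        reserved_memmb=2048,     # reserve 2048MB instead of the 1024MB default
        efa_device_count=0,      # strip any vpc.amazonaws.com/efa devices
    )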
@@ -218,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
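Note: the three-way efa_device_count contract can be checked in isolation. This standalone sketch repeats the logic on a bare limits dict and is not the scheduler's own API.

    # None = keep the resource spec, 0 = strip EFA, N > 0 = force N devices.
    from typing import Dict, Optional

    EFA_DEVICE = "vpc.amazonaws.com/efa"

    def apply_efa_override(limits: Dict[str, str], efa_device_count: Optional[int]) -> Dict[str, str]:
        if efa_device_count is not None:
            if efa_device_count == 0:
                limits.pop(EFA_DEVICE, None)
            else:
                limits[EFA_DEVICE] = str(efa_device_count)
        return limits

    print(apply_efa_override({EFA_DEVICE: "4"}, None))  # {'vpc.amazonaws.com/efa': '4'}
    print(apply_efa_override({EFA_DEVICE: "4"}, 0))     # {}
    print(apply_efa_override({}, 2))                    # {'vpc.amazonaws.com/efa': '2'}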
@@ -369,7 +493,10 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str, object]:
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -399,8 +526,27 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
-
-            pod = role_to_pod(name, replica_role, service_account)
+            replica_role.env["TORCHX_IMAGE"] = replica_role.image
+
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
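Note: when metadata["kubernetes"] is a string, the scheduler resolves it with fsspec before merging, as in the branch above. A minimal sketch of that resolution, assuming fsspec and PyYAML are installed and using a hypothetical local path:

    # Resolve a URI-valued overlay into a dict.
    import fsspec
    import yaml

    overlay_uri = "file:///path/to/pod_overlay.yaml"   # hypothetical path
    with fsspec.open(overlay_uri, "r") as f:
        # unsafe_load keeps the !!python/tuple tag (replace semantics for lists);
        # only use it on overlay files you trust.
        overlay = yaml.unsafe_load(f)

    assert isinstance(overlay, dict)
    # `overlay` can now be merged onto a generated pod via _apply_pod_overlay(pod, overlay).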
@@ -443,7 +589,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str, object] = {
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -455,7 +601,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str, object]
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -470,12 +616,13 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
-class KubernetesScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
-):
+class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     """
     KubernetesScheduler is a TorchX scheduling interface to Kubernetes.
 
@@ -485,7 +632,7 @@ class KubernetesScheduler(
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -503,6 +650,16 @@ class KubernetesScheduler(
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -581,9 +738,14 @@ class KubernetesScheduler(
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -635,7 +797,7 @@ class KubernetesScheduler(
         else:
             raise
 
-        return f'{namespace}:{resp["metadata"]["name"]}'
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -657,7 +819,56 @@ class KubernetesScheduler(
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
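Note: the new pre-submission check guards the Kubernetes 63-character name limit. The arithmetic is easy to reproduce; the job and role names below are examples only.

    # Longest generated pod name is "{job_name}-{task_name}-{replicas - 1}".
    job_name = "trainer-pgv0qh2cqkp0vd"   # example unique app id
    task_name = "worker"
    replicas = 16

    pod_name = f"{job_name}-{task_name}-{replicas - 1}"
    print(pod_name, len(pod_name))        # trainer-pgv0qh2cqkp0vd-worker-15, 32
    assert len(pod_name) <= 63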
@@ -669,6 +880,31 @@ class KubernetesScheduler(
         pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -702,19 +938,52 @@ class KubernetesScheduler(
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-        resp = self._custom_objects_api().get_namespaced_custom_object_status(
-            group="batch.volcano.sh",
-            version="v1alpha1",
-            namespace=namespace,
-            plural="jobs",
-            name=name,
-        )
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
         status = resp.get("status")
         if status:
             state_str = status["state"]["phase"]
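Note: the new run options (validate_spec, reserved_millicpu, reserved_memmb, efa_device_count) are ordinary scheduler cfg values. A hedged sketch of programmatic submission follows; the AppDef and cfg values are placeholders, not recommendations.

    # Illustrative submission; component/app values are made up.
    from torchx.runner import get_runner
    from torchx.specs import AppDef, Role

    app = AppDef(
        name="trainer",
        roles=[Role(name="worker", image="my-image:latest", entrypoint="train.py")],
    )

    runner = get_runner()
    app_handle = runner.run(
        app,
        scheduler="kubernetes",
        cfg={
            "queue": "default",
            "namespace": "default",
            "validate_spec": True,    # server-side dry-run before submitting
            "reserved_millicpu": 250,
            "reserved_memmb": 2048,
            "efa_device_count": 0,    # strip EFA devices from generated pods
        },
    )
    print(runner.status(app_handle))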
@@ -723,18 +992,44 @@ class KubernetesScheduler(
             TASK_STATUS_COUNT = "taskStatusCount"
 
             if TASK_STATUS_COUNT in status:
-                for role, status in status[TASK_STATUS_COUNT].items():
-                    role, _, idx = role.rpartition("-")
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")
 
-                    state_str = next(iter(status["phase"].keys()))
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]
 
                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
+
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
             else:
                 app_state = AppState.UNKNOWN
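Note: the hostname reported per replica is derived from the pod IP using Kubernetes' IP-based pod DNS form; the conversion itself is a one-liner. The IP and namespace below are examples.

    pod_ip = "10.244.1.5"          # example pod IP
    namespace = "default"
    hostname = f"{pod_ip.replace('.', '-')}.{namespace}.pod.cluster.local"
    print(hostname)                # 10-244-1-5.default.pod.cluster.local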
@@ -778,7 +1073,10 @@ class KubernetesScheduler(
         core_api = client.CoreV1Api(self._api_client())
         if should_tail:
             w = watch.Watch()
-            iterator = w.stream(core_api.read_namespaced_pod_log, **args)
+            iterator = (
+                f"{line}\n"
+                for line in w.stream(core_api.read_namespaced_pod_log, **args)
+            )
         else:
             resp = core_api.read_namespaced_pod_log(**args)
             iterator = split_lines(resp)
@@ -823,13 +1121,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        # must be 63 characters or less (can be empty),
+        # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
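Note: the label sanitization can be exercised on its own. The standalone copy below repeats the three substitutions plus the trim and shows the effect on a few example values (the version string is made up).

    import re

    def clean(label_value: str) -> str:
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)   # invalid chars -> "."
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)     # bad leading run -> "."
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)     # bad trailing run -> "."
        return label_value[:63]                                       # 63-char limit

    print(clean("0.1.2.dev20260111+cu121"))   # 0.1.2.dev20260111.cu121
    print(clean("my app/v2"))                 # my.app.v2
    print(clean("-leading-and-trailing-"))    # .leading-and-trailing.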
torchx/schedulers/local_scheduler.py

@@ -529,7 +529,7 @@ def _register_termination_signals() -> None:
     signal.signal(signal.SIGINT, _terminate_process_handler)
 
 
-class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
+class LocalScheduler(Scheduler[LocalOpts]):
     """
     Schedules on localhost. Containers are modeled as processes and
     certain properties of the container that are either not relevant
@@ -1159,6 +1159,7 @@ class LogIterator:
             self._check_finished()  # check to see if app has finished running
 
         if os.path.isfile(self._log_file):
+            time.sleep(0.1)  # fix timing issue
            self._log_fp = open(
                self._log_file,
                mode="rt",