torchmonarch-nightly 2025.8.2-cp313-cp313-manylinux2014_x86_64.whl → 2025.9.3-cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. monarch/_rust_bindings.so +0 -0
  2. monarch/_src/actor/actor_mesh.py +414 -216
  3. monarch/_src/actor/allocator.py +75 -6
  4. monarch/_src/actor/bootstrap_main.py +7 -4
  5. monarch/_src/actor/code_sync/__init__.py +2 -0
  6. monarch/_src/actor/debugger/__init__.py +7 -0
  7. monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
  8. monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
  9. monarch/_src/actor/endpoint.py +27 -45
  10. monarch/_src/actor/future.py +86 -24
  11. monarch/_src/actor/host_mesh.py +125 -0
  12. monarch/_src/actor/logging.py +94 -0
  13. monarch/_src/actor/pickle.py +25 -0
  14. monarch/_src/actor/proc_mesh.py +423 -156
  15. monarch/_src/actor/python_extension_methods.py +90 -0
  16. monarch/_src/actor/shape.py +8 -1
  17. monarch/_src/actor/source_loader.py +45 -0
  18. monarch/_src/actor/telemetry/__init__.py +172 -0
  19. monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
  20. monarch/_src/debug_cli/__init__.py +7 -0
  21. monarch/_src/debug_cli/debug_cli.py +43 -0
  22. monarch/_src/tensor_engine/rdma.py +64 -9
  23. monarch/_testing.py +1 -3
  24. monarch/actor/__init__.py +24 -4
  25. monarch/common/_C.so +0 -0
  26. monarch/common/device_mesh.py +14 -0
  27. monarch/common/future.py +10 -0
  28. monarch/common/remote.py +14 -25
  29. monarch/common/tensor.py +12 -0
  30. monarch/debug_cli/__init__.py +7 -0
  31. monarch/debug_cli/__main__.py +12 -0
  32. monarch/fetch.py +2 -2
  33. monarch/gradient/_gradient_generator.so +0 -0
  34. monarch/gradient_generator.py +4 -2
  35. monarch/mesh_controller.py +34 -14
  36. monarch/monarch_controller +0 -0
  37. monarch/tools/colors.py +25 -0
  38. monarch/tools/commands.py +42 -7
  39. monarch/tools/components/hyperactor.py +1 -1
  40. monarch/tools/config/__init__.py +31 -4
  41. monarch/tools/config/defaults.py +13 -3
  42. monarch/tools/config/environment.py +45 -0
  43. monarch/tools/config/workspace.py +165 -0
  44. monarch/tools/mesh_spec.py +2 -0
  45. monarch/utils/__init__.py +9 -0
  46. monarch/utils/utils.py +78 -0
  47. tests/error_test_binary.py +5 -3
  48. tests/python_actor_test_binary.py +52 -0
  49. tests/test_actor_error.py +142 -14
  50. tests/test_alloc.py +1 -1
  51. tests/test_allocator.py +59 -72
  52. tests/test_debugger.py +639 -45
  53. tests/test_env_before_cuda.py +4 -4
  54. tests/test_mesh_trait.py +38 -0
  55. tests/test_python_actors.py +965 -75
  56. tests/test_rdma.py +7 -6
  57. tests/test_tensor_engine.py +6 -6
  58. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
  59. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
  60. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
  61. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
  62. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
  63. {torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0
monarch/tools/config/workspace.py ADDED
@@ -0,0 +1,165 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import shutil
+ from pathlib import Path
+
+ from monarch.tools.config.environment import CondaEnvironment, Environment
+
+ ACTIVE_CONDA_ENV = CondaEnvironment()
+
+
+ class Workspace:
+     """
+     A workspace is one or more local directories that contain your project(s).
+     Workspaces can specify an "environment" in which projects are developed and run locally.
+     A currently active conda environment is an example of such an environment.
+
+     At the time of job submission an ephemeral version of the "image" is built and the
+     new job is configured to run on this image. The "image" is the one specified by
+     the `Role.image` attribute in the job's `AppDef`
+     (see `monarch.tools.components.hyperactor.host_mesh()`).
+
+     For example, when launching onto Kubernetes, "image" is interpreted as a Docker image (e.g. "name:tag").
+
+     Specifically, the ephemeral image contains:
+
+     1. A copy of the workspace directories
+     2. (if applicable) A copy of the currently active environment
+
+     This mirrors the local codebase and environment onto the remote machines once, at launch time.
+
+     Workspaces can also be synced interactively on demand (post job launch) by using
+     `monarch.actor.proc_mesh.ProcMesh.syncWorkspace(Workspace)`.
+
+     Usage:
+
+     .. doc-test::
+
+         import pathlib
+         from monarch.tools.config import Workspace
+         from monarch.tools.config import Config
+
+         HOME = pathlib.Path.home()
+
+         # 1. single project workspace
+         config = Config(
+             workspace=Workspace(dirs=[HOME / "github" / "torchtitan"]),
+         )
+
+         # 2. multiple projects (useful for cross-project development)
+         config = Config(
+             workspace=Workspace(
+                 dirs=[
+                     # $HOME/torch (local) -> $WORKSPACE_DIR/torch (remote)
+                     # $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/torchtitan (remote)
+                     HOME / "torch",
+                     HOME / "github" / "torchtitan",
+                 ]
+             ),
+         )
+
+         # 3. with explicit local -> remote mappings
+         config = Config(
+             workspace=Workspace(
+                 dirs={
+                     # $HOME/torch (local) -> $WORKSPACE_DIR/github/pytorch (remote)
+                     # $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/github/torchtitan (remote)
+                     HOME / "torch": "github/pytorch",
+                     HOME / "github" / "torchtitan": "github/torchtitan",
+                 }
+             )
+         )
+         # -- or flat into WORKSPACE_DIR
+         config = Config(
+             workspace=Workspace(
+                 # $HOME/github/torchtitan (local) -> $WORKSPACE_DIR/ (remote)
+                 dirs={HOME / "github" / "torchtitan": ""},
+             )
+         )
+
+         # 4. no project, everything is installed in my environment (but sync my env)
+         config = Config(
+             workspace=Workspace(),
+         )
+
+         # 5. disable project and environment sync
+         config = Config(
+             workspace=Workspace(env=None),
+         )
+     """
+
+     def __init__(
+         self,
+         dirs: list[Path | str] | dict[Path | str, str] | None = None,
+         env: Environment | None = ACTIVE_CONDA_ENV,
+     ) -> None:
+         self.env = env
+         self.dirs: dict[Path, str] = {}  # src -> dst
+
+         if dirs is None:
+             pass
+         elif isinstance(dirs, list):
+             for d in dirs:
+                 d = Path(d)
+                 self.dirs[d] = d.name
+         else:  # dict
+             for src, dst in dirs.items():
+                 self.dirs[Path(src)] = dst
+
+     def __eq__(self, other: object) -> bool:
+         if not isinstance(other, Workspace):
+             return False
+
+         return self.env == other.env and self.dirs == other.dirs
+
+     def merge(self, outdir: str | Path) -> None:
+         """Merges the dirs of this workspace into the given outdir."""
+
+         outdir = Path(outdir)
+         outdir.mkdir(parents=True, exist_ok=True)
+
+         for src, dst in self.dirs.items():
+             shutil.copytree(src, outdir / dst, dirs_exist_ok=True)
+
+     # pyre-ignore[2] skip type-hint to avoid torchx dep
+     def set_env_vars(self, appdef) -> None:
+         """For each role in the appdef, sets the following env vars (if not already set):
+
+         1. `WORKSPACE_DIR`: the root directory of the remote workspace
+         2. `PYTHONPATH`: includes all the remote workspace dirs for all the roles in the appdef
+            (dedups and appends to the existing `PYTHONPATH`)
+         3. `CONDA_DIR`: (if env is conda) the remote path to the conda env to activate
+         """
+
+         # typically this macro comes from torchx.specs.macros.img_root
+         # but we use the str repr instead to avoid taking a dep on torchx from this module
+         # unittest (test_workspace.py) asserts against torchx.specs.macros.img_root
+         # guarding against changes to the macro value
+         img_root_macro = "${img_root}"
+
+         for role in appdef.roles:
+             remote_workspace_root = role.env.setdefault(
+                 "WORKSPACE_DIR",
+                 f"{img_root_macro}/workspace",
+             )
+
+             PYTHONPATH = [p for p in role.env.get("PYTHONPATH", "").split(":") if p]
+             for dst in self.dirs.values():
+                 remote_dir = f"{remote_workspace_root}/{dst}"
+                 if remote_dir not in PYTHONPATH:
+                     PYTHONPATH.append(remote_dir)
+             role.env["PYTHONPATH"] = ":".join(PYTHONPATH)
+
+             if isinstance(self.env, CondaEnvironment):
+                 role.env.setdefault("CONDA_DIR", f"{img_root_macro}/conda")
+
+     @staticmethod
+     def null() -> "Workspace":
+         """Returns a "null" workspace; a workspace with no project dirs and no environment."""
+         return Workspace(env=None)
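
A minimal sketch of how `merge` and `set_env_vars` compose end to end; the `SimpleNamespace` stand-ins for torchx's `AppDef`/`Role` are illustrative assumptions, not part of this diff:

    from pathlib import Path
    from types import SimpleNamespace

    from monarch.tools.config.workspace import Workspace

    # One local project dir mapped under the remote $WORKSPACE_DIR by its basename.
    ws = Workspace(dirs=[Path.home() / "github" / "torchtitan"])

    # Stage a one-time copy of the workspace dirs (e.g. into an image build context).
    # Note: copytree requires the source dirs to actually exist locally.
    ws.merge("/tmp/workspace-staging")

    # set_env_vars expects a torchx-style AppDef; SimpleNamespace mimics its shape here.
    role = SimpleNamespace(env={})
    ws.set_env_vars(SimpleNamespace(roles=[role]))

    assert role.env["WORKSPACE_DIR"] == "${img_root}/workspace"
    assert role.env["PYTHONPATH"] == "${img_root}/workspace/torchtitan"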
monarch/tools/mesh_spec.py CHANGED
@@ -128,6 +128,7 @@ class ServerSpec:
      meshes: list[MeshSpec]
      scheduler: str
      namespace: str = ""
+     ui_url: Optional[str] = None
 
      @property
      def server_handle(self) -> str:
@@ -210,6 +211,7 @@ class ServerSpec:
          return {
              "name": self.name,
              "server_handle": self.server_handle,
+             **({"ui_url": self.ui_url} if self.ui_url else {}),
              "state": self.state.name,
              "meshes": {
                  mesh.name: {
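
The `**({...} if ... else {})` spread in `to_json` above is the standard idiom for emitting a JSON key only when it has a value. A self-contained sketch (the `summary` helper is illustrative, not from the package):

    from typing import Optional

    def summary(name: str, ui_url: Optional[str] = None) -> dict:
        # "ui_url" appears in the output only when set, mirroring ServerSpec.to_json().
        return {
            "name": name,
            **({"ui_url": ui_url} if ui_url else {}),
        }

    assert "ui_url" not in summary("job")
    assert summary("job", "https://example.com/ui")["ui_url"] == "https://example.com/ui"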
monarch/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .utils import setup_env_for_distributed
+
+ __all__ = ["setup_env_for_distributed"]
monarch/utils/utils.py ADDED
@@ -0,0 +1,78 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ # pyre-strict
+ import os
+ import socket
+
+ from monarch.actor import Actor, current_rank, endpoint, ProcMesh
+
+
+ def _find_free_port() -> int:
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         s.bind(("localhost", 0))
+         addr = s.getsockname()
+         port = addr[1]
+         return port
+
+
+ class _TorchDistributedInitActor(Actor):
+     def __init__(self) -> None:
+         self.rank: int = current_rank().rank
+
+     @endpoint
+     def get_host_port(self) -> tuple[str, int]:
+         return (socket.gethostname(), _find_free_port())
+
+     @endpoint
+     def setup_env(self, master_addr: str, master_port: int) -> None:
+         cr = current_rank()
+         # Assume the last dimension is the local rank.
+         last_label = cr.extent.labels[-1]
+         local_world_size = cr.size(last_label)
+         world_size = cr.extent.nelements
+         global_rank = cr.rank
+         local_rank = min(world_size, global_rank % local_world_size)
+         group_rank = global_rank // local_world_size
+         group_world_size = (world_size + local_world_size - 1) // local_world_size
+         env = {
+             "MASTER_ADDR": master_addr,
+             "MASTER_PORT": str(master_port),
+             "RANK": str(global_rank),
+             "LOCAL_RANK": str(local_rank),
+             "LOCAL_WORLD_SIZE": str(local_world_size),
+             "GROUP_RANK": str(group_rank),
+             "GROUP_WORLD_SIZE": str(group_world_size),
+             "ROLE_RANK": str(global_rank),
+             "ROLE_WORLD_SIZE": str(world_size),
+             "ROLE_NAME": "rank",
+             "WORLD_SIZE": str(world_size),
+         }
+         os.environ.update(env)
+
+
+ async def setup_env_for_distributed(
+     proc_mesh: ProcMesh,
+     master_addr: str | None = None,
+     master_port: int | None = None,
+ ) -> None:
+     """
+     Sets up environment variables for PyTorch distributed.
+     It selects one proc in the proc_mesh to be the master node.
+     It sets environment variables like RANK, LOCAL_RANK, WORLD_SIZE, etc.
+     If master_addr and master_port are None, it will automatically select a master node and port.
+     """
+     assert (
+         (master_addr is None) == (master_port is None)
+     ), "Either both master_addr and master_port must be specified or neither must be specified."
+     am = await proc_mesh.spawn("_TorchDistributedInitActor", _TorchDistributedInitActor)
+     if master_addr is None:
+         # We use call instead of call_one because call_one can't handle tuple return types.
+         vm = await am.flatten("rank").slice(rank=0).get_host_port.call()
+         master_addr, master_port = vm.item()
+     assert master_port is not None, "master_port should not be None here."
+     await am.setup_env.call(master_addr, master_port)
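
A hedged sketch of how `setup_env_for_distributed` might be used to bootstrap `torch.distributed` across a proc mesh; the `Trainer` actor and the `gloo` backend choice are illustrative assumptions, not part of this diff:

    import asyncio

    import torch.distributed as dist
    from monarch.actor import Actor, endpoint, proc_mesh
    from monarch.utils import setup_env_for_distributed


    class Trainer(Actor):
        @endpoint
        async def init_pg(self) -> int:
            # MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE/... were populated on this proc
            # by setup_env_for_distributed, so init_process_group can read them.
            dist.init_process_group(backend="gloo")
            return dist.get_rank()


    async def main() -> None:
        pm = await proc_mesh(gpus=4)
        await setup_env_for_distributed(pm)  # must run before any init_process_group
        trainers = await pm.spawn("trainer", Trainer)
        print(await trainers.init_pg.call())


    asyncio.run(main())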
tests/error_test_binary.py CHANGED
@@ -4,9 +4,10 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.
 
+ # pyre-unsafe
+
  import asyncio
  import ctypes
- import sys
 
  import click
  from monarch._rust_bindings.monarch_extension.blocking import blocking_function
@@ -158,8 +159,9 @@ def error_endpoint(num_procs, sync_test_impl, sync_endpoint, endpoint_name):
  @main.command("error-bootstrap")
  def error_bootstrap():
      print("Started function error_bootstrap", flush=True)
-
-     proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()
+     proc_mesh(
+         gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}
+     ).initialized.get()
 
 
  async def _error_unmonitored():
tests/python_actor_test_binary.py ADDED
@@ -0,0 +1,52 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import asyncio
+ import logging
+
+ import click
+
+ from monarch.actor import Actor, endpoint, proc_mesh
+
+
+ @click.group()
+ def main() -> None:
+     pass
+
+
+ class Printer(Actor):
+     def __init__(self) -> None:
+         self.logger: logging.Logger = logging.getLogger()
+
+     @endpoint
+     async def print(self, content: str) -> None:
+         print(f"{content}", flush=True)
+
+
+ async def _flush_logs() -> None:
+     # Create a lot of processes to stress test the logging
+     pm = await proc_mesh(gpus=32)
+
+     # never flush
+     await pm.logging_option(aggregate_window_sec=1000)
+     am = await pm.spawn("printer", Printer)
+
+     # These should be streamed to client
+     for _ in range(5):
+         await am.print.call("has print streaming")
+
+     await pm.stop()
+
+
+ @main.command("flush-logs")
+ def flush_logs() -> None:
+     asyncio.run(_flush_logs())
+
+
+ if __name__ == "__main__":
+     main()
tests/test_actor_error.py CHANGED
@@ -4,6 +4,7 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.
 
+ # pyre-unsafe
 
  import importlib.resources
  import os
@@ -34,6 +35,24 @@ class ExceptionActorSync(Actor):
          raise Exception("This is a test exception")
 
 
+ class NestedExceptionActor(Actor):
+     @endpoint
+     async def raise_exception_with_context(self) -> None:
+         try:
+             raise Exception("Inner exception")
+         except Exception:
+             # Don't use `from` here to set __context__ instead of __cause__
+             raise Exception("Outer exception")
+
+     @endpoint
+     async def raise_exception_with_cause(self) -> None:
+         try:
+             raise Exception("Inner exception")
+         except Exception as e:
+             # Use `from` here to set __cause__ instead of __context__
+             raise Exception("Outer exception") from e
+
+
  class BrokenPickleClass:
      """A class that can be configured to raise exceptions during pickling/unpickling."""
 
@@ -116,6 +135,41 @@ def test_actor_exception_sync(mesh, actor_class, num_procs):
      exception_actor.raise_exception.call().get()
 
 
+ @pytest.mark.parametrize(
+     "mesh",
+     [local_proc_mesh, proc_mesh],
+     ids=["local_proc_mesh", "distributed_proc_mesh"],
+ )
+ async def test_actor_error_message(mesh):
+     """
+     Test that exceptions raised in actor endpoints capture nested exceptions.
+     """
+     proc = mesh(gpus=2)
+     exception_actor = await proc.spawn("exception_actor", NestedExceptionActor)
+
+     with pytest.raises(ActorError) as exc_info:
+         await exception_actor.raise_exception_with_cause.call()
+
+     # Make sure both exception messages are present in the message.
+     assert "Inner exception" in str(exc_info.value)
+     assert "Outer exception" in str(exc_info.value)
+     # Make sure the "cause" is set.
+     assert "The above exception was the direct cause of the following exception" in str(
+         exc_info.value
+     )
+
+     with pytest.raises(ActorError) as exc_info:
+         await exception_actor.raise_exception_with_context.call()
+
+     # Make sure both exception messages are present in the message.
+     assert "Inner exception" in str(exc_info.value)
+     assert "Outer exception" in str(exc_info.value)
+     # Make sure the "context" is set.
+     assert "During handling of the above exception, another exception occurred" in str(
+         exc_info.value
+     )
+
+
  '''
  # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
  @pytest.mark.oss_skip
@@ -436,14 +490,14 @@ async def test_proc_mesh_monitoring(mesh):
      event = await anext(monitor)
      assert isinstance(event, ProcEvent.Crashed)
      assert event[0] == 0  # check rank
-     assert "ActorFailureError" in event[1]  # check error message
+     assert "failed: did not handle supervision event" in event[1]  # check error message
      assert (
          "Simulated actor failure for supervision testing" in event[1]
      )  # check error message
 
      # should not be able to spawn actors anymore as proc mesh is unhealthy
      with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
-         await proc.spawn("ex", ExceptionActorSync)
+         await proc.spawn("ex", ExceptionActorSync).initialized
 
 
@@ -467,16 +521,19 @@ async def test_actor_mesh_supervision_handling(mesh):
      await e.check.call()
 
      # existing call should fail with supervision error
-     with pytest.raises(SupervisionError, match="supervision error:"):
+     with pytest.raises(
+         SupervisionError,
+         match=".*Actor .* exited because of the following reason",
+     ):
          await e.fail_with_supervision_error.call_one()
 
      # new call should fail with check of health state of actor mesh
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
          await e.check.call()
 
      # should not be able to spawn actors anymore as proc mesh is unhealthy
      with pytest.raises(SupervisionError, match="proc mesh is stopped with reason"):
-         await proc.spawn("ex", ExceptionActorSync)
+         await proc.spawn("ex", ExceptionActorSync).initialized
 
 
  class HealthyActor(Actor):
@@ -534,11 +591,14 @@ async def test_actor_mesh_supervision_handling_chained_error(mesh):
      # in a chain of client -> Intermediate -> ErrorActor, a supervision error
      # happening in ErrorActor will be captured by Intermediate and re-raised
      # as an application error (ActorError).
-     with pytest.raises(ActorError, match="supervision error:"):
+     with pytest.raises(
+         ActorError,
+         match=".*Actor .* exited because of the following reason",
+     ):
          await intermediate_actor.forward_error.call()
 
      # calling success endpoint should fail with ActorError, but with supervision msg.
-     with pytest.raises(ActorError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(ActorError, match="Actor .* is unhealthy with reason"):
          await intermediate_actor.forward_success.call()
 
      # healthy actor should still be working
@@ -567,11 +627,14 @@ async def test_base_exception_handling(mesh, method_name):
      method = getattr(error_actor, method_name)
 
      # The call should raise a SupervisionError
-     with pytest.raises(SupervisionError, match="supervision error:"):
+     with pytest.raises(
+         SupervisionError,
+         match=".*Actor .* exited because of the following reason",
+     ):
          await method.call_one()
 
      # Subsequent calls should fail with a health state error
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(RuntimeError, match="Actor .* is unhealthy with reason"):
          await error_actor.check.call()
 
 
@@ -587,18 +650,24 @@ async def test_supervision_with_proc_mesh_stopped(mesh):
      await proc.stop()
 
      # new call should fail with check of health state of actor mesh
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(
+         SupervisionError, match="actor mesh is stopped due to proc mesh shutdown"
+     ):
          await actor_mesh.check.call()
 
      # proc mesh cannot spawn new actors anymore
      with pytest.raises(RuntimeError, match="`ProcMesh` has already been stopped"):
-         await proc.spawn("immediate", Intermediate)
+         await proc.spawn("immediate", Intermediate).initialized
 
 
  # TODO - re-enable after resolving T232206970
  @pytest.mark.oss_skip
  async def test_supervision_with_sending_error():
+     # Messages of length > this will cause a send error and a returned
+     # undeliverable.
      os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "50000000"
+     # Limit retries for sending before giving up.
+     os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "5"
 
      proc = await proc_mesh(gpus=1)
      actor_mesh = await proc.spawn("healthy", HealthyActor)
@@ -610,12 +679,71 @@ async def test_supervision_with_sending_error():
      # send a large payload to trigger send timeout error
      with pytest.raises(
-         SupervisionError, match="supervision error:.*actor mesh is stopped"
+         SupervisionError,
+         match=".*Actor .* exited because of the following reason",
      ):
          await actor_mesh.check_with_payload.call(payload="a" * 55000000)
 
      # new call should fail with check of health state of actor mesh
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
          await actor_mesh.check.call()
-     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason"):
          await actor_mesh.check_with_payload.call(payload="a")
+
+
+ async def test_slice_supervision() -> None:
+     pm = await proc_mesh(gpus=4)
+     healthy_mesh = await pm.spawn("healthy", HealthyActor)
+     error_mesh = await pm.spawn("error", ErrorActor)
+     slice_1 = error_mesh.slice(gpus=slice(2, 4))
+     slice_2 = error_mesh.slice(gpus=2)
+     slice_3 = error_mesh.slice(gpus=3)
+
+     # Trigger supervision error on gpus=3
+     with pytest.raises(SupervisionError, match="did not handle supervision event"):
+         await slice_3.fail_with_supervision_error.call()
+
+     # Mesh containing all gpus is unhealthy
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
+         await error_mesh.check.call()
+
+     # Slice containing only gpus=3 is unhealthy
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
+         await slice_3.check.call()
+
+     # Slice containing gpus=3 is unhealthy
+     with pytest.raises(SupervisionError, match="Actor .* is unhealthy with reason:"):
+         await slice_1.check.call()
+
+     # Slice not containing gpus=3 is healthy
+     check = await slice_2.check.call()
+     for _, item in check.items():
+         assert item == "this is a healthy check"
+
+     # Other actor mesh on the same proc mesh is healthy
+     check = await healthy_mesh.check.call()
+     for _, item in check.items():
+         assert item == "this is a healthy check"
+
+
+ async def test_mesh_slices_inherit_parent_errors() -> None:
+     pm = await proc_mesh(gpus=4)
+     error_mesh = await pm.spawn("error", ErrorActor)
+     slice_1 = error_mesh.slice(gpus=slice(2, 4))
+
+     # Trigger supervision error on gpus=2 and 3
+     with pytest.raises(SupervisionError):
+         await slice_1.fail_with_supervision_error.call()
+
+     # Newly created slice containing gpus=3 is unhealthy
+     slice_2 = error_mesh.slice(gpus=3)
+     with pytest.raises(SupervisionError):
+         await slice_2.check.call()
+
+     # Newly created slice containing gpus=1 is healthy
+     slice_3 = error_mesh.slice(gpus=1)
+     check = await slice_3.check.call()
+     for _, item in check.items():
+         assert item == "this is a healthy check"
+
+     await pm.stop()
tests/test_alloc.py CHANGED
@@ -20,6 +20,6 @@ class TestAlloc(IsolatedAsyncioTestCase):
          cmd = "echo hello"
          allocator = ProcessAllocator(cmd)
          spec = AllocSpec(AllocConstraints(), replica=2)
-         alloc = await allocator.allocate(spec)
+         alloc = allocator.allocate(spec)
 
          print(alloc)