torchmonarch-nightly 2025.7.29__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.31__cp310-cp310-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +9 -5
- monarch/_src/actor/allocator.py +5 -6
- monarch/_src/actor/debugger.py +159 -98
- monarch/_src/actor/endpoint.py +15 -4
- monarch/_src/actor/future.py +79 -32
- monarch/_src/actor/pdb_wrapper.py +10 -4
- monarch/_src/actor/proc_mesh.py +82 -114
- monarch/_src/actor/shape.py +32 -38
- monarch/_src/tensor_engine/rdma.py +12 -6
- monarch/mesh_controller.py +37 -4
- monarch/monarch_controller +0 -0
- tests/test_actor_error.py +3 -4
- tests/test_actor_shape.py +114 -0
- tests/test_allocator.py +34 -9
- tests/test_debugger.py +406 -178
- tests/test_python_actors.py +67 -67
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/RECORD +23 -22
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.29.dist-info → torchmonarch_nightly-2025.7.31.dist-info}/top_level.txt +0 -0
monarch/_src/actor/pdb_wrapper.py
CHANGED
@@ -47,9 +47,12 @@ class PdbWrapper(pdb.Pdb):
         super().__init__(stdout=WriteWrapper(self), stdin=ReadWrapper.create(self))
         self._first = True

-    def set_trace(self, frame):
+    def set_trace(self, frame=None):
         self.client_ref.debugger_session_start.broadcast(
-            self.rank,
+            self.rank,
+            self.coords,
+            socket.getfqdn(socket.gethostname()),
+            self.actor_id.actor_name,
         )
         if self.header:
             self.message(self.header)
@@ -67,7 +70,9 @@ class PdbWrapper(pdb.Pdb):
         super().do_clear(arg)

     def end_debug_session(self):
-        self.client_ref.debugger_session_end.broadcast(self.rank)
+        self.client_ref.debugger_session_end.broadcast(
+            self.actor_id.actor_name, self.rank
+        )
         # Once the debug client actor is notified of the session being over,
         # we need to prevent any additional requests being sent for the session
         # by redirecting stdin and stdout.
@@ -88,7 +93,7 @@ class ReadWrapper(io.RawIOBase):
     def readinto(self, b):
         with fake_sync_state():
             response = self.session.client_ref.debugger_read.call_one(
-                self.session.rank, len(b)
+                self.session.actor_id.actor_name, self.session.rank, len(b)
             ).get()
             if response == "detach":
                 # this gets injected by the worker event loop to
@@ -124,6 +129,7 @@ class WriteWrapper:
             # pyre-ignore
             lineno = self.session.curframe.f_lineno
             self.session.client_ref.debugger_write.broadcast(
+                self.session.actor_id.actor_name,
                 self.session.rank,
                 DebuggerWrite(
                     s.encode(),
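
Note on the hunks above: debugger sessions are now identified by actor name (plus mesh coordinates and hostname) rather than by rank alone. A minimal sketch of that keying change, using hypothetical names (DebugSessions is not part of monarch's API):

    # Hypothetical sketch (not monarch API): sessions keyed by (actor_name, rank)
    # so two actor meshes with overlapping ranks do not collide in the client.
    class DebugSessions:
        def __init__(self):
            self._sessions = {}

        def start(self, actor_name, rank, coords, hostname):
            self._sessions[(actor_name, rank)] = {"coords": coords, "host": hostname}

        def end(self, actor_name, rank):
            self._sessions.pop((actor_name, rank), None)

    sessions = DebugSessions()
    sessions.start("trainer", 0, {"hosts": 0, "gpus": 0}, "node0.example.com")
    sessions.end("trainer", 0)
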
monarch/_src/actor/proc_mesh.py
CHANGED
@@ -37,15 +37,14 @@ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
     ProcMeshMonitor,
 )
 from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
-from monarch._src.actor.actor_mesh import (
-    _Actor,
-    _ActorMeshRefImpl,
-    Actor,
-    ActorMeshRef,
-    fake_sync_state,
-)
+from monarch._src.actor.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef

-from monarch._src.actor.allocator import LocalAllocator, ProcessAllocator, SimAllocator
+from monarch._src.actor.allocator import (
+    AllocateMixin,
+    LocalAllocator,
+    ProcessAllocator,
+    SimAllocator,
+)
 from monarch._src.actor.code_sync import (
     CodeSyncMeshClient,
     RemoteWorkspace,
@@ -111,29 +110,12 @@ except ImportError:
     IN_PAR = False


-async def _allocate_nonblocking(
-    alloc: Alloc, setup: Callable[[], None] | None = None
-) -> "ProcMesh":
-    _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
-    if setup is None:
-        return ProcMesh(_proc_mesh)
-    # If the user has passed the setup lambda, we need to call
-    # it here before any of the other actors are spawned so that
-    # the environment variables are set up before cuda init.
-    proc_mesh = ProcMesh(_proc_mesh)
-    setup_actor = await proc_mesh.spawn("setup", SetupActor, setup)
-    await setup_actor.setup.call()
-    del setup_actor
-    return proc_mesh
-
-
 class ProcMesh(MeshTrait):
     def __init__(
         self,
         hy_proc_mesh: HyProcMesh,
         _mock_shape: Optional[Shape] = None,
         _device_mesh: Optional["DeviceMesh"] = None,
-        _is_initializing_debugger: bool = False,
     ) -> None:
         self._proc_mesh = hy_proc_mesh
         self._mock_shape: Optional[Shape] = _mock_shape
@@ -146,20 +128,32 @@ class ProcMesh(MeshTrait):
         self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
         self._stopped = False

-        [14 removed lines not rendered in the source diff view]
+    async def _init_manager_actors(
+        self,
+        setup: Callable[[], None] | None = None,
+    ) -> "ProcMesh":
+        _rdma_manager = (
+            # pyre-ignore
+            await _RdmaManager.create_rdma_manager_nonblocking(self._proc_mesh)
+            if HAS_TENSOR_ENGINE
+            else None
+        )
+
+        _debug_manager = await self._spawn_nonblocking(
+            _DEBUG_MANAGER_ACTOR_NAME, DebugManager, await _debug_client()
+        )
+
+        self._debug_manager = _debug_manager
+        self._rdma_manager = _rdma_manager
+
+        if setup is not None:
+            # If the user has passed the setup lambda, we need to call
+            # it here before any of the other actors are spawned so that
+            # the environment variables are set up before cuda init.
+            setup_actor = await self._spawn_nonblocking("setup", SetupActor, setup)
+            # pyre-ignore
+            await setup_actor.setup.call()._status.coro
+        return self

     @property
     def _shape(self) -> Shape:
@@ -184,10 +178,7 @@ class ProcMesh(MeshTrait):
     def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
         if self._mock_shape is not None:
             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
-        return Future(
-            impl=lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
-            requires_loop=False,
-        )
+        return Future(coro=self._spawn_nonblocking(name, Class, *args, **kwargs))

     async def monitor(self) -> ProcMeshMonitor:
         """
@@ -230,8 +221,7 @@ class ProcMesh(MeshTrait):
         ```
         """
         return Future(
-            impl=lambda: _allocate_nonblocking(alloc, setup),
-            requires_loop=False,
+            coro=_proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors=True)
         )

     def __repr__(self) -> str:
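
Note on the Future changes above and below: call sites that previously passed a factory (impl=lambda: ...) together with requires_loop=False now hand Future an already-created coroutine object via coro=. A toy model of the new calling convention (asyncio.run stands in for monarch's tokio-backed runtime; this is not monarch's actual Future class):

    import asyncio

    class Future:
        """Toy stand-in: wraps a coroutine object created eagerly by the caller."""

        def __init__(self, *, coro):
            self._coro = coro

        def get(self):
            # monarch resolves this on its own runtime; asyncio.run is a stand-in.
            return asyncio.run(self._coro)

    async def _spawn_nonblocking(name: str) -> str:
        return f"spawned {name}"

    print(Future(coro=_spawn_nonblocking("worker")).get())  # spawned worker
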
@@ -345,10 +335,7 @@ class ProcMesh(MeshTrait):
             await self._proc_mesh.stop_nonblocking()
             self._stopped = True

-        return Future(
-            impl=lambda: _stop_nonblocking(),
-            requires_loop=False,
-        )
+        return Future(coro=_stop_nonblocking())

     async def __aexit__(
         self, exc_type: object, exc_val: object, exc_tb: object
@@ -370,46 +357,15 @@ class ProcMesh(MeshTrait):
         # Cannot call stop here because it is async.


-async def local_proc_mesh_nonblocking(
-    *,
-    gpus: Optional[int] = None,
-    hosts: int = 1,
-    _is_initializing_debugger: bool = False,
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    allocator = LocalAllocator()
-    alloc = await allocator.allocate(spec)
-    proc_mesh = HyProcMesh.allocate_nonblocking(alloc)
-    return ProcMesh(
-        await proc_mesh,
-        _is_initializing_debugger=_is_initializing_debugger,
-    )
-
-
 def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
     return Future(
-        impl=lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
-        requires_loop=False,
+        coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=LocalAllocator())
     )


-async def sim_proc_mesh_nonblocking(
-    *, gpus: Optional[int] = None, hosts: int = 1
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    allocator = SimAllocator()
-    alloc = await allocator.allocate(spec)
-    return await ProcMesh.from_alloc(alloc)
-
-
 def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
     return Future(
-        impl=lambda: sim_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
-        requires_loop=False,
+        coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=SimAllocator())
     )


@@ -431,33 +387,35 @@ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
     return cmd, args, env


-async def
+async def _proc_mesh_from_alloc_coro(
+    alloc: Alloc,
+    setup: Callable[[], None] | None,
+    init_manager_actors: bool,
+) -> ProcMesh:
+    _hy_proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
+    proc_mesh = ProcMesh(_hy_proc_mesh)
+    if init_manager_actors:
+        await proc_mesh._init_manager_actors(setup)
+    return proc_mesh
+
+
+async def _proc_mesh_coro(
     *,
+    allocator: AllocateMixin,
     gpus: Optional[int] = None,
     hosts: int = 1,
-    env: dict[str, str] | None = None,
     setup: Callable[[], None] | None = None,
+    init_manager_actors: bool = True,
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
     # gpus must come last in this order because
     # test_remote_function_all_gather expects that hosts comes before gpus
     # in the order of the dimensions.
-    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
-
-    # Todo: Deprecate the env field from the ProcessAllocator
-    # The PAR_MAIN_OVERRIDE needs to be passed as an env
-    # to the proc mesh construction in rust, so can not be moved to the
-    # SetupActor yet
-    cmd, args, bootstrap_env = _get_bootstrap_args()
-    env.update(bootstrap_env)
-    allocator = ProcessAllocator(cmd, args, env)
-    alloc = await allocator.allocate(spec)
+    spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+    alloc = await allocator.allocate_nonblocking(spec)

-    return await _allocate_nonblocking(
-        alloc,
-        setup=setup,
-    )
+    return await _proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors)


 def proc_mesh(
@@ -467,12 +425,22 @@ def proc_mesh(
     env: dict[str, str] | None = None,
     setup: Callable[[], None] | None = None,
 ) -> Future[ProcMesh]:
-    [5 removed lines not rendered in the source diff view]
+    env = env or {}
+
+    # Todo: Deprecate the env field from the ProcessAllocator
+    # The PAR_MAIN_OVERRIDE needs to be passed as an env
+    # to the proc mesh construction in rust, so can not be moved to the
+    # SetupActor yet
+    cmd, args, bootstrap_env = _get_bootstrap_args()
+    env.update(bootstrap_env)
+    task = _proc_mesh_coro(
+        gpus=gpus,
+        hosts=hosts,
+        setup=setup,
+        allocator=ProcessAllocator(cmd, args, env),
+        init_manager_actors=True,
     )
+    return Future(coro=task)


 _debug_proc_mesh: Optional["ProcMesh"] = None
@@ -482,15 +450,12 @@ _debug_proc_mesh: Optional["ProcMesh"] = None
 # doesn't trigger the debug client to spawn, which could cause confusing
 # logs. This is defined in proc_mesh.py instead of debugger.py for
 # circular import reasons.
-def _get_debug_proc_mesh() -> "ProcMesh":
+async def _get_debug_proc_mesh() -> "ProcMesh":
     global _debug_proc_mesh
     if _debug_proc_mesh is None:
-        _debug_proc_mesh = Future(
-            impl=lambda: local_proc_mesh_nonblocking(
-                gpus=1, hosts=1, _is_initializing_debugger=True
-            ),
-            requires_loop=False,
-        ).get()
+        _debug_proc_mesh = await _proc_mesh_coro(
+            gpus=1, hosts=1, allocator=LocalAllocator(), init_manager_actors=False
+        )
     return _debug_proc_mesh


@@ -499,10 +464,13 @@ _debug_client_mesh: Optional[DebugClient] = None

 # Lazy init for the same reason as above. This is defined in proc_mesh.py
 # instead of debugger.py for circular import reasons.
-def debug_client() -> DebugClient:
+async def _debug_client() -> DebugClient:
     global _debug_client_mesh
     if _debug_client_mesh is None:
-        [2 removed lines not rendered in the source diff view]
-        )
+        mesh = await _get_debug_proc_mesh()
+        _debug_client_mesh = await mesh._spawn_nonblocking("debug_client", DebugClient)
     return _debug_client_mesh
+
+
+def debug_client() -> DebugClient:
+    return Future(coro=_debug_client()).get()
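
Note: debug-client initialization is now fully async (_get_debug_proc_mesh and _debug_client), with debug_client() kept as a synchronous facade built from Future(coro=...).get(). A hedged sketch of that lazy, sync-over-async singleton pattern (toy names; asyncio.run stands in for Future.get):

    import asyncio
    from typing import Optional

    _client: Optional[str] = None

    async def _get_client() -> str:
        # Lazy async init: the first caller pays the construction cost once.
        global _client
        if _client is None:
            _client = "debug-client"  # stands in for spawning the DebugClient actor
        return _client

    def get_client() -> str:
        # Synchronous facade over the async path, mirroring
        # debug_client() -> Future(coro=_debug_client()).get().
        return asyncio.run(_get_client())

    print(get_client())  # debug-client
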
monarch/_src/actor/shape.py
CHANGED
@@ -31,6 +31,32 @@ def iter_ranks(ranks: Slices) -> Generator[int, None, None]:
         yield from ranks


+class ShapeExt:
+    """Extension methods for Shape that add higher-level
+    functionality."""
+
+    @staticmethod
+    def slice(shape: Shape, **kwargs) -> Shape:
+        """Select along named dimensions. Integer values remove
+        dimensions, slice objects keep dimensions but restrict them.
+
+        Examples: ShapeExt.slice(shape, batch=3, gpu=slice(2, 6))
+        """
+        for label, selector in kwargs.items():
+            if label not in shape.labels:
+                raise TypeError(f"Shape does not have dimension labeled {label!r}")
+            if isinstance(selector, slice):
+                shape = shape.select(label, selector)
+            else:
+                if (
+                    selector < 0
+                    or selector >= shape.ndslice.sizes[shape.labels.index(label)]
+                ):
+                    raise IndexError("index out of range")
+                shape = shape.at(label, selector)
+        return shape
+
+
 class MeshTrait(ABC):
     """
     Mesh interface. Implemented via Shape.
@@ -51,45 +77,13 @@ class MeshTrait(ABC):
     def _new_with_shape(self, shape: Shape) -> Self: ...

     def slice(self, **kwargs) -> Self:
-        """
-        [1 removed docstring line not rendered in the source diff view]
-        """
-        ndslice = self._ndslice
-        labels = self._labels
-        offset = ndslice.offset
-        names = []
-        sizes = []
-        strides = []
-        for name, size, stride in zip(labels, ndslice.sizes, ndslice.strides):
-            if name in kwargs:
-                e = kwargs.pop(name)
-                if isinstance(e, slice):
-                    start, stop, slice_stride = e.indices(size)
-                    offset += start * stride
-                    names.append(name)
-                    # The number of elems in `start..stop` with step
-                    # `slice_stride`. This is:
-                    #   ⌈(stop - start) / slice_stride⌉
-                    # — the number of stride steps that fit in the
-                    # half-open interval.
-                    sizes.append((stop - start + slice_stride - 1) // slice_stride)
-                    strides.append(slice_stride * stride)
-                else:
-                    if e >= size or e < 0:
-                        raise IndexError("index out of range")
-                    offset += e * stride
-            else:
-                names.append(name)
-                sizes.append(size)
-                strides.append(stride)
-
-        if kwargs:
-            raise TypeError(
-                f"{self} does not have dimension(s) named {tuple(kwargs.keys())}"
-            )
+        """Select along named dimensions. Integer values remove
+        dimensions, slice objects keep dimensions but restrict them.

-        [2 removed lines not rendered in the source diff view]
+        Examples: mesh.slice(batch=3, gpu=slice(2, 6))
+        """
+        shape = Shape(list(self._labels), self._ndslice)
+        return self._new_with_shape(ShapeExt.slice(shape, **kwargs))

     def split(self, **kwargs) -> Self:
         """
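
Note: the slicing arithmetic moves out of MeshTrait.slice into ShapeExt.slice, which delegates to Shape.select and Shape.at while keeping the documented semantics: an integer selector removes the named dimension, a slice selector keeps it but restricts it. A pure-Python toy of those semantics (monarch's real Shape is Rust-backed; slice_shape below is illustrative only):

    # Toy model of the documented semantics; not the Rust-backed Shape class.
    def slice_shape(labels, sizes, **kwargs):
        unknown = set(kwargs) - set(labels)
        if unknown:
            raise TypeError(f"no dimension(s) named {tuple(unknown)}")
        out_labels, out_sizes = [], []
        for label, size in zip(labels, sizes):
            sel = kwargs.get(label)
            if sel is None:                   # untouched dimension
                out_labels.append(label)
                out_sizes.append(size)
            elif isinstance(sel, slice):      # slice keeps the dimension
                start, stop, step = sel.indices(size)
                out_labels.append(label)
                out_sizes.append(max(0, (stop - start + step - 1) // step))
            else:                             # integer removes the dimension
                if not 0 <= sel < size:
                    raise IndexError("index out of range")
        return out_labels, out_sizes

    print(slice_shape(["hosts", "gpus"], [2, 8], gpus=slice(2, 6)))  # (['hosts', 'gpus'], [2, 4])
    print(slice_shape(["hosts", "gpus"], [2, 8], hosts=1))           # (['gpus'], [8])
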
monarch/_src/tensor_engine/rdma.py
CHANGED
@@ -120,12 +120,15 @@ class RDMABuffer:
                 f"offset + size ({offset + size}) must be <= dst.numel() ({dst.numel()})"
             )

+        local_proc_id = MonarchContext.get().proc_id
+        client = MonarchContext.get().mailbox
+
         async def read_into_nonblocking() -> Optional[int]:
             res = await self._buffer.read_into(
                 addr=addr,
                 size=size,
-                local_proc_id=MonarchContext.get().proc_id,
-                client=MonarchContext.get().mailbox,
+                local_proc_id=local_proc_id,
+                client=client,
                 timeout=timeout,
             )
             # TODO - remove this once GPU support is added.
@@ -133,7 +136,7 @@ class RDMABuffer:
                 dst_gpu.copy_(dst)
             return res

-        return Future(
+        return Future(coro=read_into_nonblocking())

     def write_from(
         self, src: torch.Tensor, offset: int = 0, timeout: int = 3
@@ -164,12 +167,15 @@ class RDMABuffer:
                 f"size + offset ({size + offset}) must be <= src.numel() ({src.numel()})"
             )

+        local_proc_id = MonarchContext.get().proc_id
+        client = MonarchContext.get().mailbox
+
         async def write_from_nonblocking() -> None:
             res = await self._buffer.write_from(
                 addr=addr,
                 size=size,
-                local_proc_id=MonarchContext.get().proc_id,
-                client=MonarchContext.get().mailbox,
+                local_proc_id=local_proc_id,
+                client=client,
                 timeout=timeout,
             )
             # TODO - remove this once GPU support is added.
@@ -177,4 +183,4 @@ class RDMABuffer:
                 src_gpu.copy_(src)
             return res

-        return Future(
+        return Future(coro=write_from_nonblocking())
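
Note: the rdma.py hunks hoist MonarchContext.get() out of the async closures so caller-local state is read eagerly, before the coroutine is handed to Future and resumed elsewhere. An analogous pattern with Python's contextvars (a hedged illustration; MonarchContext is not a ContextVar):

    import asyncio
    import contextvars

    request_id = contextvars.ContextVar("request_id")

    def make_read_future():
        rid = request_id.get()  # read caller-local state eagerly, like proc_id/mailbox

        async def read_nonblocking():
            await asyncio.sleep(0)  # the coroutine may resume on another loop/thread
            return rid              # safe: no context lookup at resume time

        return read_nonblocking()

    request_id.set("req-42")
    print(asyncio.run(make_read_future()))  # req-42
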
monarch/mesh_controller.py
CHANGED
@@ -11,6 +11,7 @@ import os
 import pdb  # noqa
 import traceback
 from collections import deque
+from functools import partial
 from logging import Logger
 from typing import (
     Any,
@@ -32,6 +33,7 @@ from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monar
 from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
 from monarch._rust_bindings.monarch_extension.tensor_worker import Ref
 from monarch._rust_bindings.monarch_hyperactor.actor import (
+    MethodSpecifier,
     PythonMessage,
     PythonMessageKind,
     UnflattenArg,
@@ -40,6 +42,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
 from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
     ActorId,
 )
+from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
 from monarch._src.actor.actor_mesh import ActorEndpoint, Port, PortTuple
 from monarch._src.actor.endpoint import Selection
 from monarch._src.actor.shape import NDSlice
@@ -48,7 +51,7 @@ from monarch.common.controller_api import TController
 from monarch.common.function import ResolvableFunction
 from monarch.common.invocation import Seq
 from monarch.common.messages import Referenceable, SendResultOfActorCall
-from monarch.common.stream import StreamRef
+from monarch.common.stream import Stream, StreamRef
 from monarch.common.tensor import dtensor_check, InputChecker, Tensor
 from monarch.common.tree import flatten
 from monarch.tensor_worker_main import _set_trace
@@ -322,9 +325,39 @@ def actor_send(

     client = cast(MeshClient, checker.mesh.client)

-    stream_ref = chosen_stream._to_ref(client)
+    rest = partial(
+        _actor_send,
+        endpoint,
+        args_kwargs_tuple,
+        refs,
+        port,
+        selection,
+        client,
+        checker.mesh,
+        tensors,
+        chosen_stream,
+    )
+    if isinstance(endpoint._name, MethodSpecifier.Init):
+        # Init runs within the tokio loop, but creating a node blocks the loop sending actor messages, so
+        # we offload to a blocking thread
+        PythonTask.spawn_blocking(rest)
+    else:
+        rest()

-    fut = (port, checker.mesh._ndslice) if port is not None else None
+
+def _actor_send(
+    endpoint: ActorEndpoint,
+    args_kwargs_tuple: bytes,
+    refs: Sequence[Any],
+    port: Optional[Port[Any]],
+    selection: Selection,
+    client: MeshClient,
+    mesh: DeviceMesh,
+    tensors: List[Tensor],
+    chosen_stream: Stream,
+):
+    stream_ref = chosen_stream._to_ref(client)
+    fut = (port, mesh._ndslice) if port is not None else None

     ident = client.new_node([], tensors, cast("OldFuture", fut))

@@ -340,7 +373,7 @@ def actor_send(
         endpoint, selection, client, ident, args_kwargs_tuple, refs
     )
     worker_msg = SendResultOfActorCall(ident, broker_id, tensors, [], stream_ref)
-    client.send(checker.mesh._ndslice, worker_msg)
+    client.send(mesh._ndslice, worker_msg)
     # we have to ask for status updates
     # from workers to be sure they have finished
     # enough work to count this future as finished,
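
Note: PythonTask.spawn_blocking(rest) moves node creation off the tokio event loop so message sending is not stalled. The asyncio analogue of this offloading pattern is shown below (monarch's PythonTask is its own tokio-backed primitive, not asyncio):

    import asyncio
    import time

    def create_node_blocking() -> str:
        time.sleep(0.1)  # stands in for node creation that would block the loop
        return "node ready"

    async def main() -> None:
        # Offload the blocking call to a worker thread so the event loop keeps
        # servicing other messages, analogous to PythonTask.spawn_blocking(rest).
        print(await asyncio.to_thread(create_node_blocking))

    asyncio.run(main())
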
monarch/monarch_controller
CHANGED
Binary file
tests/test_actor_error.py
CHANGED
@@ -598,8 +598,7 @@ async def test_supervision_with_proc_mesh_stopped(mesh):
 # TODO - re-enable after resolving T232206970
 @pytest.mark.oss_skip
 async def test_supervision_with_sending_error():
-    os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "
-    os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "1"
+    os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "50000000"

     proc = await proc_mesh(gpus=1)
     actor_mesh = await proc.spawn("healthy", HealthyActor)
@@ -611,9 +610,9 @@ async def test_supervision_with_sending_error():

     # send a large payload to trigger send timeout error
     with pytest.raises(
-        SupervisionError, match="supervision error:.*
+        SupervisionError, match="supervision error:.*actor mesh is stopped"
     ):
-        await actor_mesh.check_with_payload.call(payload="a" *
+        await actor_mesh.check_with_payload.call(payload="a" * 55000000)

     # new call should fail with check of health state of actor mesh
     with pytest.raises(SupervisionError, match="actor mesh is not in a healthy state"):
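
Note on the test change: the payload of 55,000,000 bytes deliberately exceeds the pinned 50,000,000-byte HYPERACTOR_CODEC_MAX_FRAME_LENGTH, so the send fails and surfaces as a SupervisionError. A toy check of that arithmetic (illustrative only; the real enforcement happens in hyperactor's codec):

    import os

    os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"] = "50000000"

    def frame_fits(payload: str) -> bool:
        limit = int(os.environ["HYPERACTOR_CODEC_MAX_FRAME_LENGTH"])
        return len(payload) <= limit

    assert not frame_fits("a" * 55000000)  # oversized frame -> send fails
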