PyPI - torchmonarch-nightly - Versions diffs - 2025.7.1__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.25__cp313-cp313-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.7.1__cp313-cp313-manylinux2014_x86_64.whl → 2025.7.25__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

monarch/__init__.py +13 -9
monarch/_rust_bindings.so +0 -0
monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
monarch/_src/actor/actor_mesh.py +874 -0
monarch/{allocator.py → _src/actor/allocator.py} +26 -17
monarch/_src/actor/bootstrap_main.py +73 -0
monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
monarch/_src/actor/code_sync/auto_reload.py +223 -0
monarch/_src/actor/debugger.py +565 -0
monarch/_src/actor/endpoint.py +270 -0
monarch/_src/actor/event_loop.py +97 -0
monarch/_src/actor/future.py +100 -0
monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
monarch/_src/actor/proc_mesh.py +500 -0
monarch/_src/actor/sync_state.py +18 -0
monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
monarch/_src/actor/tensor_engine_shim.py +56 -0
monarch/_src/tensor_engine/rdma.py +180 -0
monarch/_testing.py +3 -2
monarch/actor/__init__.py +51 -0
monarch/actor_mesh.py +6 -765
monarch/bootstrap_main.py +8 -47
monarch/common/client.py +1 -1
monarch/common/controller_api.py +2 -1
monarch/common/device_mesh.py +12 -2
monarch/common/messages.py +12 -1
monarch/common/recording.py +4 -3
monarch/common/remote.py +135 -52
monarch/common/tensor.py +2 -1
monarch/controller/backend.py +2 -2
monarch/controller/controller.py +2 -1
monarch/controller/rust_backend/controller.py +2 -1
monarch/fetch.py +3 -5
monarch/mesh_controller.py +201 -139
monarch/monarch_controller +0 -0
monarch/opaque_module.py +4 -6
monarch/opaque_object.py +3 -3
monarch/proc_mesh.py +6 -309
monarch/python_local_mesh.py +1 -1
monarch/rust_backend_mesh.py +2 -1
monarch/rust_local_mesh.py +4 -2
monarch/sim_mesh.py +10 -19
monarch/simulator/command_history.py +1 -1
monarch/simulator/interface.py +2 -1
monarch/simulator/mock_controller.py +1 -1
monarch/simulator/simulator.py +1 -1
monarch/tensor_engine/__init__.py +23 -0
monarch/tensor_worker_main.py +3 -1
monarch/tools/cli.py +3 -1
monarch/tools/commands.py +95 -35
monarch/tools/mesh_spec.py +55 -0
monarch/tools/utils.py +38 -0
monarch/worker/worker.py +1 -1
monarch/world_mesh.py +2 -1
monarch_supervisor/python_executable.py +6 -3
tests/error_test_binary.py +48 -10
tests/test_actor_error.py +370 -21
tests/test_alloc.py +1 -1
tests/test_allocator.py +373 -17
tests/test_controller.py +2 -0
tests/test_debugger.py +416 -0
tests/test_env_before_cuda.py +162 -0
tests/test_python_actors.py +184 -333
tests/test_rdma.py +198 -0
tests/test_remote_functions.py +40 -12
tests/test_rust_backend.py +7 -5
tests/test_sim_backend.py +1 -4
tests/test_tensor_engine.py +55 -1
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
monarch/_monarch/hyperactor/__init__.py +0 -58
monarch/_monarch/worker/debugger.py +0 -117
monarch/_monarch/worker/logging.py +0 -107
monarch/debugger.py +0 -379
monarch/future.py +0 -76
monarch/rdma.py +0 -162
torchmonarch_nightly-2025.7.1.dist-info/entry_points.txt +0 -3
monarch/{_monarch/worker → _src}/__init__.py +0 -0
monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
monarch/{common → _src/actor}/shape.py +0 -0
monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.7.1.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0

tests/test_python_actors.py (changed)

@@ -4,34 +4,39 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-unsafe
 import asyncio
+import logging
 import operator
-import re
+import os
+import sys
+import tempfile
 import threading
 import time
+import unittest
+from logging import INFO
 from types import ModuleType
-from unittest.mock import AsyncMock, patch
-import monarch
+from typing import cast
 import pytest
 import torch
-from monarch.actor_mesh import (
+from monarch._src.actor.actor_mesh import ActorMeshRef, Port, PortTuple
+from monarch.actor import (
     Accumulator,
     Actor,
     current_actor_name,
     current_rank,
     current_size,
     endpoint,
-    MonarchContext,
+    Future,
+    local_proc_mesh,
+    proc_mesh,
 )
-from monarch.debugger import init_debugging
-from monarch.future import ActorFuture
+from typing_extensions import assert_type
-from monarch.proc_mesh import local_proc_mesh, proc_mesh
-from monarch.rdma import RDMABuffer
 needs_cuda = pytest.mark.skipif(
     not torch.cuda.is_available(),
@@ -51,6 +56,10 @@ class Counter(Actor):
     async def value(self) -> int:
         return self.v
+    @endpoint
+    def value_sync_endpoint(self) -> int:
+        return self.v
 class Indirect(Actor):
     @endpoint
@@ -58,36 +67,23 @@ class Indirect(Actor):
         return await c.value.choose()
-class ParameterServer(Actor):
-    def __init__(self):
-        self.params = torch.rand(10, 10)
-        self.grad_buffer = torch.rand(10, 10)
-    @endpoint
-    async def grad_handle(self) -> RDMABuffer:
-        byte_tensor = self.grad_buffer.view(torch.uint8).flatten()
-        return RDMABuffer(byte_tensor)
-    @endpoint
-    async def update(self):
-        self.params += 0.01 * self.grad_buffer
-    @endpoint
-    async def get_grad_buffer(self) -> torch.Tensor:
-        # just used for testing
-        return self.grad_buffer
 async def test_choose():
     proc = await local_proc_mesh(gpus=2)
     v = await proc.spawn("counter", Counter, 3)
     i = await proc.spawn("indirect", Indirect)
     v.incr.broadcast()
     result = await v.value.choose()
+    # Test that Pyre derives the correct type for result (int, not Any)
+    assert_type(result, int)
     result2 = await i.call_value.choose(v)
     assert result == result2
+    result3 = await v.value_sync_endpoint.choose()
+    assert_type(result, int)
+    assert result2 == result3
 async def test_stream():
     proc = await local_proc_mesh(gpus=2)
@@ -97,78 +93,6 @@ async def test_stream():
     assert 8 == sum([x async for x in v.value.stream()])
-class ParameterClient(Actor):
-    def __init__(self, server, buffer):
-        self.server = server
-        byte_tensor = buffer.view(torch.uint8).flatten()
-        self.buffer = byte_tensor
-    @endpoint
-    async def upload(self, tensor):
-        gh = await self.server.grad_handle.call_one()
-        await gh.write(tensor)
-    @endpoint
-    async def download(self):
-        gh = await self.server.grad_handle.call_one()
-        await gh.read_into(self.buffer)
-    @endpoint
-    async def get_buffer(self):
-        return self.buffer
-@needs_cuda
-async def test_proc_mesh_rdma():
-    proc = await proc_mesh(gpus=1)
-    server = await proc.spawn("server", ParameterServer)
-    # --- CPU TESTS ---
-    client_cpu = await proc.spawn(
-        "client_cpu", ParameterClient, server, torch.ones(10, 10)
-    )
-    x = await client_cpu.get_buffer.call_one()
-    assert torch.sum(x.view(torch.float32).view(10, 10)) == 100
-    zeros = torch.zeros(10, 10)
-    await client_cpu.upload.call_one(zeros.view(torch.uint8).flatten())
-    await client_cpu.download.call_one()
-    x = await client_cpu.get_buffer.call_one()
-    assert torch.sum(x.view(torch.float32).view(10, 10)) == 0
-    # --- Modify server's backing buffer directly ---
-    await server.update.call_one()
-    # Should reflect updated values
-    await client_cpu.download.call_one()
-    buffer = await client_cpu.get_buffer.call_one()
-    remote_grad = await server.get_grad_buffer.call_one()
-    assert torch.allclose(buffer.view(torch.float32).view(10, 10), remote_grad)
-    # --- GPU TESTS ---
-    client_gpu = await proc.spawn(
-        "client_gpu", ParameterClient, server, torch.ones(10, 10, device="cuda")
-    )
-    x = await client_gpu.get_buffer.call_one()
-    buffer = x.view(torch.float32).view(10, 10)
-    assert torch.sum(buffer) == 100
-    zeros = torch.zeros(10, 10, device="cuda")
-    await client_gpu.upload.call_one(zeros.view(torch.uint8).flatten())
-    await client_gpu.download.call_one()
-    x = await client_gpu.get_buffer.call_one()
-    buffer_gpu = x.view(torch.float32).view(10, 10)
-    assert torch.sum(buffer_gpu) == 0
-    assert buffer_gpu.device.type == "cuda"
-    # Modify server state again
-    await server.update.call_one()
-    await client_gpu.download.call_one()
-    x = await client_gpu.get_buffer.call_one()
-    buffer_gpu = x.view(torch.float32).view(10, 10)
-    remote_grad = await server.get_grad_buffer.call_one()
-    assert torch.allclose(buffer_gpu.cpu(), remote_grad)
 class To(Actor):
     @endpoint
     async def whoami(self):
@@ -240,69 +164,6 @@ async def test_rank_size():
     assert 4 == await acc.accumulate(lambda: current_size()["gpus"])
-class TrainerActor(Actor):
-    def __init__(self):
-        super().__init__()
-        self.trainer = torch.nn.Linear(10, 10).to("cuda")
-        self.trainer.weight.data.zero_()
-    @endpoint
-    async def init(self, gen):
-        ranks = current_rank()
-        self.gen = gen.slice(**ranks)
-    @endpoint
-    async def exchange_metadata(self):
-        byte_tensor = self.trainer.weight.data.view(torch.uint8).flatten()
-        self.handle = RDMABuffer(byte_tensor)
-        await self.gen.attach_weight_buffer.call(self.handle)
-    @endpoint
-    async def weights_ready(self):
-        self.trainer.weight.data.add_(1.0)
-class GeneratorActor(Actor):
-    def __init__(self):
-        super().__init__()
-        self.generator = torch.nn.Linear(10, 10).to("cuda")
-        self.step = 0
-    @endpoint
-    async def init(self, trainer):
-        ranks = current_rank()
-        self.trainer = trainer.slice(**ranks)
-    @endpoint
-    async def attach_weight_buffer(self, handle):
-        self.handle = handle
-    @endpoint
-    async def update_weights(self):
-        self.step += 1
-        byte_tensor = self.generator.weight.data.view(torch.uint8).flatten()
-        await self.handle.read_into(byte_tensor)
-        assert (
-            torch.sum(self.generator.weight.data) == self.step * 100
-        ), f"{torch.sum(self.generator.weight.data)=}, {self.step=}"
-@needs_cuda
-async def test_gpu_trainer_generator():
-    trainer_proc = await proc_mesh(gpus=1)
-    gen_proc = await proc_mesh(gpus=1)
-    trainer = await trainer_proc.spawn("trainer", TrainerActor)
-    generator = await gen_proc.spawn("gen", GeneratorActor)
-    await generator.init.call(trainer)
-    await trainer.init.call(generator)
-    await trainer.exchange_metadata.call()
-    for _ in range(3):
-        await trainer.weights_ready.call()
-        await generator.update_weights.call()
 class SyncActor(Actor):
     @endpoint
     def sync_endpoint(self, a_counter: Counter):
@@ -317,22 +178,6 @@ async def test_sync_actor():
     assert r == 5
-@needs_cuda
-def test_gpu_trainer_generator_sync() -> None:
-    trainer_proc = proc_mesh(gpus=1).get()
-    gen_proc = proc_mesh(gpus=1).get()
-    trainer = trainer_proc.spawn("trainer", TrainerActor).get()
-    generator = gen_proc.spawn("gen", GeneratorActor).get()
-    generator.init.call(trainer).get()
-    trainer.init.call(generator).get()
-    trainer.exchange_metadata.call().get()
-    for _ in range(3):
-        trainer.weights_ready.call().get()
-        generator.update_weights.call().get()
 def test_sync_actor_sync_client():
     proc = local_proc_mesh(gpus=2).get()
     a = proc.spawn("actor", SyncActor).get()
@@ -408,147 +253,6 @@ def test_proc_mesh_liveness() -> None:
     counter.value.call().get()
-def _debugee_actor_internal(rank):
-    if rank == 0:
-        breakpoint()  # noqa
-        rank += 1
-        return rank
-    elif rank == 1:
-        breakpoint()  # noqa
-        rank += 2
-        return rank
-    elif rank == 2:
-        breakpoint()  # noqa
-        rank += 3
-        raise ValueError("bad rank")
-    elif rank == 3:
-        breakpoint()  # noqa
-        rank += 4
-        return rank
-class DebugeeActor(Actor):
-    @endpoint
-    async def to_debug(self):
-        rank = MonarchContext.get().point.rank
-        return _debugee_actor_internal(rank)
-@pytest.mark.oss_skip  # pyre-ignore[56] TODO T229449782
-async def test_debug() -> None:
-    input_mock = AsyncMock()
-    input_mock.side_effect = [
-        "attach 1",
-        "n",
-        "n",
-        "n",
-        "n",
-        "detach",
-        "attach 1",
-        "detach",
-        "quit",
-        "cast 0,3 n",
-        "cast 0,3 n",
-        # Attaching to 0 and 3 ensures that when we call "list"
-        # the next time, their function/lineno info will be
-        # up-to-date.
-        "attach 0",
-        "detach",
-        "attach 3",
-        "detach",
-        "quit",
-        "attach 2",
-        "c",
-        "quit",
-        "continue",
-    ]
-    outputs = []
-    def _patch_output(msg):
-        nonlocal outputs
-        outputs.append(msg)
-    with patch("monarch.debugger._debugger_input", side_effect=input_mock), patch(
-        "monarch.debugger._debugger_output", new=_patch_output
-    ):
-        proc = await proc_mesh(hosts=2, gpus=2)
-        debugee = await proc.spawn("debugee", DebugeeActor)
-        debug_client = await init_debugging(debugee)
-        fut = debugee.to_debug.call()
-        await debug_client.wait_pending_session.call_one()
-        breakpoints = []
-        for i in range(10):
-            breakpoints = await debug_client.list.call_one()
-            if len(breakpoints) == 4:
-                break
-            await asyncio.sleep(1)
-            if i == 9:
-                raise RuntimeError("timed out waiting for breakpoints")
-        initial_linenos = {}
-        for i in range(len(breakpoints)):
-            rank, coords, _, _, function, lineno = breakpoints[i]
-            initial_linenos[rank] = lineno
-            assert rank == i
-            assert coords == {"hosts": rank % 2, "gpus": rank // 2}
-            assert function == "test_python_actors._debugee_actor_internal"
-            assert lineno == breakpoints[0][5] + 4 * rank
-        await debug_client.enter.call_one()
-        # Check that when detaching and re-attaching to a session, the last portion of the output is repeated
-        expected_last_output = [
-            r"--Return--",
-            r"\n",
-            r"> (/.*/)+test_python_actors.py\(\d+\)to_debug\(\)->3\n-> return _debugee_actor_internal\(rank\)",
-            r"\n",
-            r"\(Pdb\) ",
-        ]
-        output_len = len(expected_last_output)
-        assert outputs[-2 * output_len : -output_len] == outputs[-output_len:]
-        for real_output, expected_output in zip(
-            outputs[-output_len:], expected_last_output
-        ):
-            assert re.match(expected_output, real_output) is not None
-        breakpoints = await debug_client.list.call_one()
-        for i in range(len(breakpoints)):
-            if i == 1:
-                assert breakpoints[i][4] == "test_python_actors.to_debug"
-            else:
-                assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
-                assert breakpoints[i][5] == initial_linenos[i]
-        await debug_client.enter.call_one()
-        breakpoints = await debug_client.list.call_one()
-        for i in range(len(breakpoints)):
-            if i == 1:
-                assert breakpoints[i][4] == "test_python_actors.to_debug"
-            elif i in (0, 3):
-                assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
-                assert breakpoints[i][5] == initial_linenos[i] + 2
-            else:
-                assert breakpoints[i][4] == "test_python_actors._debugee_actor_internal"
-                assert breakpoints[i][5] == initial_linenos[i]
-        await debug_client.enter.call_one()
-        breakpoints = await debug_client.list.call_one()
-        assert len(breakpoints) == 3
-        for i, rank in enumerate((0, 1, 3)):
-            assert breakpoints[i][0] == rank
-        await debug_client.enter.call_one()
-        breakpoints = await debug_client.list.call_one()
-        assert len(breakpoints) == 0
-        with pytest.raises(monarch.actor_mesh.ActorError, match="ValueError: bad rank"):
-            await fut
 class TLSActor(Actor):
     """An actor that manages thread-local state."""
@@ -644,7 +348,7 @@ async def awaitit(f):
     return await f
-def test_actor_future():
+def test_actor_future() -> None:
     v = 0
     async def incr():
@@ -654,32 +358,31 @@ def test_actor_future():
     # can use async implementation from sync
     # if no non-blocking is provided
-    f = ActorFuture(incr)
+    f = Future(impl=incr, requires_loop=False)
     assert f.get() == 1
     assert v == 1
     assert f.get() == 1
     assert asyncio.run(awaitit(f)) == 1
-    f = ActorFuture(incr)
+    f = Future(impl=incr, requires_loop=False)
     assert asyncio.run(awaitit(f)) == 2
     assert f.get() == 2
-    def incr2():
+    async def incr2():
         nonlocal v
         v += 2
         return v
     # Use non-blocking optimization if provided
-    f = ActorFuture(incr, incr2)
+    f = Future(impl=incr2)
     assert f.get() == 4
-    assert asyncio.run(awaitit(f)) == 4
     async def nope():
         nonlocal v
         v += 1
         raise ValueError("nope")
-    f = ActorFuture(nope)
+    f = Future(impl=nope, requires_loop=False)
     with pytest.raises(ValueError):
         f.get()
@@ -696,12 +399,12 @@ def test_actor_future():
     assert v == 5
-    def nope():
+    async def nope2():
         nonlocal v
         v += 1
         raise ValueError("nope")
-    f = ActorFuture(incr, nope)
+    f = Future(impl=nope2)
     with pytest.raises(ValueError):
         f.get()
@@ -723,7 +426,7 @@ def test_actor_future():
     async def seven():
         return 7
-    f = ActorFuture(seven)
+    f = Future(impl=seven, requires_loop=False)
     assert 7 == f.get(timeout=0.001)
@@ -731,7 +434,155 @@ def test_actor_future():
         f = asyncio.Future()
         await f
-    f = ActorFuture(neverfinish)
+    f = Future(impl=neverfinish, requires_loop=True)
     with pytest.raises(asyncio.exceptions.TimeoutError):
         f.get(timeout=0.1)
+class Printer(Actor):
+    def __init__(self):
+        self.logger = logging.getLogger()
+        self.logger.setLevel(INFO)
+    @endpoint
+    async def print(self, content: str):
+        print(f"{os.getpid()} {content}")
+    @endpoint
+    async def log(self, content: str):
+        self.logger.info(f"{os.getpid()} {content}")
+async def test_actor_log_streaming() -> None:
+    # Save original file descriptors
+    original_stdout_fd = os.dup(1)  # stdout
+    original_stderr_fd = os.dup(2)  # stderr
+    try:
+        # Create temporary files to capture output
+        with tempfile.NamedTemporaryFile(
+            mode="w+", delete=False
+        ) as stdout_file, tempfile.NamedTemporaryFile(
+            mode="w+", delete=False
+        ) as stderr_file:
+            stdout_path = stdout_file.name
+            stderr_path = stderr_file.name
+            # Redirect file descriptors to our temp files
+            # This will capture both Python and Rust output
+            os.dup2(stdout_file.fileno(), 1)
+            os.dup2(stderr_file.fileno(), 2)
+            # Also redirect Python's sys.stdout/stderr for completeness
+            original_sys_stdout = sys.stdout
+            original_sys_stderr = sys.stderr
+            sys.stdout = stdout_file
+            sys.stderr = stderr_file
+            try:
+                pm = await proc_mesh(gpus=2)
+                am = await pm.spawn("printer", Printer)
+                await am.print.call("hello 1")
+                await am.log.call("hello 2")
+                await pm.logging_option(stream_to_client=True)
+                await am.print.call("hello 3")
+                await am.log.call("hello 4")
+                # Give it sometime to send log back
+                time.sleep(5)
+                # Flush all outputs
+                stdout_file.flush()
+                stderr_file.flush()
+                os.fsync(stdout_file.fileno())
+                os.fsync(stderr_file.fileno())
+            finally:
+                # Restore Python's sys.stdout/stderr
+                sys.stdout = original_sys_stdout
+                sys.stderr = original_sys_stderr
+        # Restore original file descriptors
+        os.dup2(original_stdout_fd, 1)
+        os.dup2(original_stderr_fd, 2)
+        # Read the captured output
+        with open(stdout_path, "r") as f:
+            stdout_content = f.read()
+        # Clean up temp files
+        os.unlink(stdout_path)
+        os.unlink(stderr_path)
+        # TODO: (@jamessun) we need to disable logging forwarder for python logger
+        # assert "hello 1" not in stdout_content
+        assert "hello 2" not in stdout_content
+        assert "hello 3" in stdout_content
+        # assert "hello 4" in stdout_content
+    finally:
+        # Ensure file descriptors are restored even if something goes wrong
+        try:
+            os.dup2(original_stdout_fd, 1)
+            os.dup2(original_stderr_fd, 2)
+            os.close(original_stdout_fd)
+            os.close(original_stderr_fd)
+        except OSError:
+            pass
+class SendAlot(Actor):
+    @endpoint
+    async def send(self, port: Port[int]):
+        for i in range(100):
+            port.send(i)
+def test_port_as_argument():
+    proc_mesh = local_proc_mesh(gpus=1).get()
+    s = proc_mesh.spawn("send_alot", SendAlot).get()
+    send, recv = PortTuple.create(proc_mesh._mailbox)
+    s.send.broadcast(send)
+    for i in range(100):
+        assert i == recv.recv().get()
+@pytest.mark.timeout(15)
+async def test_same_actor_twice() -> None:
+    pm = await proc_mesh(gpus=1)
+    await pm.spawn("dup", Counter, 0)
+    # The second spawn with the same name should fail with a specific error
+    with pytest.raises(Exception) as exc_info:
+        await pm.spawn("dup", Counter, 0)
+    # Assert that the error message contains the expected text about duplicate actor name
+    error_msg = str(exc_info.value)
+    assert (
+        "gspawn failed: an actor with name 'dup' has already been spawned" in error_msg
+    ), f"Expected error message about duplicate actor name, got: {error_msg}"
+class TestActorMeshStop(unittest.IsolatedAsyncioTestCase):
+    async def test_actor_mesh_stop(self) -> None:
+        pm = await proc_mesh(gpus=2)
+        am_1 = await pm.spawn("printer", Printer)
+        am_2 = await pm.spawn("printer2", Printer)
+        await am_1.print.call("hello 1")
+        await am_1.log.call("hello 2")
+        await cast(ActorMeshRef, am_1).stop()
+        with self.assertRaisesRegex(
+            RuntimeError, expected_regex="`ActorMesh` has been stopped"
+        ):
+            await am_1.print.call("hello 1")
+        await am_2.print.call("hello 3")
+        await am_2.log.call("hello 4")