torchmonarch-nightly 2025.6.4 cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +74 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +198 -0
  10. monarch/actor_mesh.py +692 -0
  11. monarch/allocator.py +62 -0
  12. monarch/bootstrap_main.py +75 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +69 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/common/_C.pyi +11 -0
  18. monarch/common/_C.so +0 -0
  19. monarch/common/__init__.py +0 -0
  20. monarch/common/_coalescing.py +308 -0
  21. monarch/common/_device_utils.py +18 -0
  22. monarch/common/_tensor_to_table.py +172 -0
  23. monarch/common/base_tensor.py +28 -0
  24. monarch/common/borrows.py +143 -0
  25. monarch/common/client.py +646 -0
  26. monarch/common/constants.py +10 -0
  27. monarch/common/context_manager.py +40 -0
  28. monarch/common/controller_api.py +104 -0
  29. monarch/common/device_mesh.py +443 -0
  30. monarch/common/fake.py +55 -0
  31. monarch/common/function.py +160 -0
  32. monarch/common/function_caching.py +164 -0
  33. monarch/common/future.py +168 -0
  34. monarch/common/invocation.py +125 -0
  35. monarch/common/mast.py +221 -0
  36. monarch/common/messages.py +572 -0
  37. monarch/common/mock_cuda.py +41 -0
  38. monarch/common/opaque_ref.py +98 -0
  39. monarch/common/pickle_flatten.py +48 -0
  40. monarch/common/pipe.py +152 -0
  41. monarch/common/process_group.py +55 -0
  42. monarch/common/recording.py +127 -0
  43. monarch/common/reference.py +33 -0
  44. monarch/common/remote.py +304 -0
  45. monarch/common/selection.py +9 -0
  46. monarch/common/shape.py +204 -0
  47. monarch/common/stream.py +111 -0
  48. monarch/common/tensor.py +793 -0
  49. monarch/common/tensor_factory.py +31 -0
  50. monarch/common/tree.py +73 -0
  51. monarch/controller/__init__.py +7 -0
  52. monarch/controller/backend.py +223 -0
  53. monarch/controller/controller.py +223 -0
  54. monarch/controller/debugger.py +47 -0
  55. monarch/controller/history.py +90 -0
  56. monarch/controller/rust_backend/__init__.py +7 -0
  57. monarch/controller/rust_backend/controller.py +245 -0
  58. monarch/fetch.py +55 -0
  59. monarch/future.py +25 -0
  60. monarch/gradient/__init__.py +11 -0
  61. monarch/gradient/_gradient_generator.pyi +22 -0
  62. monarch/gradient/_gradient_generator.so +0 -0
  63. monarch/gradient_generator.py +185 -0
  64. monarch/memory.py +43 -0
  65. monarch/monarch_controller +0 -0
  66. monarch/notebook.py +761 -0
  67. monarch/opaque_module.py +235 -0
  68. monarch/opaque_object.py +88 -0
  69. monarch/parallel/__init__.py +9 -0
  70. monarch/parallel/pipelining/__init__.py +7 -0
  71. monarch/parallel/pipelining/runtime.py +847 -0
  72. monarch/parallel/pipelining/schedule_ir.py +692 -0
  73. monarch/parallel/pipelining/scheduler.py +249 -0
  74. monarch/proc_mesh.py +188 -0
  75. monarch/profiler.py +160 -0
  76. monarch/python_local_mesh.py +107 -0
  77. monarch/random.py +61 -0
  78. monarch/rdma.py +190 -0
  79. monarch/remote_class.py +114 -0
  80. monarch/rust_backend_mesh.py +280 -0
  81. monarch/rust_local_mesh.py +1402 -0
  82. monarch/sim_mesh.py +357 -0
  83. monarch/simulator/__init__.py +7 -0
  84. monarch/simulator/command_history.py +424 -0
  85. monarch/simulator/config.py +21 -0
  86. monarch/simulator/interface.py +59 -0
  87. monarch/simulator/ir.py +770 -0
  88. monarch/simulator/mock_controller.py +214 -0
  89. monarch/simulator/profiling.py +424 -0
  90. monarch/simulator/simulator.py +1052 -0
  91. monarch/simulator/task.py +255 -0
  92. monarch/simulator/tensor.py +373 -0
  93. monarch/simulator/trace.py +395 -0
  94. monarch/simulator/utils.py +41 -0
  95. monarch/simulator/worker.py +389 -0
  96. monarch/tensor_worker_main.py +260 -0
  97. monarch/tensorboard.py +84 -0
  98. monarch/timer/__init__.py +21 -0
  99. monarch/timer/example_monarch.py +78 -0
  100. monarch/timer/example_spmd.py +55 -0
  101. monarch/timer/execution_timer.py +199 -0
  102. monarch/timer/execution_timer_test.py +131 -0
  103. monarch/tools/__init__.py +7 -0
  104. monarch/tools/cli.py +167 -0
  105. monarch/tools/commands.py +189 -0
  106. monarch/tools/components/__init__.py +7 -0
  107. monarch/tools/components/hyperactor.py +57 -0
  108. monarch/tools/config/__init__.py +20 -0
  109. monarch/tools/config/defaults.py +54 -0
  110. monarch/tools/mesh_spec.py +121 -0
  111. monarch/worker/__init__.py +7 -0
  112. monarch/worker/_testing_function.py +481 -0
  113. monarch/worker/compiled_block.py +270 -0
  114. monarch/worker/debugger.py +125 -0
  115. monarch/worker/lines.py +47 -0
  116. monarch/worker/monitor.py +53 -0
  117. monarch/worker/worker.py +1191 -0
  118. monarch/world_mesh.py +34 -0
  119. monarch_supervisor/__init__.py +1044 -0
  120. monarch_supervisor/_testing.py +44 -0
  121. monarch_supervisor/function_call.py +30 -0
  122. monarch_supervisor/host.py +386 -0
  123. monarch_supervisor/launchers.py +145 -0
  124. monarch_supervisor/log_pstree.py +48 -0
  125. monarch_supervisor/logging.py +103 -0
  126. monarch_supervisor/python_executable.py +42 -0
  127. tests/__init__.py +0 -0
  128. tests/dispatch_bench.py +124 -0
  129. tests/dispatch_bench_helper.py +25 -0
  130. tests/error_test_binary.py +139 -0
  131. tests/simulator/__init__.py +0 -0
  132. tests/simulator/test_profiling.py +136 -0
  133. tests/simulator/test_simulator.py +411 -0
  134. tests/simulator/test_task.py +64 -0
  135. tests/simulator/test_worker.py +102 -0
  136. tests/sleep_binary.py +35 -0
  137. tests/test_actor_error.py +112 -0
  138. tests/test_alloc.py +25 -0
  139. tests/test_coalescing.py +492 -0
  140. tests/test_controller.py +835 -0
  141. tests/test_device_mesh.py +132 -0
  142. tests/test_fault_tolerance.py +398 -0
  143. tests/test_future.py +94 -0
  144. tests/test_grad_generator.py +121 -0
  145. tests/test_mock_cuda.py +74 -0
  146. tests/test_pdb_actor.py +110 -0
  147. tests/test_python_actors.py +372 -0
  148. tests/test_remote_functions.py +1271 -0
  149. tests/test_rust_backend.py +182 -0
  150. tests/test_signal_safe_block_on.py +103 -0
  151. tests/test_sim_backend.py +54 -0
  152. torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
  153. torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
  154. torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
  155. torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
  156. torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
  157. torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0
tests/test_grad_generator.py @@ -0,0 +1,121 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ from unittest import main, TestCase
+
+ import torch
+ from monarch.gradient._gradient_generator import GradientGenerator
+ from monarch.gradient_generator import gradient_execution_order
+
+
+ class TestGradIter(TestCase):
+     def checkEqual(self, r, r2):
+         self.assertEqual(len(r), len(r2))
+         for i, i2 in zip(r, r2):
+             self.assertTrue((i is None and i2 is None) or torch.allclose(i, i2))
+
+     def test_simple(self):
+         t = torch.rand(2, requires_grad=True)
+         t2 = torch.rand(2, requires_grad=True)
+
+         _ = t + t2
+         a, b = torch.std_mean(t + t2)
+
+         r2 = torch.autograd.grad([a, b], [t2, t], retain_graph=True)
+         r = list(GradientGenerator([a, b], [t2, t]))
+         print(a, b)
+         print(a.grad_fn, b.grad_fn)
+
+         print(r)
+         self.checkEqual(r, r2)
+
+     def test_pipeline_like(self):
+         t = torch.rand(3, 3, requires_grad=True)
+
+         w1 = torch.rand(3, 2, requires_grad=True)
+         w2 = torch.rand(3, 2, requires_grad=True)
+         w3 = torch.rand(3, 2, requires_grad=True)
+
+         u = torch.rand(3, 2, requires_grad=True)
+
+         _ = u * u
+
+         w4 = torch.rand(2, 3, requires_grad=True)
+         w5 = torch.rand(2, 3, requires_grad=True)
+         w6 = torch.rand(2, 3, requires_grad=True)
+
+         from torch.nn.functional import relu
+
+         a = relu(t @ (w1 @ w4))
+         a = relu(a @ (w2 @ w5))
+         a = relu(a @ (w3 @ w6))
+
+         std, mean = torch.std_mean(a)
+         loss = std + std
+
+         cgrads = torch.autograd.grad(
+             [loss], [t, w3, w6, u, w2, w5], allow_unused=True, retain_graph=True
+         )
+         gi = GradientGenerator([loss], [t, w3, w6, u, w2, w5])
+         grads = [*gi]
+         self.checkEqual(grads, cgrads)
+
+     def test_tree(self):
+         t = torch.rand(3, 3, requires_grad=True)
+
+         t2 = t + t
+         t3 = t * t
+         t4 = t / t
+         t5 = t - t
+
+         t6 = t2 * t3
+         t7 = t4 * t5
+         t8 = t2 * t4
+         t9 = t3 * t5
+         t10 = t6 + t7 + t8 + t9
+
+         t11 = t10.sum()
+
+         cgrads = torch.autograd.grad([t11], [t2, t], retain_graph=True)
+         gi = GradientGenerator([t11], [t2, t])
+         grads = [*gi]
+         self.checkEqual(grads, cgrads)
+
+     def test_broadcast(self):
+         t = torch.rand(3, 3, requires_grad=True)
+         t2 = torch.rand(3, requires_grad=True)
+         t3 = t2 / t2
+
+         r = (t * t3).sum()
+         cgrads = torch.autograd.grad([r], [t, t2], retain_graph=True)
+         gi = GradientGenerator([r], [t, t2])
+         grads = [*gi]
+         self.checkEqual(grads, cgrads)
+
+     def test_grad_order(self):
+         t = torch.rand(3, 3, requires_grad=True)
+         w1 = torch.rand(3, 3, requires_grad=True)
+         w2 = torch.rand(3, 3, requires_grad=True)
+         w3 = torch.rand(3, 3, requires_grad=True)
+
+         u = torch.rand(3, 2, requires_grad=True)
+         _ = u * u
+         from torch.nn.functional import relu
+
+         a = relu(t @ w1)
+         a = relu(a @ w2)
+         a = relu(a @ w3)
+
+         std, mean = torch.std_mean(a)
+         loss = std + std
+
+         order = gradient_execution_order([loss], [w2, w3, w1, a])
+         self.assertEqual(order, [3, 1, 0, 2])
+
+
+ if __name__ == "__main__":
+     main()
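Note on the API exercised above: GradientGenerator is constructed with a list of outputs and a list of inputs and is then iterated, yielding one gradient per requested input, while torch.autograd.grad returns the whole tuple at once. A minimal sketch of that comparison, using only calls that appear in this file (illustrative, not an additional file in the package):

    import torch
    from monarch.gradient._gradient_generator import GradientGenerator

    t = torch.rand(3, requires_grad=True)
    loss = (t * t).sum()

    # Eager reference: all gradients computed in one call.
    expected = torch.autograd.grad([loss], [t], retain_graph=True)

    # GradientGenerator yields gradients lazily, one per requested input,
    # which is what the tests above compare against the eager result.
    for got, want in zip(GradientGenerator([loss], [t]), expected):
        assert torch.allclose(got, want)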
tests/test_mock_cuda.py @@ -0,0 +1,74 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ from unittest import main, TestCase
+
+ import pytest
+ import torch
+ import monarch.common.mock_cuda  # usort: skip
+
+
+ def simple_forward_backward(device: str) -> None:
+     torch.manual_seed(123)
+     m = torch.nn.Sequential(torch.nn.Linear(3, 3), torch.nn.ReLU()).to(device)
+     x = torch.rand(10, 3).to(device)
+     y = m(x)
+     loss_fn = torch.nn.CrossEntropyLoss()
+     loss = loss_fn(y, torch.randint(3, (10,)).to(device))
+     # Under the hood, enabling/disabling CUDA mocking is done with a thread-local
+     # flag. By default, backward() executes ops on a different thread than the one
+     # we enabled mocking on, which would lead to an invalid memory access. So we need
+     # to disable multithreading for backward.
+     with torch.autograd.set_multithreading_enabled(False):
+         loss.backward()
+     # pyre-ignore: Incompatible return type [7]: Expected `None` but got `Tuple[typing.Any, Union[None, Tensor, Module], Union[None, Tensor, Module]]`.
+     return y, m[0].weight.grad, m[0].bias.grad
+
+
+ # Mock cuda depends on initialization load order
+ # For OSS, run this test separately until it can be run in a subprocess.
+ @pytest.mark.oss_skip
+ class TestMockCuda(TestCase):
+     def setUp(self) -> None:
+         return super().setUp()
+
+     def test_output_is_garbage(self):
+         with monarch.common.mock_cuda.mock_cuda_guard():
+             x = torch.arange(9, device="cuda", dtype=torch.float32).reshape(3, 3)
+             y = 2 * torch.eye(3, device="cuda")
+             true_output = torch.tensor(
+                 [[0, 2, 4], [6, 8, 10], [12, 14, 16]], dtype=torch.float32
+             )
+             self.assertFalse(torch.equal((x @ y).cpu(), true_output))
+
+     def test_simple_forward_backward(self):
+         # This test just makes sure that the forward and backward pass work
+         # and don't crash.
+         simple_forward_backward("cuda")
+
+     def test_turn_mock_on_and_off(self):
+         cpu_y, cpu_dw, cpu_db = simple_forward_backward("cpu")
+
+         real_y, real_dw, real_db = simple_forward_backward("cuda")
+         self.assertTrue(torch.allclose(cpu_y, real_y.cpu()))
+         self.assertTrue(torch.allclose(cpu_dw, real_dw.cpu()))
+         self.assertTrue(torch.allclose(cpu_db, real_db.cpu()))
+
+         with monarch.common.mock_cuda.mock_cuda_guard():
+             mocked_y, mocked_dw, mocked_db = simple_forward_backward("cuda")
+             self.assertFalse(torch.allclose(cpu_y, mocked_y.cpu()))
+             self.assertFalse(torch.allclose(cpu_dw, mocked_dw.cpu()))
+             self.assertFalse(torch.allclose(cpu_db, mocked_db.cpu()))
+
+         real_y, real_dw, real_db = simple_forward_backward("cuda")
+         self.assertTrue(torch.allclose(cpu_y, real_y.cpu()))
+         self.assertTrue(torch.allclose(cpu_dw, real_dw.cpu()))
+         self.assertTrue(torch.allclose(cpu_db, real_db.cpu()))
+
+
+ if __name__ == "__main__":
+     main()
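The comment in simple_forward_backward is the key operational detail here: the CUDA mock is toggled by a thread-local flag, so the backward pass must run on the thread that enabled mocking. A condensed sketch of that usage pattern, assuming the same mock_cuda_guard context manager used above (illustrative only):

    import torch
    import monarch.common.mock_cuda

    with monarch.common.mock_cuda.mock_cuda_guard():
        # Inside the guard CUDA ops run mocked, so results are garbage values
        # (exactly what test_output_is_garbage asserts above).
        x = torch.rand(4, 4, device="cuda", requires_grad=True)
        loss = (x @ x).sum()
        # Autograd normally runs backward on a separate thread, which would not
        # see the thread-local mocking flag, so multithreading is disabled.
        with torch.autograd.set_multithreading_enabled(False):
            loss.backward()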
tests/test_pdb_actor.py @@ -0,0 +1,110 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import sys
+ import traceback
+ from contextlib import contextmanager
+ from typing import Generator
+
+ import pytest
+
+ import torch
+
+ from monarch import DeviceMesh, fetch_shard, remote, rust_local_mesh
+ from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monarch/monarch_extension:monarch_extension
+     ClientActor,
+     DebuggerMessage as ClientDebuggerMessage,
+ )
+
+ from monarch._rust_bindings.monarch_extension.debugger import (
+     DebuggerMessage as PdbDebuggerMessage,
+     get_bytes_from_write_action,
+ )
+ from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
+ from monarch.rust_local_mesh import LoggingLocation, SocketType
+ from monarch_supervisor.logging import fix_exception_lines
+
+
+ def custom_excepthook(exc_type, exc_value, exc_traceback):
+     tb_lines = fix_exception_lines(
+         traceback.format_exception(exc_type, exc_value, exc_traceback)
+     )
+     print("\n".join(tb_lines), file=sys.stderr)
+
+
+ sys.excepthook = custom_excepthook
+
+
+ @contextmanager
+ def local_mesh(
+     hosts: int = 1, gpu_per_host: int = 2, activate: bool = True
+ ) -> Generator[DeviceMesh, None, None]:
+     with rust_local_mesh.local_mesh(
+         hosts=hosts,
+         gpus_per_host=gpu_per_host,
+         socket_type=SocketType.UNIX,
+         logging_location=LoggingLocation.DEFAULT,
+     ) as dm:
+         try:
+             if activate:
+                 with dm.activate():
+                     yield dm
+             else:
+                 yield dm
+             dm.exit()
+         except Exception:
+             dm.client._shutdown = True
+             raise
+
+
+ remote_test_pdb_actor = remote(
+     "monarch.worker._testing_function.test_pdb_actor",
+     propagate=lambda: torch.zeros(1),
+ )
+
+
+ @pytest.mark.skipif(
+     torch.cuda.device_count() < 2,
+     reason="Not enough GPUs, this test requires at least 2 GPUs",
+ )
+ # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
+ # out is not counted as a failure, so we set a more restrictive timeout to
+ # ensure we see a hard failure in CI.
+ @pytest.mark.timeout(120)
+ class TestPdbActor:
+     def test_pdb_actor(self):
+         with local_mesh(1, 1) as dm:
+             with dm.activate():
+                 client = dm.client.inner._actor
+                 assert isinstance(client, ClientActor)
+                 fut = fetch_shard(remote_test_pdb_actor())
+                 msg = client.get_next_message(timeout_msec=None)
+                 assert isinstance(msg, ClientDebuggerMessage)
+                 assert isinstance(msg.action, DebuggerAction.Paused)
+                 client.send(
+                     msg.debugger_actor_id,
+                     PdbDebuggerMessage(action=DebuggerAction.Attach()).serialize(),
+                 )
+                 msg = client.get_next_message(timeout_msec=None)
+                 assert isinstance(msg, ClientDebuggerMessage)
+                 assert isinstance(msg.action, DebuggerAction.Read)
+                 assert msg.action.requested_size == 4
+                 client.send(
+                     msg.debugger_actor_id,
+                     PdbDebuggerMessage(
+                         action=DebuggerAction.Write(b"1234")
+                     ).serialize(),
+                 )
+                 msg = client.get_next_message(timeout_msec=None)
+                 assert isinstance(msg, ClientDebuggerMessage)
+                 assert isinstance(msg.action, DebuggerAction.Write)
+                 assert get_bytes_from_write_action(msg.action) == b"5678"
+                 client.send(
+                     msg.debugger_actor_id,
+                     PdbDebuggerMessage(action=DebuggerAction.Detach()).serialize(),
+                 )
+                 fut.result()
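The assertions above encode the debugger handshake: the worker reports Paused, the client Attaches, the worker asks to Read 4 bytes of pdb input, the client Writes them, the worker Writes back its pdb output, and the client Detaches. A client-side sketch of that loop, using only the bindings imported in this test; drive_debugger_once is a hypothetical helper name for illustration, not an API in the package:

    from monarch._rust_bindings.monarch_extension.debugger import (
        DebuggerMessage as PdbDebuggerMessage,
        get_bytes_from_write_action,
    )
    from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction


    def drive_debugger_once(client, pdb_input: bytes = b"1234") -> bytes:
        # Worker announces that it is paused in the debugger.
        msg = client.get_next_message(timeout_msec=None)
        client.send(
            msg.debugger_actor_id,
            PdbDebuggerMessage(action=DebuggerAction.Attach()).serialize(),
        )
        # Worker requests pdb input; send it.
        msg = client.get_next_message(timeout_msec=None)
        client.send(
            msg.debugger_actor_id,
            PdbDebuggerMessage(action=DebuggerAction.Write(pdb_input)).serialize(),
        )
        # Worker writes its pdb output back, then the client detaches.
        msg = client.get_next_message(timeout_msec=None)
        output = get_bytes_from_write_action(msg.action)
        client.send(
            msg.debugger_actor_id,
            PdbDebuggerMessage(action=DebuggerAction.Detach()).serialize(),
        )
        return output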
tests/test_python_actors.py @@ -0,0 +1,372 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import operator
+ from types import ModuleType
+
+ import torch
+ from monarch.actor_mesh import (
+     Accumulator,
+     Actor,
+     current_actor_name,
+     current_rank,
+     current_size,
+     endpoint,
+ )
+
+ from monarch.proc_mesh import local_proc_mesh, proc_mesh
+ from monarch.rdma import RDMABuffer
+
+
+ class Counter(Actor):
+     def __init__(self, v: int):
+         self.v = v
+
+     @endpoint
+     async def incr(self):
+         self.v += 1
+
+     @endpoint
+     async def value(self) -> int:
+         return self.v
+
+
+ class Indirect(Actor):
+     @endpoint
+     async def call_value(self, c: Counter) -> int:
+         return await c.value.choose()
+
+
+ class ParameterServer(Actor):
+     def __init__(self):
+         self.params = torch.rand(10, 10)
+         self.grad_buffer = torch.rand(10, 10)
+
+     @endpoint
+     async def grad_handle(self) -> RDMABuffer:
+         byte_tensor = self.grad_buffer.view(torch.uint8).flatten()
+         return RDMABuffer(byte_tensor)
+
+     @endpoint
+     async def update(self):
+         self.params += 0.01 * self.grad_buffer
+
+     @endpoint
+     async def get_grad_buffer(self) -> torch.Tensor:
+         # just used for testing
+         return self.grad_buffer
+
+
+ async def test_choose():
+     proc = await local_proc_mesh(gpus=2)
+     v = await proc.spawn("counter", Counter, 3)
+     i = await proc.spawn("indirect", Indirect)
+     v.incr.broadcast()
+     result = await v.value.choose()
+     result2 = await i.call_value.choose(v)
+
+     assert result == result2
+
+
+ async def test_stream():
+     proc = await local_proc_mesh(gpus=2)
+     v = await proc.spawn("counter2", Counter, 3)
+     v.incr.broadcast()
+
+     assert 8 == sum([x async for x in v.value.stream()])
+
+
+ class ParameterClient(Actor):
+     def __init__(self, server, buffer):
+         self.server = server
+         byte_tensor = buffer.view(torch.uint8).flatten()
+         self.buffer = byte_tensor
+
+     @endpoint
+     async def upload(self, tensor):
+         gh = await self.server.grad_handle.call_one()
+         await gh.write(tensor)
+
+     @endpoint
+     async def download(self):
+         gh = await self.server.grad_handle.call_one()
+         await gh.read_into(self.buffer)
+
+     @endpoint
+     async def get_buffer(self):
+         return self.buffer
+
+
+ async def test_proc_mesh_rdma():
+     proc = await proc_mesh(gpus=1)
+     server = await proc.spawn("server", ParameterServer)
+
+     # --- CPU TESTS ---
+     client_cpu = await proc.spawn(
+         "client_cpu", ParameterClient, server, torch.ones(10, 10)
+     )
+     x = await client_cpu.get_buffer.call_one()
+     assert torch.sum(x.view(torch.float32).view(10, 10)) == 100
+     zeros = torch.zeros(10, 10)
+     await client_cpu.upload.call_one(zeros.view(torch.uint8).flatten())
+     await client_cpu.download.call_one()
+     x = await client_cpu.get_buffer.call_one()
+     assert torch.sum(x.view(torch.float32).view(10, 10)) == 0
+
+     # --- Modify server's backing buffer directly ---
+     await server.update.call_one()
+
+     # Should reflect updated values
+     await client_cpu.download.call_one()
+
+     buffer = await client_cpu.get_buffer.call_one()
+     remote_grad = await server.get_grad_buffer.call_one()
+     assert torch.allclose(buffer.view(torch.float32).view(10, 10), remote_grad)
+
+     # --- GPU TESTS ---
+     client_gpu = await proc.spawn(
+         "client_gpu", ParameterClient, server, torch.ones(10, 10, device="cuda")
+     )
+     x = await client_gpu.get_buffer.call_one()
+     buffer = x.view(torch.float32).view(10, 10)
+     assert torch.sum(buffer) == 100
+     zeros = torch.zeros(10, 10, device="cuda")
+     await client_gpu.upload.call_one(zeros.view(torch.uint8).flatten())
+     await client_gpu.download.call_one()
+     x = await client_gpu.get_buffer.call_one()
+     buffer_gpu = x.view(torch.float32).view(10, 10)
+     assert torch.sum(buffer_gpu) == 0
+     assert buffer_gpu.device.type == "cuda"
+
+     # Modify server state again
+     await server.update.call_one()
+     await client_gpu.download.call_one()
+     x = await client_gpu.get_buffer.call_one()
+     buffer_gpu = x.view(torch.float32).view(10, 10)
+     remote_grad = await server.get_grad_buffer.call_one()
+     assert torch.allclose(buffer_gpu.cpu(), remote_grad)
+
+
+ class To(Actor):
+     @endpoint
+     async def whoami(self):
+         return current_actor_name()
+
+
+ class From(Actor):
+     @endpoint
+     async def get(self, to: To):
+         return [x async for x in to.whoami.stream()]
+
+
+ async def test_mesh_passed_to_mesh():
+     proc = await local_proc_mesh(gpus=2)
+     f = await proc.spawn("from", From)
+     t = await proc.spawn("to", To)
+     all = [y async for x in f.get.stream(t) for y in x]
+     assert len(all) == 4
+     assert all[0] != all[1]
+
+
+ async def test_mesh_passed_to_mesh_on_different_proc_mesh():
+     proc = await local_proc_mesh(gpus=2)
+     proc2 = await local_proc_mesh(gpus=2)
+     f = await proc.spawn("from", From)
+     t = await proc2.spawn("to", To)
+     all = [y async for x in f.get.stream(t) for y in x]
+     assert len(all) == 4
+     assert all[0] != all[1]
+
+
+ async def test_actor_slicing():
+     proc = await local_proc_mesh(gpus=2)
+     proc2 = await local_proc_mesh(gpus=2)
+
+     f = await proc.spawn("from", From)
+     t = await proc2.spawn("to", To)
+
+     assert await t.slice(gpus=0).whoami.call() != await t.slice(gpus=1).whoami.call()
+
+     result = [y async for x in f.get.stream(t.slice(gpus=0)) for y in x]
+     assert len(result) == 2
+
+     assert result[0] == result[1]
+
+
+ async def test_aggregate():
+     proc = await local_proc_mesh(gpus=2)
+     counter = await proc.spawn("counter", Counter, 1)
+     counter.incr.broadcast()
+     acc = Accumulator(counter.value, 0, operator.add)
+     r = await acc.accumulate()
+     assert r == 4
+
+
+ class RunIt(Actor):
+     @endpoint
+     async def run(self, fn):
+         return fn()
+
+
+ async def test_rank_size():
+     proc = await local_proc_mesh(gpus=2)
+     r = await proc.spawn("runit", RunIt)
+
+     acc = Accumulator(r.run, 0, operator.add)
+
+     assert 1 == await acc.accumulate(lambda: current_rank()["gpus"])
+     assert 4 == await acc.accumulate(lambda: current_size()["gpus"])
+
+
+ class TrainerActor(Actor):
+     def __init__(self):
+         super().__init__()
+         self.trainer = torch.nn.Linear(10, 10).to("cuda")
+         self.trainer.weight.data.zero_()
+
+     @endpoint
+     async def init(self, gen):
+         ranks = current_rank()
+         self.gen = gen.slice(**ranks)
+
+     @endpoint
+     async def exchange_metadata(self):
+         byte_tensor = self.trainer.weight.data.view(torch.uint8).flatten()
+         self.handle = RDMABuffer(byte_tensor)
+         await self.gen.attach_weight_buffer.call(self.handle)
+
+     @endpoint
+     async def weights_ready(self):
+         self.trainer.weight.data.add_(1.0)
+
+
+ class GeneratorActor(Actor):
+     def __init__(self):
+         super().__init__()
+         self.generator = torch.nn.Linear(10, 10).to("cuda")
+         self.step = 0
+
+     @endpoint
+     async def init(self, trainer):
+         ranks = current_rank()
+         self.trainer = trainer.slice(**ranks)
+
+     @endpoint
+     async def attach_weight_buffer(self, handle):
+         self.handle = handle
+
+     @endpoint
+     async def update_weights(self):
+         self.step += 1
+         byte_tensor = self.generator.weight.data.view(torch.uint8).flatten()
+         await self.handle.read_into(byte_tensor)
+         assert (
+             torch.sum(self.generator.weight.data) == self.step * 100
+         ), f"{torch.sum(self.generator.weight.data)=}, {self.step=}"
+
+
+ async def test_gpu_trainer_generator():
+     trainer_proc = await proc_mesh(gpus=1)
+     gen_proc = await proc_mesh(gpus=1)
+     trainer = await trainer_proc.spawn("trainer", TrainerActor)
+     generator = await gen_proc.spawn("gen", GeneratorActor)
+
+     await generator.init.call(trainer)
+     await trainer.init.call(generator)
+     await trainer.exchange_metadata.call()
+
+     for _ in range(3):
+         await trainer.weights_ready.call()
+         await generator.update_weights.call()
+
+
+ class SyncActor(Actor):
+     @endpoint
+     def sync_endpoint(self, a_counter: Counter):
+         return a_counter.value.choose().get()
+
+
+ async def test_sync_actor():
+     proc = await local_proc_mesh(gpus=2)
+     a = await proc.spawn("actor", SyncActor)
+     c = await proc.spawn("counter", Counter, 5)
+     r = await a.sync_endpoint.choose(c)
+     assert r == 5
+
+
+ def test_gpu_trainer_generator_sync() -> None:
+     trainer_proc = proc_mesh(gpus=1).get()
+     gen_proc = proc_mesh(gpus=1).get()
+     trainer = trainer_proc.spawn("trainer", TrainerActor).get()
+     generator = gen_proc.spawn("gen", GeneratorActor).get()
+
+     generator.init.call(trainer).get()
+     trainer.init.call(generator).get()
+     trainer.exchange_metadata.call().get()
+
+     for _ in range(3):
+         trainer.weights_ready.call().get()
+         generator.update_weights.call().get()
+
+
+ def test_sync_actor_sync_client():
+     proc = local_proc_mesh(gpus=2).get()
+     a = proc.spawn("actor", SyncActor).get()
+     c = proc.spawn("counter", Counter, 5).get()
+     r = a.sync_endpoint.choose(c).get()
+     assert r == 5
+
+
+ def test_rank_size_sync() -> None:
+     proc = local_proc_mesh(gpus=2).get()
+     r = proc.spawn("runit", RunIt).get()
+
+     acc = Accumulator(r.run, 0, operator.add)
+     assert 1 == acc.accumulate(lambda: current_rank()["gpus"]).get()
+     assert 4 == acc.accumulate(lambda: current_size()["gpus"]).get()
+
+
+ def test_accumulate_sync() -> None:
+     proc = local_proc_mesh(gpus=2).get()
+     counter = proc.spawn("counter", Counter, 1).get()
+     counter.incr.broadcast()
+     acc = Accumulator(counter.value, 0, operator.add)
+     r = acc.accumulate().get()
+     assert r == 4
+
+
+ class CastToCounter(Actor):
+     @endpoint
+     def doit(self, c: Counter):
+         return list(c.value.call().get())
+
+
+ def test_value_mesh() -> None:
+     proc = local_proc_mesh(gpus=2).get()
+     counter = proc.spawn("counter", Counter, 0).get()
+     counter.slice(hosts=0, gpus=1).incr.broadcast()
+     x = counter.value.call().get()
+     assert 0 == x.item(hosts=0, gpus=0)
+     assert 1 == x.item(hosts=0, gpus=1)
+     assert 1 == x.slice(hosts=0, gpus=1).item()
+     n = proc.spawn("ctc", CastToCounter).get()
+     assert list(x) == n.slice(gpus=0).doit.call_one(counter).get()
+
+
+ def test_rust_binding_modules_correct() -> None:
+     import monarch._rust_bindings as bindings
+
+     def check(module, path):
+         for name, value in module.__dict__.items():
+             if name.startswith("__"):
+                 continue
+             if isinstance(value, ModuleType):
+                 check(value, f"{path}.{name}")
+             elif hasattr(value, "__module__"):
+                 assert value.__name__ == name
+                 assert value.__module__ == path
+
+     check(bindings, "monarch._rust_bindings")
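Taken together, the tests above exercise the endpoint invocation styles: broadcast (fire-and-forget to every actor in the mesh), choose (a single actor), call (a value mesh from every actor), call_one (a single actor's result), stream (async iteration over results), and Accumulator for reductions. A condensed sketch of the basic flow, reusing the Counter actor defined in this file; whether it runs outside the test harness depends on the environment, so treat it as illustrative:

    import asyncio
    import operator

    from monarch.actor_mesh import Accumulator, Actor, endpoint
    from monarch.proc_mesh import local_proc_mesh


    class Counter(Actor):
        def __init__(self, v: int):
            self.v = v

        @endpoint
        async def incr(self):
            self.v += 1

        @endpoint
        async def value(self) -> int:
            return self.v


    async def demo() -> None:
        proc = await local_proc_mesh(gpus=2)                # a local mesh with two slots
        counters = await proc.spawn("counter", Counter, 0)  # one Counter per slot
        counters.incr.broadcast()                           # fire-and-forget to all
        one = await counters.value.choose()                 # value from a single actor
        total = await Accumulator(counters.value, 0, operator.add).accumulate()
        print(one, total)                                   # 1 and 2 for two actors


    if __name__ == "__main__":
        asyncio.run(demo())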