torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +58 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +230 -0
  10. monarch/actor_mesh.py +761 -0
  11. monarch/allocator.py +220 -0
  12. monarch/bootstrap_main.py +59 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +68 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/code_sync.py +10 -0
  18. monarch/common/_C.pyi +11 -0
  19. monarch/common/_C.so +0 -0
  20. monarch/common/__init__.py +0 -0
  21. monarch/common/_coalescing.py +308 -0
  22. monarch/common/_device_utils.py +18 -0
  23. monarch/common/_tensor_to_table.py +172 -0
  24. monarch/common/base_tensor.py +28 -0
  25. monarch/common/borrows.py +143 -0
  26. monarch/common/client.py +690 -0
  27. monarch/common/constants.py +10 -0
  28. monarch/common/context_manager.py +40 -0
  29. monarch/common/controller_api.py +104 -0
  30. monarch/common/device_mesh.py +417 -0
  31. monarch/common/fake.py +55 -0
  32. monarch/common/function.py +160 -0
  33. monarch/common/function_caching.py +164 -0
  34. monarch/common/future.py +168 -0
  35. monarch/common/invocation.py +125 -0
  36. monarch/common/mast.py +221 -0
  37. monarch/common/messages.py +573 -0
  38. monarch/common/mock_cuda.py +41 -0
  39. monarch/common/opaque_ref.py +98 -0
  40. monarch/common/pickle_flatten.py +48 -0
  41. monarch/common/pipe.py +152 -0
  42. monarch/common/process_group.py +55 -0
  43. monarch/common/recording.py +127 -0
  44. monarch/common/reference.py +33 -0
  45. monarch/common/remote.py +297 -0
  46. monarch/common/selection.py +9 -0
  47. monarch/common/shape.py +229 -0
  48. monarch/common/stream.py +114 -0
  49. monarch/common/tensor.py +814 -0
  50. monarch/common/tensor_factory.py +31 -0
  51. monarch/common/tree.py +73 -0
  52. monarch/controller/__init__.py +7 -0
  53. monarch/controller/backend.py +223 -0
  54. monarch/controller/controller.py +223 -0
  55. monarch/controller/debugger.py +47 -0
  56. monarch/controller/history.py +90 -0
  57. monarch/controller/rust_backend/__init__.py +7 -0
  58. monarch/controller/rust_backend/controller.py +245 -0
  59. monarch/debugger.py +379 -0
  60. monarch/fetch.py +55 -0
  61. monarch/future.py +76 -0
  62. monarch/gradient/__init__.py +11 -0
  63. monarch/gradient/_gradient_generator.pyi +22 -0
  64. monarch/gradient/_gradient_generator.so +0 -0
  65. monarch/gradient_generator.py +185 -0
  66. monarch/memory.py +43 -0
  67. monarch/mesh_controller.py +271 -0
  68. monarch/monarch_controller +0 -0
  69. monarch/notebook.py +761 -0
  70. monarch/opaque_module.py +235 -0
  71. monarch/opaque_object.py +88 -0
  72. monarch/parallel/__init__.py +9 -0
  73. monarch/parallel/pipelining/__init__.py +7 -0
  74. monarch/parallel/pipelining/runtime.py +847 -0
  75. monarch/parallel/pipelining/schedule_ir.py +692 -0
  76. monarch/parallel/pipelining/scheduler.py +249 -0
  77. monarch/pdb_wrapper.py +135 -0
  78. monarch/proc_mesh.py +299 -0
  79. monarch/profiler.py +160 -0
  80. monarch/python_local_mesh.py +107 -0
  81. monarch/random.py +61 -0
  82. monarch/rdma.py +162 -0
  83. monarch/remote_class.py +114 -0
  84. monarch/rust_backend_mesh.py +280 -0
  85. monarch/rust_local_mesh.py +1402 -0
  86. monarch/sim_mesh.py +359 -0
  87. monarch/simulator/__init__.py +7 -0
  88. monarch/simulator/command_history.py +424 -0
  89. monarch/simulator/config.py +21 -0
  90. monarch/simulator/interface.py +59 -0
  91. monarch/simulator/ir.py +770 -0
  92. monarch/simulator/mock_controller.py +214 -0
  93. monarch/simulator/profiling.py +424 -0
  94. monarch/simulator/simulator.py +1052 -0
  95. monarch/simulator/task.py +255 -0
  96. monarch/simulator/tensor.py +373 -0
  97. monarch/simulator/trace.py +395 -0
  98. monarch/simulator/utils.py +41 -0
  99. monarch/simulator/worker.py +389 -0
  100. monarch/telemetry.py +19 -0
  101. monarch/tensor_worker_main.py +260 -0
  102. monarch/tensorboard.py +84 -0
  103. monarch/timer/__init__.py +21 -0
  104. monarch/timer/example_monarch.py +78 -0
  105. monarch/timer/example_spmd.py +55 -0
  106. monarch/timer/execution_timer.py +199 -0
  107. monarch/timer/execution_timer_test.py +131 -0
  108. monarch/tools/__init__.py +7 -0
  109. monarch/tools/cli.py +167 -0
  110. monarch/tools/commands.py +251 -0
  111. monarch/tools/components/__init__.py +7 -0
  112. monarch/tools/components/hyperactor.py +58 -0
  113. monarch/tools/config/__init__.py +20 -0
  114. monarch/tools/config/defaults.py +54 -0
  115. monarch/tools/mesh_spec.py +165 -0
  116. monarch/tools/network.py +69 -0
  117. monarch/worker/__init__.py +7 -0
  118. monarch/worker/_testing_function.py +481 -0
  119. monarch/worker/compiled_block.py +270 -0
  120. monarch/worker/debugger.py +125 -0
  121. monarch/worker/lines.py +47 -0
  122. monarch/worker/monitor.py +53 -0
  123. monarch/worker/worker.py +1191 -0
  124. monarch/world_mesh.py +34 -0
  125. monarch_supervisor/__init__.py +1044 -0
  126. monarch_supervisor/_testing.py +44 -0
  127. monarch_supervisor/function_call.py +30 -0
  128. monarch_supervisor/host.py +386 -0
  129. monarch_supervisor/launchers.py +145 -0
  130. monarch_supervisor/log_pstree.py +48 -0
  131. monarch_supervisor/logging.py +103 -0
  132. monarch_supervisor/python_executable.py +42 -0
  133. tests/__init__.py +0 -0
  134. tests/dispatch_bench.py +124 -0
  135. tests/dispatch_bench_helper.py +25 -0
  136. tests/error_test_binary.py +180 -0
  137. tests/simulator/__init__.py +0 -0
  138. tests/simulator/test_profiling.py +136 -0
  139. tests/simulator/test_simulator.py +411 -0
  140. tests/simulator/test_task.py +64 -0
  141. tests/simulator/test_worker.py +102 -0
  142. tests/sleep_binary.py +35 -0
  143. tests/test_actor_error.py +240 -0
  144. tests/test_alloc.py +25 -0
  145. tests/test_allocator.py +365 -0
  146. tests/test_coalescing.py +492 -0
  147. tests/test_controller.py +845 -0
  148. tests/test_device_mesh.py +132 -0
  149. tests/test_fault_tolerance.py +398 -0
  150. tests/test_future.py +94 -0
  151. tests/test_grad_generator.py +121 -0
  152. tests/test_mock_cuda.py +74 -0
  153. tests/test_pdb_actor.py +110 -0
  154. tests/test_python_actors.py +736 -0
  155. tests/test_remote_functions.py +1271 -0
  156. tests/test_rust_backend.py +217 -0
  157. tests/test_signal_safe_block_on.py +103 -0
  158. tests/test_sim_backend.py +54 -0
  159. tests/test_tensor_engine.py +52 -0
  160. torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
  161. torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
  162. torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
  163. torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
  164. torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
  165. torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
tests/test_controller.py
@@ -0,0 +1,845 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+import itertools
+import logging
+import re
+import sys
+import traceback
+from contextlib import contextmanager
+
+import monarch
+import monarch.random
+import pytest
+
+import torch
+
+from monarch import (
+    DeviceMesh,
+    fetch_shard,
+    grad_function,
+    grad_generator,
+    no_mesh,
+    Stream,
+    Tensor,
+)
+
+from monarch._testing import BackendType, TestingContext
+from monarch.common.controller_api import LogMessage
+from monarch.common.invocation import DeviceException
+from monarch.common.remote import remote
+from monarch.common.tree import flattener
+from monarch.rust_local_mesh import (
+    ControllerParams,
+    local_mesh,
+    local_meshes_and_bootstraps,
+    LoggingLocation,
+    SocketType,
+    SupervisionParams,
+)
+from monarch_supervisor.logging import fix_exception_lines
+
+
+def custom_excepthook(exc_type, exc_value, exc_traceback):
+    tb_lines = fix_exception_lines(
+        traceback.format_exception(exc_type, exc_value, exc_traceback)
+    )
+    print("\n".join(tb_lines), file=sys.stderr)
+
+
+sys.excepthook = custom_excepthook
+
+
+@pytest.fixture(scope="module", autouse=True)
+def testing_context():
+    global local
+    with TestingContext() as local:
+        yield
+
+
+@contextmanager
+def local_rust_device_mesh(
+    hosts,
+    gpu_per_host,
+    activate: bool = True,
+    controller_params: ControllerParams | None = None,
+):
+    with local_mesh(
+        hosts=hosts,
+        gpus_per_host=gpu_per_host,
+        socket_type=SocketType.UNIX,
+        logging_location=LoggingLocation.FILE,
+        controller_params=controller_params,
+    ) as dm:
+        try:
+            if activate:
+                with dm.activate():
+                    yield dm
+            else:
+                yield dm
+            dm.exit()
+        except Exception:
+            dm.client._shutdown = True
+            raise
+
+
+panic = remote("__test_panic", propagate="inspect")
+
+remote_sleep = remote("time.sleep", propagate="inspect")
+
+
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2,
+    reason="Not enough GPUs, this test requires at least 2 GPUs",
+)
+@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
+# Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
+# out is not counted as a failure, so we set a more restrictive timeout to
+# ensure we see a hard failure in CI.
+@pytest.mark.timeout(120)
+class TestController:
+    @classmethod
+    def local_device_mesh(
+        cls,
+        N,
+        gpu_per_host,
+        backend_type,
+        activate=True,
+    ):
+        return local.local_device_mesh(
+            N,
+            gpu_per_host,
+            activate,
+            backend=str(backend_type),
+        )
+
+    def test_errors(self, backend_type):
+        t = torch.rand(3, 4)
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            y = torch.rand(3, 4)
+            with pytest.raises(TypeError, match="LOCAL_TENSOR"):
+                t.add(y)
+            with pytest.raises(TypeError, match="WRONG_MESH"):
+                sm = device_mesh.slice(host=0)
+                with sm.activate():
+                    x = torch.rand(3, 4)
+                    x.add(y)
+
+            other = Stream("other")
+            t = torch.rand(10).cuda()
+            with pytest.raises(TypeError, match="WRONG_STREAM"):
+                with other.activate():
+                    t = t.reduce("host", "sum")
+
+    def test_sub_mesh(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            h0 = device_mesh.slice(host=0)
+            h1 = device_mesh.slice(host=1)
+            with h0.activate():
+                _ = torch.rand(3, 4)
+            with h1.activate():
+                _ = torch.rand(3, 4)
+                # Runs on a different mesh but should still work
+
+    def test_fetch_result_device(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type):
+            on_gpu = torch.ones(2, 3, device="cuda")
+            on_cpu = torch.ones(2, 3, device="cpu")
+
+            on_gpu_local = fetch_shard(on_gpu).result()
+            on_cpu_local = fetch_shard(on_cpu).result()
+
+        assert on_gpu_local.device == torch.device("cpu")
+        assert on_cpu_local.device == torch.device("cpu")
+
+    def test_dim1_mesh(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
+            mesh3d = device_mesh.split(host=("oh", "ih"), ih=1)
+            with mesh3d.activate():
+                x = torch.ones(3, 4)
+                local_x = fetch_shard(x).result()
+
+        assert torch.equal(local_x, torch.ones(3, 4))
+
+    def test_sub_mesh_use_only_one(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
+            h0 = device_mesh.slice(host=0)
+
+            with h0.activate():
+                x = torch.ones(3, 4)
+                local_x = fetch_shard(x)
+
+        local_x = local_x.result(timeout=20)
+        assert torch.equal(local_x, torch.ones(3, 4))
+
+    def test_sub_mesh_process_grop(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
+            h0 = device_mesh.slice(host=0)
+            pg0 = h0.process_group(("gpu",))
+            pg1 = h0.process_group(("gpu",))
+            # Is there a way to functionally test that these two PG's aren't
+            # the same in the backend?
+            assert pg0 != pg1
+
+    def test_reduce(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            x = (
+                12 * 2 * device_mesh.rank("host")
+                + 12 * device_mesh.rank("gpu")
+                + torch.arange(12, device="cuda").reshape(3, 4)
+            )
+            y = x.reduce("gpu", "sum")
+            g = x.reduce("gpu", "stack")
+            with pytest.raises(TypeError, match="When scattering"):
+                x = x.reduce("gpu", "sum", scatter=True)
+            x = x.reshape(2, 6)
+            atoa = x.reduce("gpu", "stack", scatter=True)
+            rs = x.reduce("gpu", "sum", scatter=True)
+            rad = x.reduce((), "sum")
+            rade = x.reduce(("gpu", "host"), "sum")
+            with pytest.raises(
+                ValueError, match="is not valid for multiple dimensions"
+            ):
+                x.reduce((), "sum", scatter=True)
+            with pytest.raises(
+                ValueError, match="is not valid for multiple dimensions"
+            ):
+                x.reduce((), "stack")
+            with pytest.raises(
+                ValueError, match="is not valid for multiple dimensions"
+            ):
+                x.reduce((), "stack", scatter=True)
+            y_local = fetch_shard(y).result()
+            g_local = fetch_shard(g).result()
+            # TODO compute the expected values to compare against in the below section
+            _ = fetch_shard(atoa).result()
+            _ = fetch_shard(rs).result()
+            rad_local = fetch_shard(rad).result()
+            rade_local = fetch_shard(rade).result()
+
+        xs = {
+            (h, g): 12 * 2 * h + 12 * g + torch.arange(12, device="cpu").reshape(3, 4)
+            for h, g in itertools.product(range(2), range(2))
+        }
+
+        y_expected = xs[(0, 0)] + xs[(0, 1)]
+        g_expected = torch.stack([xs[(0, 0)], xs[(0, 1)]])
+        assert torch.equal(y_local, y_expected)
+        assert torch.equal(g_local, g_expected)
+        rad_expected = (xs[(0, 0)] + xs[(0, 1)] + xs[(1, 0)] + xs[(1, 1)]).reshape(
+            rad_local.shape
+        )
+        assert torch.equal(rad_local, rad_expected)
+        assert torch.equal(rade_local, rad_expected)
+
+        # test is run on 4 GPUs, can't have mesh with 3 non-trivial dimensions
+        with self.local_device_mesh(2, 2, backend_type, activate=False) as mesh2d:
+            device_mesh = mesh2d.split(host=("oh", "ih"), ih=1)
+            with device_mesh.activate():
+                x = (
+                    12 * 2 * device_mesh.rank("oh")
+                    + 12 * device_mesh.rank("gpu")
+                    + torch.arange(12, device="cuda").reshape(3, 4)
+                )
+                y = x.reduce(("ih", "gpu"), "sum")
+                y_local = fetch_shard(y).result()
+                z = x.reduce(("oh", "gpu"), "sum")
+                z_local = fetch_shard(z).result()
+
+        assert torch.equal(y_local, y_expected)
+        assert torch.equal(z_local, rad_expected.reshape(z_local.shape))
+
+    def test_reduce_out(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type):
+            inp = torch.rand(2, 4, device="cuda")
+            out_incorrect = torch.rand(2, 4, device="cuda")
+            out = torch.rand(4, device="cuda")
+
+            with pytest.raises(
+                ValueError, match="Reduce expects the shape to be torch.Size."
+            ):
+                _ = inp.reduce("host", reduction="sum", scatter=True, out=out_incorrect)
+
+            reduce_out = inp.reduce("host", reduction="sum", scatter=True)
+            local_out = fetch_shard(out).result()
+            local_reduce_out = fetch_shard(reduce_out).result()
+            assert out._fake is not reduce_out._fake
+            with no_mesh.activate():
+                assert not torch.equal(local_out, local_reduce_out)
+
+            reduce_out = inp.reduce("host", reduction="sum", scatter=True, out=out)
+            local_out = fetch_shard(out).result()
+            local_reduce_out = fetch_shard(reduce_out).result()
+            assert out._fake is reduce_out._fake
+            with no_mesh.activate():
+                assert torch.equal(local_out, local_reduce_out)
+
+    def test_fetch(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            h = device_mesh.rank("host")
+            g = device_mesh.rank("gpu")
+            for hi in range(2):
+                for gi in range(2):
+                    x, y = fetch_shard((h, g), {"host": hi, "gpu": gi}).result()
+                    with no_mesh.activate():
+                        assert (hi, gi) == (x.item(), y.item())
+
+    def test_mutate(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type):
+            x = torch.rand(3, 4).cuda()
+            x.abs_()
+            s = Stream("other")
+            b, drop = s.borrow(x)
+            with pytest.raises(TypeError, match="would be mutated"):
+                x.abs_()
+            with s.activate():
+                _ = b.add(b)
+            drop.drop()
+            x.abs_()
+            b, drop = s.borrow(x, mutable=True)
+            with s.activate():
+                b.abs_()
+            drop.drop()
+            # del b
+            x.abs_()
+
+    def test_movement(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            sm0 = device_mesh.slice(host=0)
+            sm1 = device_mesh.slice(host=1)
+
+            with sm0.activate():
+                x = torch.rand(3, 4, device="cuda")
+                _ = x.to_mesh(sm1)
+
+            a = torch.rand(3, 4, device="cuda")
+
+            b = a.slice_mesh(host=0)
+            _ = b.to_mesh(sm0)
+            _ = b.to_mesh(sm1)
+
+    def test_broadcast_one(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            for dim in ("host", "gpu"):
+                subset = device_mesh.slice(**{dim: 1})
+                with subset.activate():
+                    x = torch.rand(3, device="cuda")
+                    y = x.to_mesh(device_mesh)
+
+                with subset.activate():
+                    a = monarch.inspect(x)
+                with device_mesh.activate():
+                    b = monarch.inspect(y.reduce(dim, reduction="stack"))
+                with no_mesh.activate():
+                    assert torch.allclose(a.expand(2, -1), b, rtol=0, atol=0)
+
+    def test_broadcast_two(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            subset = device_mesh.slice(host=1, gpu=1)
+            with subset.activate():
+                x = torch.rand(3, device="cuda")
+                y = x.to_mesh(device_mesh)
+
+            with subset.activate():
+                a = monarch.inspect(x)
+            with device_mesh.activate():
+                b = monarch.inspect(
+                    y.reduce("host", reduction="stack").reduce("gpu", reduction="stack")
+                )
+            with no_mesh.activate():
+                assert torch.allclose(a.expand(2, 2, -1), b, rtol=0, atol=0)
+
+    def test_autograd(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            x = torch.rand(3, 4, requires_grad=True)
+            y = torch.rand(4, 3, requires_grad=True)
+            z = torch.rand(3, requires_grad=True)
+
+            foo = (x @ y + z).sum()
+            with no_mesh.activate():
+                # check backward restores forward mesh
+                for t in grad_generator(foo, [z, y, x]):
+                    with device_mesh.activate():
+                        fetch_shard(t).result()
+
+    def test_mesh_semantics(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            host0 = device_mesh.slice(host=0)
+            host1 = device_mesh.slice(host=1)
+            with host0.activate():
+                x = torch.randn(5)
+                y = x * 5
+            with host1.activate():
+                a = torch.randn(5)
+                b = a * 5
+            x.cos()
+            y.cos()
+            b.cos()
+
+    def test_autograd_multi_mesh(self, backend_type):
+        @grad_function
+        def to_mesh(x: Tensor, mesh: DeviceMesh):
+            omesh = x.mesh
+
+            def backward(grad_x: Tensor):
+                print(grad_x.mesh, omesh)
+                return grad_x.to_mesh(omesh), None
+
+            return x.to_mesh(mesh), backward
+
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            host0 = device_mesh.slice(host=0)
+            host1 = device_mesh.slice(host=1)
+            with host0.activate():
+                x = torch.rand(3, 4, requires_grad=True, device="cuda")
+                y = torch.rand(4, 3, requires_grad=True, device="cuda")
+                t = x @ y
+                t = to_mesh(t, host1)
+            with host1.activate():
+                z = torch.rand(3, requires_grad=True, device="cuda")
+                foo = (t + z).sum()
+
+            for r in grad_generator(foo, [z, y, x]):
+                with r.mesh.activate():
+                    print(fetch_shard(r).result())
+
+    def test_many(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type):
+            x = torch.rand(3, 4)
+            for _ in range(2048):
+                x = x + torch.rand(3, 4)
+            fetch_shard(x).result()
+
+    def test_flattener(self, backend_type):
+        e = (8, 9, {"a": 10, "b": 11})
+        flatten = flattener(e)
+        e2 = (0, 1, {"a": 2, "b": 3})
+        assert [0, 1, 2, 3] == flatten(e2)
+
+    def test_torch_tensor(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type):
+            t = torch.tensor([1, 2, 4])
+            tc = torch.tensor([1, 2, 4], device="cuda")
+            t2 = fetch_shard(t).result()
+            tc2 = fetch_shard(tc).result()
+        assert torch.allclose(t2, torch.tensor([1, 2, 4]))
+        assert torch.allclose(tc2, torch.tensor([1, 2, 4], device="cpu"))
+
+    def test_to_mesh_aliasing(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as mesh:
+            p2p_stream = Stream("p2p_stream")
+
+            ppmesh = mesh.flatten("all").split(
+                all=(
+                    "dp",
+                    "pp",
+                ),
+                pp=2,
+            )
+            pp_meshes = [ppmesh.slice(pp=i) for i in range(2)]
+
+            with ppmesh.activate():
+                with pp_meshes[0].activate():
+                    x = torch.randn((3, 3), device="cuda")
+                    x_borrowed_tensor, x_borrow = p2p_stream.borrow(x)
+                    with p2p_stream.activate():
+                        y_on_mesh_1_p2p_stream = x_borrowed_tensor.to_mesh(pp_meshes[1])
+
+                with pp_meshes[1].activate():
+                    x_borrow.drop()
+                    y_on_mesh_1_default_stream, y_borrow = (
+                        monarch.get_active_stream().borrow(y_on_mesh_1_p2p_stream)
+                    )
+
+                    monarch.inspect(y_on_mesh_1_default_stream)
+                    y_borrow.drop()
+
+    def test_to_mesh_cow(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as mesh:
+            t = torch.zeros((), device="cuda")
+            t2 = t.to_mesh(mesh)
+            t.add_(1)
+            assert monarch.inspect(t2).item() == 0
+            assert monarch.inspect(t).item() == 1
+
+    def test_to_mesh_stream(self, backend_type):
+        other = monarch.Stream("other")
+        with self.local_device_mesh(2, 2, backend_type) as mesh:
+            m0 = mesh.slice(host=0)
+            m1 = mesh.slice(host=1)
+            with m0.activate():
+                t2 = torch.rand(3, 4, device="cuda").to_mesh(m1, stream=other)
+            with m1.activate(), other.activate():
+                # assert doesn't fail
+                monarch.inspect(t2 + t2)
+
+    def test_dropped_trace(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as _:
+            x = torch.rand(4, 4).cuda()
+            s = Stream("other")
+            b, drop = s.borrow(x)
+            drop.drop()
+            with s.activate():
+                pattern = re.compile(
+                    ".*tensor.*is dropped at.*.*drop.drop().*", flags=re.DOTALL
+                )
+                with pytest.raises(TypeError, match=pattern):
+                    _ = b.abs()
+
+    def test_sub_mesh_reduce(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            host1 = device_mesh.slice(host=1)
+            with host1.activate():
+                myrank = (
+                    (device_mesh.rank("host") + 1) * 2 + device_mesh.rank("gpu") + 1
+                )
+                x = torch.ones((3, 4), device="cuda") * myrank
+                reduce = x.reduce("gpu", "sum")
+                local_reduce = fetch_shard(reduce).result()
+
+        assert torch.equal(local_reduce, torch.ones(3, 4) * 11)
+
+    def test_size(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            assert device_mesh.size(["host", "gpu"]) == 4
+
+    def test_random_state(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            monarch.random.make_deterministic()
+            for device in ("cpu", "cuda"):
+                a = monarch.random.get_state()
+                monarch.inspect(a)
+                first = torch.rand(1, device=device)
+                monarch.random.set_state(a)
+                second = torch.rand(1, device=device)
+                f, s = monarch.inspect((first, second))
+                with no_mesh.activate():
+                    assert torch.allclose(f, s, atol=0, rtol=1)
+                seed = device_mesh.rank(["host", "gpu"]) + 4
+                s2 = monarch.random.new_state(seed)
+                s3 = monarch.random.new_state(seed)
+                monarch.random.set_state(s2)
+                r0 = torch.rand(1, device=device)
+                if device == "cuda":
+                    for d in ("host", "gpu"):
+                        r0 = r0.reduce(d, reduction="stack")
+                monarch.random.set_state(s3)
+                r1 = torch.rand(1, device=device)
+                if device == "cuda":
+                    for d in ("host", "gpu"):
+                        r1 = r1.reduce(d, reduction="stack")
+                r2, r3 = monarch.inspect((r0, r1))
+                monarch.random.set_state(a)
+                with no_mesh.activate():
+                    assert torch.allclose(r2, r3, atol=0, rtol=0)
+                    assert not torch.allclose(r2, f, atol=0, rtol=0)
+
+    def test_torch_op_with_optional_tensors(self, backend_type):
+        """
+        This test ensures that for torch ops like LayerNorm, which allow for
+        optional tensor arguments, the controller serializes monarch tensors
+        correctly as Refs instead of as IValues.
+        """
+        with self.local_device_mesh(2, 2, backend_type):
+            x = torch.rand(3, 4, device="cuda")
+            # When bias and elementwise_affine are true, extra tensors are passed through optional
+            # fields inside LayerNorm. When they are false, None is passed to the same optional fields.
+            # If we are handling serialization correctly, there shouldn't be a crash in either case.
+            layer_norm_with_vals = torch.nn.LayerNorm(
+                4, device="cuda", bias=True, elementwise_affine=True
+            )
+            layer_norm_with_none = torch.nn.LayerNorm(
+                4, device="cuda", bias=False, elementwise_affine=False
+            )
+            monarch.inspect(layer_norm_with_vals(x))
+            monarch.inspect(layer_norm_with_none(x))
+
+    def test_reduce_pytree(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            a = device_mesh.rank(("gpu", "host")) + torch.zeros((1,), device="cuda")
+            b = device_mesh.rank(("gpu", "host")) + torch.ones((1,), device="cuda")
+
+            tensor_dict = {"a": a, "b": b}
+            _ = monarch.reduce_(tensor_dict, dims=("gpu", "host"), reduction="sum")
+            reduced_tensor_dict = monarch.reduce(
+                tensor_dict, dims=("gpu", "host"), reduction="sum"
+            )
+            reduced_a = fetch_shard(reduced_tensor_dict["a"]).result()
+            reduced_b = fetch_shard(reduced_tensor_dict["b"]).result()
+            reduced_a_inplace = fetch_shard(tensor_dict["a"]).result()
+            reduced_b_inplace = fetch_shard(tensor_dict["b"]).result()
+
+        assert torch.equal(reduced_a_inplace, torch.tensor([6.0]))
+        assert torch.equal(reduced_b_inplace, torch.tensor([10.0]))
+        assert torch.equal(reduced_a, torch.tensor([24.0]))
+        assert torch.equal(reduced_b, torch.tensor([40.0]))
+
+    def test_to_mesh_pytree(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            host0 = device_mesh.slice(host=0)
+            host1 = device_mesh.slice(host=1)
+
+            with host0.activate():
+                a = torch.zeros((1,), device="cuda")
+                b = torch.ones((1,), device="cuda")
+                tensor_dict = {"a": a, "b": b}
+                moved_tensor_dict = monarch.to_mesh(tensor_dict, host1)
+
+            with host1.activate():
+                moved_tensor_dict["a"].add_(1)
+                moved_tensor_dict["b"].add_(1)
+
+                moved_tensor_a = monarch.inspect(moved_tensor_dict["a"])
+                moved_tensor_b = monarch.inspect(moved_tensor_dict["b"])
+
+            host0.exit()
+            host1.exit()
+
+        assert torch.equal(moved_tensor_a, torch.tensor([1.0]))
+        assert torch.equal(moved_tensor_b, torch.tensor([2.0]))
+
+    def test_hanging_error(self, backend_type):
+        if backend_type != "mesh":
+            pytest.skip("only relevant for mesh backend")
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
+
+            with pytest.raises(Exception, match="The size of tensor"):
+                device_mesh.client.shutdown()
+
+    def test_slice_mesh_pytree(self, backend_type):
+        with self.local_device_mesh(2, 2, backend_type) as device_mesh:
+            a = device_mesh.rank(("host")) + torch.zeros((1,), device="cuda")
+            b = device_mesh.rank(("host")) + torch.ones((1,), device="cuda")
+
+            tensor_dict = {"a": a, "b": b}
+            host0_slices = monarch.slice_mesh(tensor_dict, host=0)
+            host1_slices = monarch.slice_mesh(tensor_dict, host=1)
+
+            host0 = device_mesh.slice(host=0)
+            host1 = device_mesh.slice(host=1)
+
+            host0_tensors = monarch.to_mesh(host0_slices, host0)
+            host1_tensors = monarch.to_mesh(host1_slices, host1)
+
+            with host0.activate():
+                _ = monarch.reduce_(host0_tensors, dims=("gpu"), reduction="sum")
+                host0_a = fetch_shard(host0_tensors["a"]).result()
+                host0_b = fetch_shard(host0_tensors["b"]).result()
+
+            with host1.activate():
+                _ = monarch.reduce_(host1_tensors, dims=("gpu"), reduction="sum")
+                host1_a = fetch_shard(host1_tensors["a"]).result()
+                host1_b = fetch_shard(host1_tensors["b"]).result()
+
+            host0.exit()
+            host1.exit()
+
+        assert torch.equal(host0_a, torch.tensor([0.0]))
+        assert torch.equal(host0_b, torch.tensor([2.0]))
+        assert torch.equal(host1_a, torch.tensor([2.0]))
+        assert torch.equal(host1_b, torch.tensor([4.0]))
+
+
+def test_panicking_worker():
+    with pytest.raises(DeviceException, match="__test_panic called"):
+        with local_rust_device_mesh(1, 1) as _:
+            panic()
+            # induce a sync to allow the panic to propagate back
+            _ = fetch_shard(torch.ones(2, 3)).result()
+
+
+def test_timeout_warning(caplog):
+    timeout = 3
+    with local_rust_device_mesh(
+        1,
+        2,
+        True,
+        controller_params=ControllerParams(1, timeout, 100, False),
+    ) as dm:
+        for _ in range(3):
+            dm.client.new_node([], [])
+
+        assert dm.client.inner.next_message(timeout * 3) is None
+
+        remote_sleep(timeout * 2)
+        for _ in range(3):
+            dm.client.new_node([], [])
+
+        with caplog.at_level(logging.WARNING, logger=dm.client.__module__):
+            has_message = dm.client.handle_next_message(120)
+            assert has_message
+            assert (
+                f"ranks 1, 0 have operations that have not completed after {timeout} seconds"
+                in caplog.text
+            ) or (
+                f"ranks 0, 1 have operations that have not completed after {timeout} seconds"
+                in caplog.text
+            )
+
+
+def test_timeout_failure():
+    timeout = 3
+    with local_rust_device_mesh(
+        1,
+        1,
+        True,
+        controller_params=ControllerParams(1, timeout, 100, True),
+    ) as dm:
+        for _ in range(3):
+            dm.client.new_node([], [])
+
+        assert dm.client.inner.next_message(timeout * 3) is None
+
+        remote_sleep(timeout * 2)
+        for _ in range(3):
+            dm.client.new_node([], [])
+
+        for _ in range(5):
+            result = dm.client.inner.next_message(1)
+            if result is None:
+                continue
+            if isinstance(result, LogMessage):
+                continue
+            if result.error is None:
+                continue
+            assert isinstance(result.error, DeviceException)
+            assert "crashed" in result.error.message
+            assert "mesh_0_worker[0].worker[0]" in result.error.message
+            assert (
+                f"ranks 0 have operations that have not completed after {timeout} seconds"
+                in result.error.frames[0].name
+            )
+
+
+def test_supervision_heartbeat_failure():
+    (dms, bootstrap) = local_meshes_and_bootstraps(
+        meshes=1,
+        hosts_per_mesh=1,
+        gpus_per_host=2,
+        socket_type=SocketType.UNIX,
+        logging_location=LoggingLocation.DEFAULT,
+        supervision_params=SupervisionParams(
+            # Set a low timeout so heartbeat failure can be detected faster.
+            update_timeout_in_sec=10,
+            query_interval_in_sec=1,
+            update_interval_in_sec=1,
+        ),
+    )
+    assert len(dms) == 1
+    dm = dms[0]
+
+    # Kill a process of a worker actor. This should trigger supervision
+    # heartbeat failure event.
+    # Index 0 and 1 are system process and controller process respectively.
+    process = bootstrap.processes[2]
+    process.kill()
+
+    for _ in range(20):
+        # poll the next message in order to get the supervision failure
+        result = dm.client.inner.next_message(3)
+        if result is None:
+            continue
+        if result.error is None:
+            continue
+        assert isinstance(result.error, DeviceException)
+        assert "crashed" in result.error.message
+        return
+
+    dm.exit()
+    raise AssertionError("Should have failed supervision health check")
+
+
+def test_supervision_system_actor_down():
+    (dms, bootstrap) = local_meshes_and_bootstraps(
+        meshes=1,
+        hosts_per_mesh=1,
+        gpus_per_host=2,
+        socket_type=SocketType.UNIX,
+        logging_location=LoggingLocation.DEFAULT,
+        supervision_params=SupervisionParams(
+            # Set a low timeout so heartbeat failure can be detected faster.
+            update_timeout_in_sec=10,
+            query_interval_in_sec=1,
+            update_interval_in_sec=1,
+        ),
+    )
+    assert len(dms) == 1
+    dm = dms[0]
+
+    # Index 0 is system process
+    process = bootstrap.processes[0]
+    process.kill()
+
+    try:
+        for _ in range(20):
+            # poll the next message in order to get the supervision failure
+            dm.client.inner.next_message(3)
+    except RuntimeError as e:
+        assert "actor has been stopped" in str(e)
+        return
+
+    dm.exit()
+    raise AssertionError("Should have failed supervision health check")
+
+
+def test_supervision_controller_actor_down():
+    (dms, bootstrap) = local_meshes_and_bootstraps(
+        meshes=1,
+        hosts_per_mesh=1,
+        gpus_per_host=2,
+        socket_type=SocketType.UNIX,
+        logging_location=LoggingLocation.DEFAULT,
+        supervision_params=SupervisionParams(
+            # Set a low timeout so heartbeat failure can be detected faster.
+            update_timeout_in_sec=10,
+            query_interval_in_sec=1,
+            update_interval_in_sec=1,
+        ),
+    )
+    assert len(dms) == 1
+    dm = dms[0]
+
+    # Index 1 is controller process
+    process = bootstrap.processes[1]
+    process.kill()
+
+    for _ in range(20):
+        # poll the next message in order to get the supervision failure
+        result = dm.client.inner.next_message(3)
+        if result is None:
+            continue
+        if result.error is None:
+            continue
+        assert isinstance(result.error, DeviceException)
+        assert "mesh_0_controller[0].controller[0] crashed" in result.error.message
+        return
+
+    dm.exit()
+    raise AssertionError("Should have failed supervision health check")
+
+
+def a_function_called_by_a_live_function(x):
+    return 2 * x
+
+
+def a_live_function_call_by_a_live_function(x):
+    return 3 * x
+
+
+def test_delete_refs():
+    with local_mesh(
+        hosts=2,
+        gpus_per_host=2,
+        socket_type=SocketType.UNIX,
+        logging_location=LoggingLocation.DEFAULT,
+    ) as dm:
+        dm.client.delete_ref(dm, 1)
+        dm.client.delete_ref(dm, 2)
+        assert len(dm.client._pending_del[dm]) == 2
+        dm.client.flush_deletes()
+        assert len(dm.client._pending_del[dm]) == 0