torchmonarch-nightly 2025.6.4__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +74 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +198 -0
  10. monarch/actor_mesh.py +692 -0
  11. monarch/allocator.py +62 -0
  12. monarch/bootstrap_main.py +75 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +69 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/common/_C.pyi +11 -0
  18. monarch/common/_C.so +0 -0
  19. monarch/common/__init__.py +0 -0
  20. monarch/common/_coalescing.py +308 -0
  21. monarch/common/_device_utils.py +18 -0
  22. monarch/common/_tensor_to_table.py +172 -0
  23. monarch/common/base_tensor.py +28 -0
  24. monarch/common/borrows.py +143 -0
  25. monarch/common/client.py +646 -0
  26. monarch/common/constants.py +10 -0
  27. monarch/common/context_manager.py +40 -0
  28. monarch/common/controller_api.py +104 -0
  29. monarch/common/device_mesh.py +443 -0
  30. monarch/common/fake.py +55 -0
  31. monarch/common/function.py +160 -0
  32. monarch/common/function_caching.py +164 -0
  33. monarch/common/future.py +168 -0
  34. monarch/common/invocation.py +125 -0
  35. monarch/common/mast.py +221 -0
  36. monarch/common/messages.py +572 -0
  37. monarch/common/mock_cuda.py +41 -0
  38. monarch/common/opaque_ref.py +98 -0
  39. monarch/common/pickle_flatten.py +48 -0
  40. monarch/common/pipe.py +152 -0
  41. monarch/common/process_group.py +55 -0
  42. monarch/common/recording.py +127 -0
  43. monarch/common/reference.py +33 -0
  44. monarch/common/remote.py +304 -0
  45. monarch/common/selection.py +9 -0
  46. monarch/common/shape.py +204 -0
  47. monarch/common/stream.py +111 -0
  48. monarch/common/tensor.py +793 -0
  49. monarch/common/tensor_factory.py +31 -0
  50. monarch/common/tree.py +73 -0
  51. monarch/controller/__init__.py +7 -0
  52. monarch/controller/backend.py +223 -0
  53. monarch/controller/controller.py +223 -0
  54. monarch/controller/debugger.py +47 -0
  55. monarch/controller/history.py +90 -0
  56. monarch/controller/rust_backend/__init__.py +7 -0
  57. monarch/controller/rust_backend/controller.py +245 -0
  58. monarch/fetch.py +55 -0
  59. monarch/future.py +25 -0
  60. monarch/gradient/__init__.py +11 -0
  61. monarch/gradient/_gradient_generator.pyi +22 -0
  62. monarch/gradient/_gradient_generator.so +0 -0
  63. monarch/gradient_generator.py +185 -0
  64. monarch/memory.py +43 -0
  65. monarch/monarch_controller +0 -0
  66. monarch/notebook.py +761 -0
  67. monarch/opaque_module.py +235 -0
  68. monarch/opaque_object.py +88 -0
  69. monarch/parallel/__init__.py +9 -0
  70. monarch/parallel/pipelining/__init__.py +7 -0
  71. monarch/parallel/pipelining/runtime.py +847 -0
  72. monarch/parallel/pipelining/schedule_ir.py +692 -0
  73. monarch/parallel/pipelining/scheduler.py +249 -0
  74. monarch/proc_mesh.py +188 -0
  75. monarch/profiler.py +160 -0
  76. monarch/python_local_mesh.py +107 -0
  77. monarch/random.py +61 -0
  78. monarch/rdma.py +190 -0
  79. monarch/remote_class.py +114 -0
  80. monarch/rust_backend_mesh.py +280 -0
  81. monarch/rust_local_mesh.py +1402 -0
  82. monarch/sim_mesh.py +357 -0
  83. monarch/simulator/__init__.py +7 -0
  84. monarch/simulator/command_history.py +424 -0
  85. monarch/simulator/config.py +21 -0
  86. monarch/simulator/interface.py +59 -0
  87. monarch/simulator/ir.py +770 -0
  88. monarch/simulator/mock_controller.py +214 -0
  89. monarch/simulator/profiling.py +424 -0
  90. monarch/simulator/simulator.py +1052 -0
  91. monarch/simulator/task.py +255 -0
  92. monarch/simulator/tensor.py +373 -0
  93. monarch/simulator/trace.py +395 -0
  94. monarch/simulator/utils.py +41 -0
  95. monarch/simulator/worker.py +389 -0
  96. monarch/tensor_worker_main.py +260 -0
  97. monarch/tensorboard.py +84 -0
  98. monarch/timer/__init__.py +21 -0
  99. monarch/timer/example_monarch.py +78 -0
  100. monarch/timer/example_spmd.py +55 -0
  101. monarch/timer/execution_timer.py +199 -0
  102. monarch/timer/execution_timer_test.py +131 -0
  103. monarch/tools/__init__.py +7 -0
  104. monarch/tools/cli.py +167 -0
  105. monarch/tools/commands.py +189 -0
  106. monarch/tools/components/__init__.py +7 -0
  107. monarch/tools/components/hyperactor.py +57 -0
  108. monarch/tools/config/__init__.py +20 -0
  109. monarch/tools/config/defaults.py +54 -0
  110. monarch/tools/mesh_spec.py +121 -0
  111. monarch/worker/__init__.py +7 -0
  112. monarch/worker/_testing_function.py +481 -0
  113. monarch/worker/compiled_block.py +270 -0
  114. monarch/worker/debugger.py +125 -0
  115. monarch/worker/lines.py +47 -0
  116. monarch/worker/monitor.py +53 -0
  117. monarch/worker/worker.py +1191 -0
  118. monarch/world_mesh.py +34 -0
  119. monarch_supervisor/__init__.py +1044 -0
  120. monarch_supervisor/_testing.py +44 -0
  121. monarch_supervisor/function_call.py +30 -0
  122. monarch_supervisor/host.py +386 -0
  123. monarch_supervisor/launchers.py +145 -0
  124. monarch_supervisor/log_pstree.py +48 -0
  125. monarch_supervisor/logging.py +103 -0
  126. monarch_supervisor/python_executable.py +42 -0
  127. tests/__init__.py +0 -0
  128. tests/dispatch_bench.py +124 -0
  129. tests/dispatch_bench_helper.py +25 -0
  130. tests/error_test_binary.py +139 -0
  131. tests/simulator/__init__.py +0 -0
  132. tests/simulator/test_profiling.py +136 -0
  133. tests/simulator/test_simulator.py +411 -0
  134. tests/simulator/test_task.py +64 -0
  135. tests/simulator/test_worker.py +102 -0
  136. tests/sleep_binary.py +35 -0
  137. tests/test_actor_error.py +112 -0
  138. tests/test_alloc.py +25 -0
  139. tests/test_coalescing.py +492 -0
  140. tests/test_controller.py +835 -0
  141. tests/test_device_mesh.py +132 -0
  142. tests/test_fault_tolerance.py +398 -0
  143. tests/test_future.py +94 -0
  144. tests/test_grad_generator.py +121 -0
  145. tests/test_mock_cuda.py +74 -0
  146. tests/test_pdb_actor.py +110 -0
  147. tests/test_python_actors.py +372 -0
  148. tests/test_remote_functions.py +1271 -0
  149. tests/test_rust_backend.py +182 -0
  150. tests/test_signal_safe_block_on.py +103 -0
  151. tests/test_sim_backend.py +54 -0
  152. torchmonarch_nightly-2025.6.4.dist-info/METADATA +94 -0
  153. torchmonarch_nightly-2025.6.4.dist-info/RECORD +157 -0
  154. torchmonarch_nightly-2025.6.4.dist-info/WHEEL +5 -0
  155. torchmonarch_nightly-2025.6.4.dist-info/entry_points.txt +3 -0
  156. torchmonarch_nightly-2025.6.4.dist-info/licenses/LICENSE +29 -0
  157. torchmonarch_nightly-2025.6.4.dist-info/top_level.txt +3 -0
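The file reproduced in full below is tests/test_remote_functions.py (+1271 lines, entry 148 in the list above). It exercises monarch's remote-function pattern: a worker-side function is referenced by its dotted import path and paired with a client-side `propagate` callable that stands in for the real computation (in the tests it simply returns zero tensors of the expected shape). A minimal sketch of that pattern, using the same `remote` and `fetch_shard` APIs the file imports; the worker-side path shown here is hypothetical:

import torch
from monarch import fetch_shard, remote

# Client-side handle for a worker function, referenced by import path.
# The propagate callable never runs the real computation; it only reports
# the shape/dtype the remote call will produce.
scale = remote(
    "my_project.workers.scale_tensor",  # hypothetical worker-side function
    propagate=lambda t, factor: torch.zeros_like(t),
)

# Inside an active device mesh (see local_device_mesh in the tests), calls
# return placeholder tensors; fetch_shard brings one shard back locally:
#     x = torch.rand(3, 4)
#     y = scale(x, 2.0)
#     local_y = fetch_shard(y).result()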
tests/test_remote_functions.py
@@ -0,0 +1,1271 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import itertools
9
+ import math
10
+ import sys
11
+ import traceback
12
+ from enum import Enum
13
+ from typing import Callable, ContextManager, Tuple
14
+ from unittest.mock import patch
15
+
16
+ import monarch
17
+ import pytest
18
+
19
+ import torch
20
+ from monarch import (
21
+ fetch_shard,
22
+ inspect,
23
+ no_mesh,
24
+ OpaqueRef,
25
+ Pipe,
26
+ remote,
27
+ remote_generator,
28
+ RemoteException,
29
+ Stream,
30
+ )
31
+ from monarch._testing import BackendType, TestingContext
32
+ from monarch.builtins.log import log_remote
33
+ from monarch.builtins.random import set_manual_seed_remote
34
+ from monarch.cached_remote_function import remote_autograd_function
35
+ from monarch.common import remote as remote_module
36
+ from monarch.common.device_mesh import DeviceMesh
37
+ from monarch.common.remote import Remote
38
+
39
+ from monarch.opaque_module import OpaqueModule
40
+ from monarch.opaque_object import opaque_method, OpaqueObject
41
+ from monarch.worker._testing_function import (
42
+ all_gather,
43
+ all_gather_into_tensor,
44
+ all_reduce,
45
+ all_to_all,
46
+ all_to_all_single,
47
+ barrier,
48
+ broadcast,
49
+ gather,
50
+ irecv,
51
+ isend,
52
+ reduce,
53
+ reduce_scatter,
54
+ reduce_scatter_tensor,
55
+ scatter,
56
+ )
57
+ from monarch_supervisor.logging import fix_exception_lines
58
+ from torch.distributed import ReduceOp
59
+
60
+
61
+ def custom_excepthook(exc_type, exc_value, exc_traceback):
62
+ tb_lines = fix_exception_lines(
63
+ traceback.format_exception(exc_type, exc_value, exc_traceback)
64
+ )
65
+ print("\n".join(tb_lines), file=sys.stderr)
66
+
67
+
68
+ sys.excepthook = custom_excepthook
69
+
70
+
71
+ def _set_device_udf(*args):
72
+ return torch.zeros(1)
73
+
74
+
75
+ set_device_udf = remote(
76
+ "monarch.worker._testing_function.set_device_udf_worker", propagate=_set_device_udf
77
+ )
78
+
79
+ rlist = remote("builtins.list", propagate=lambda elem: elem)
80
+
81
+
82
+ def _do_bogus_tensor_work(x, y, fail_rank=None):
83
+ return x + y # real function actually does x @ y
84
+
85
+
86
+ do_bogus_tensor_work = remote(
87
+ "monarch.worker._testing_function.do_bogus_tensor_work",
88
+ propagate=_do_bogus_tensor_work,
89
+ )
90
+
91
+
92
+ @remote_generator("monarch.worker._testing_function.example_echo_add")
93
+ def example_echo_add(p: "Pipe"):
94
+ while True:
95
+ yield p.recv() + 1
96
+
97
+
98
+ @remote_generator("monarch.worker._testing_function.example_data_loader")
99
+ def example_data_loader(p: "Pipe", x, y):
100
+ for _i in range(x, y):
101
+ yield torch.zeros(())
102
+
103
+
104
+ @remote_generator(
105
+ "monarch.worker._testing_function.example_data_loader_small_pipe",
106
+ max_messages=1,
107
+ )
108
+ def example_data_loader_small_pipe(p: "Pipe", iters: int, shape: Tuple[int, int]):
109
+ for _i in range(iters):
110
+ yield torch.zeros(shape)
111
+
112
+
113
+ sleep = remote("monarch.worker._testing_function.remote_sleep", propagate="inspect")
114
+
115
+ new_barrier_hackery = remote(
116
+ "monarch.worker._testing_function.new_barrier_hackery",
117
+ propagate=lambda threads: torch.zeros(1),
118
+ )
119
+
120
+ wait_barrier_hackery = remote(
121
+ "monarch.worker._testing_function.wait_barrier_hackery",
122
+ propagate=lambda t: None,
123
+ )
124
+
125
+ setup_state = remote(
126
+ "monarch.worker._testing_function.setup_state_worker",
127
+ propagate=lambda: [OpaqueRef(None) for _ in range(4)],
128
+ )
129
+
130
+ iteration = remote(
131
+ "monarch.worker._testing_function.iteration_worker",
132
+ propagate=lambda model, dataloader, criterion, optimizer, pg: torch.zeros(1),
133
+ )
134
+
135
+ opaque_ref_key_table_length = remote(
136
+ "monarch.worker._testing_function.opaque_ref_key_table_length_worker",
137
+ propagate=lambda: torch.zeros(1),
138
+ )
139
+
140
+ create_opaque_ref = remote(
141
+ "monarch.worker._testing_function.create_opaque_ref_worker",
142
+ propagate=lambda: OpaqueRef(None),
143
+ )
144
+
145
+ outer_remote_function_that_calls_inner = remote(
146
+ "monarch.worker._testing_function.outer_remote_function_that_calls_inner",
147
+ propagate=lambda: torch.zeros(1),
148
+ )
149
+
150
+
151
+ @pytest.fixture(scope="module", autouse=True)
152
+ def testing_context():
153
+ global local
154
+ with TestingContext() as local:
155
+ yield
156
+
157
+
158
+ class RemoteFunctionsTestBase:
159
+ @classmethod
160
+ def local_device_mesh(
161
+ cls,
162
+ num_hosts: int,
163
+ gpu_per_host: int,
164
+ backend_type: BackendType,
165
+ activate: bool = True,
166
+ ) -> ContextManager[DeviceMesh]:
167
+ # pyre-fixme[10]: pytest defines this fixture.
168
+ return local.local_device_mesh(
169
+ num_hosts,
170
+ gpu_per_host,
171
+ activate,
172
+ rust=backend_type == BackendType.RS,
173
+ )
174
+
175
+
176
+ @pytest.mark.skipif(
177
+ torch.cuda.device_count() < 2,
178
+ reason="Not enough GPUs, this test requires at least 2 GPUs",
179
+ )
180
+ # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
181
+ # out is not counted as a failure, so we set a more restrictive timeout to
182
+ # ensure we see a hard failure in CI.
183
+ @pytest.mark.timeout(120)
184
+ @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
185
+ class TestRemoteFunctions(RemoteFunctionsTestBase):
186
+ @classmethod
187
+ def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor):
188
+ n_gpus = 2
189
+ with cls.local_device_mesh(2, n_gpus, backend_type) as device_mesh:
190
+ rank = device_mesh.rank("host") * n_gpus + device_mesh.rank("gpu")
191
+ tensor_in = rank * torch.arange(0, 8, device="cuda", dtype=float).reshape(
192
+ 4, 2
193
+ )
194
+ tensor_out = torch.arange(2, device="cuda", dtype=float)
195
+ pg = device_mesh.process_group(("host", "gpu"))
196
+
197
+ reduce_scatter_tensor(tensor_out, tensor_in, op=reduce_op, group=pg)
198
+
199
+ for host in range(2):
200
+ for gpu in range(n_gpus):
201
+ rank = 2 * host + gpu
202
+ local_tensor_out = inspect(tensor_out, {"host": host, "gpu": gpu})
203
+ with no_mesh.activate():
204
+ assert torch.equal(
205
+ local_tensor_out,
206
+ expected_tensor[rank],
207
+ )
208
+
209
+ @classmethod
210
+ def do_test_reduce_scatter_tensor_subgroup(
211
+ cls,
212
+ backend_type: BackendType,
213
+ reduce_op,
214
+ expected_tensor_host_group: torch.Tensor,
215
+ expected_tensor_gpu_group: torch.Tensor,
216
+ ) -> None:
217
+ n_gpus = 2
218
+ with cls.local_device_mesh(2, n_gpus, backend_type) as device_mesh:
219
+ # Use a group smaller than the world size.
220
+ host_pg = device_mesh.process_group("host")
221
+ gpu_pg = device_mesh.process_group("gpu")
222
+ # host_rank = device_mesh.rank("host")
223
+ # gpu_rank = device_mesh.rank("gpu")
224
+ rank = device_mesh.rank(("host", "gpu"))
225
+
226
+ tensor_in = rank * torch.arange(
227
+ 0, 8, device="cuda", dtype=torch.float32
228
+ ).reshape(4, 2)
229
+
230
+ gpu_tensor_out = torch.zeros(4, device="cuda", dtype=torch.float32)
231
+ reduce_scatter_tensor(gpu_tensor_out, tensor_in, op=reduce_op, group=gpu_pg)
232
+
233
+ tensor_in = rank * torch.arange(
234
+ 0, 8, device="cuda", dtype=torch.float32
235
+ ).reshape(4, 2)
236
+ host_tensor_out = torch.zeros(4, device="cuda", dtype=torch.float32)
237
+ reduce_scatter_tensor(
238
+ host_tensor_out, tensor_in, op=reduce_op, group=host_pg
239
+ )
240
+
241
+ for host in range(2):
242
+ for gpu in range(n_gpus):
243
+ rank = host * 2 + gpu
244
+ local_gpu_tensor_out = inspect(
245
+ gpu_tensor_out, {"host": host, "gpu": gpu}
246
+ )
247
+ local_host_tensor_out = inspect(
248
+ host_tensor_out, {"host": host, "gpu": gpu}
249
+ )
250
+ with no_mesh.activate():
251
+ assert torch.equal(
252
+ local_host_tensor_out,
253
+ expected_tensor_host_group[rank],
254
+ ), f"{rank=}, {host=}, {gpu=}"
255
+ assert torch.equal(
256
+ local_gpu_tensor_out,
257
+ expected_tensor_gpu_group[rank],
258
+ ), f"{rank=}, {host=}, {gpu=}"
259
+
260
+ @classmethod
261
+ def do_test_reduce_scatter(
262
+ cls,
263
+ backend_type: BackendType,
264
+ reduce_op: ReduceOp,
265
+ expected_tensor: torch.Tensor,
266
+ ) -> None:
267
+ n_gpus = 2
268
+ with cls.local_device_mesh(2, n_gpus, backend_type) as device_mesh:
269
+ rank = device_mesh.rank("host") * n_gpus + device_mesh.rank("gpu")
270
+ tensor_in = rank * torch.arange(0, 8, device="cuda", dtype=torch.float32)
271
+ tensor_out = torch.arange(2, device="cuda", dtype=torch.float32)
272
+ pg = device_mesh.process_group(("host", "gpu"))
273
+
274
+ tensor_out = reduce_scatter(
275
+ tensor_out,
276
+ list(torch.chunk(tensor_in, 2 * n_gpus)),
277
+ op=reduce_op,
278
+ group=pg,
279
+ )
280
+
281
+ for host in range(2):
282
+ for gpu in range(n_gpus):
283
+ rank = 2 * host + gpu
284
+ local_tensor_out = inspect(tensor_out, {"host": host, "gpu": gpu})
285
+ with no_mesh.activate():
286
+ assert torch.equal(
287
+ local_tensor_out,
288
+ expected_tensor[rank],
289
+ )
290
+
291
+ @classmethod
292
+ def do_test_all_reduce(cls, backend_type, reduce_op, expected_tensor):
293
+ n_gpus = 2
294
+ with cls.local_device_mesh(2, n_gpus, backend_type) as device_mesh:
295
+ rank = device_mesh.rank(("host", "gpu"))
296
+ tensor_in = rank * torch.arange(0, 8, device="cuda", dtype=float).reshape(
297
+ 4, 2
298
+ )
299
+ pg = device_mesh.process_group(("host", "gpu"))
300
+
301
+ tensor_out = all_reduce(tensor_in, op=reduce_op, group=pg)
302
+
303
+ for host in range(2):
304
+ for gpu in range(n_gpus):
305
+ local_tensor_out = inspect(tensor_out, {"host": host, "gpu": gpu})
306
+ with no_mesh.activate():
307
+ assert torch.equal(
308
+ local_tensor_out,
309
+ expected_tensor,
310
+ )
311
+
312
+ def test_hello(self, backend_type):
313
+ with self.local_device_mesh(2, 2, backend_type):
314
+ log_remote("hello, world")
315
+
316
+ def test_eager_remote_function_failed(self, backend_type):
317
+ if backend_type == BackendType.PY:
318
+ pytest.skip("Python support not planned for this test")
319
+ with self.local_device_mesh(1, 2, backend_type) as _:
320
+ x = torch.rand(3, 4)
321
+ y = torch.rand(3, 4)
322
+ z = do_bogus_tensor_work(x, y, fail_rank=1)
323
+ a = z + x
324
+ with pytest.raises(RemoteException, match="do_bogus_tensor_work"):
325
+ # NCCL init is slow, and fails on internal RE!
326
+ _ = fetch_shard(a).result(timeout=40)
327
+
328
+ def test_set_device_inside_udf_fails_with_explanation(self, backend_type):
329
+ if backend_type == BackendType.PY:
330
+ pytest.skip("Python support not planned for this test")
331
+ with self.local_device_mesh(2, 2, backend_type):
332
+ t = set_device_udf(2)
333
+ try:
334
+ inspect(t)
335
+ except RemoteException as e:
336
+ backtrace = "\n".join([frame.name for frame in e.worker_frames])
337
+ assert "are available to monarch worker" in backtrace
338
+
339
+ def test_simple_tensors(self, backend_type):
340
+ with self.local_device_mesh(2, 2, backend_type):
341
+ x = torch.rand(3, 4)
342
+ y = x + x
343
+ log_remote("%s %s", x, y)
344
+ z = torch.std_mean(x)
345
+ log_remote("%s", z)
346
+
347
+ def test_user_call(self, backend_type):
348
+ with self.local_device_mesh(2, 2, backend_type) as _:
349
+ x = torch.rand(3, 4)
350
+ y = rlist((x + 1, x))
351
+ log_remote("%s", y)
352
+
353
+ # resume monday:
354
+ # 1. tensor ctor resource guard (done)
355
+ # 2. __torch_dispatch__ forward of normal ops (done)
356
+ # 3. collectives created for device mesh
357
+ # 4. implement comms APIs
358
+ # 5. transfer tensor back, and simple future to wait for result.
359
+
360
+ def test_remote_function_with_comms_full_mesh(self, backend_type):
361
+ nGPUs = 2
362
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
363
+ pg = device_mesh.process_group(("host", "gpu"))
364
+ myrank = (
365
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
366
+ )
367
+ x = torch.ones((3, 4), device="cuda") * myrank
368
+
369
+ reduce = all_reduce(x, group=pg)
370
+ local_reduce = fetch_shard(reduce).result()
371
+ assert torch.equal(local_reduce, torch.ones(3, 4) * 18)
372
+
373
+ def test_remote_function_with_comms_by_dimension(self, backend_type):
374
+ nGPUs = 2
375
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
376
+ pg = device_mesh.process_group(("gpu",))
377
+ myrank = (
378
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
379
+ )
380
+ x = torch.ones((3, 4), device="cuda") * myrank
381
+ reduce = all_reduce(x, group=pg)
382
+ local_reduce_host_0 = fetch_shard(reduce).result()
383
+ local_reduce_host_1 = fetch_shard(reduce, {"gpu": 1, "host": 1}).result()
384
+ assert torch.equal(local_reduce_host_0, torch.ones(3, 4) * 7)
385
+ assert torch.equal(local_reduce_host_1, torch.ones(3, 4) * 11)
386
+
387
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
388
+ pg = device_mesh.process_group(("host",))
389
+ myrank = (
390
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
391
+ )
392
+ x = torch.ones((3, 4), device="cuda") * myrank
393
+ reduce = all_reduce(x, group=pg)
394
+ local_reduce_gpu_0 = fetch_shard(reduce).result()
395
+ local_reduce_gpu_2 = fetch_shard(reduce, {"gpu": 1, "host": 0}).result()
396
+ assert torch.equal(local_reduce_gpu_0, torch.ones(3, 4) * 8)
397
+
398
+ assert torch.equal(local_reduce_gpu_2, torch.ones(3, 4) * 10)
399
+
400
+ def test_remote_function_with_comms_sub_mesh(self, backend_type):
401
+ nGPUs = 2
402
+ with self.local_device_mesh(
403
+ 2, nGPUs, backend_type, activate=False
404
+ ) as device_mesh:
405
+ host1 = device_mesh(host=1)
406
+ with host1.activate():
407
+ pg = device_mesh.process_group(("gpu",))
408
+ myrank = (
409
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
410
+ )
411
+ x = torch.ones((3, 4), device="cuda") * myrank
412
+ reduce = all_reduce(x, group=pg)
413
+ local_reduce = fetch_shard(reduce).result()
414
+
415
+ assert torch.equal(local_reduce, torch.ones(3, 4) * 11)
416
+
417
+ host0 = device_mesh(host=0)
418
+ with host0.activate():
419
+ pg = device_mesh.process_group(("gpu",))
420
+ myrank = (
421
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
422
+ )
423
+ x = torch.ones((3, 4), device="cuda") * myrank
424
+ reduce = all_reduce(x, group=pg)
425
+ local_reduce = fetch_shard(reduce).result()
426
+
427
+ assert torch.equal(local_reduce, torch.ones(3, 4) * 7)
428
+
429
+ def test_remote_exception(self, backend_type):
430
+ with self.local_device_mesh(2, 2, backend_type) as _:
431
+ x = torch.rand(3, 4)
432
+ y = torch.rand(3, 4)
433
+ z = do_bogus_tensor_work(x, y)
434
+ a = z + x
435
+ b = x + y
436
+ with pytest.raises(RemoteException, match="do_bogus_tensor_work"):
437
+ # NCCL init is slow, and fails on internal RE!
438
+ _ = fetch_shard(a).result(timeout=20)
439
+ # but values not dependent on z are fine
440
+ fetch_shard(b).result(timeout=10)
441
+
442
+ def test_remote_function_barrier(self, backend_type):
443
+ if backend_type == BackendType.PY:
444
+ pytest.skip("FIXME: Python support for this function")
445
+ nGPUs = 2
446
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
447
+ pg = device_mesh.process_group(("host", "gpu"))
448
+ finished = barrier(group=pg)
449
+ local = fetch_shard(finished).result()
450
+ assert local.item() == 1.0
451
+
452
+ def test_remote_function_all_gather(self, backend_type: BackendType) -> None:
453
+ nGPUs = 2
454
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
455
+ myrank = (
456
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
457
+ )
458
+ # Don't start at zero to ensure there are no leftover zeros.
459
+ tensor_in = torch.arange(1, 3, device="cuda") * myrank
460
+ world_size = 2 * nGPUs
461
+ tensor_out = list(
462
+ torch.zeros(2 * world_size, dtype=torch.int64, device="cuda").chunk(
463
+ world_size
464
+ )
465
+ )
466
+ pg = device_mesh.process_group(("host", "gpu"))
467
+
468
+ tensor_out = all_gather(tensor_out, tensor_in, group=pg)
469
+ local_tensor_out = inspect(tensor_out)
470
+
471
+ t0, t1, t2, t3 = local_tensor_out
472
+ assert torch.equal(t0, torch.tensor([3, 6]))
473
+ assert torch.equal(t1, torch.tensor([4, 8]))
474
+ assert torch.equal(t2, torch.tensor([5, 10]))
475
+ assert torch.equal(t3, torch.tensor([6, 12]))
476
+
477
+ def test_remote_function_all_gather_into_tensor(self, backend_type):
478
+ nGPUs = 2
479
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
480
+ myrank = (
481
+ (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank("gpu") + 1
482
+ )
483
+ # Don't start at zero to ensure there are no leftover zeros.
484
+ tensor_in = torch.arange(1, 3, device="cuda") * myrank
485
+ tensor_out = torch.zeros(2 * nGPUs * 2, dtype=torch.int64, device="cuda")
486
+ pg = device_mesh.process_group(("host", "gpu"))
487
+
488
+ finished = all_gather_into_tensor(tensor_out, tensor_in, group=pg)
489
+ local_finished = inspect(finished)
490
+ local_tensor_out = inspect(tensor_out)
491
+
492
+ assert local_finished.item() == 1.0
493
+ assert torch.equal(local_tensor_out, torch.tensor([3, 6, 4, 8, 5, 10, 6, 12]))
494
+
495
+ def test_remote_function_isend(self, backend_type):
496
+ nGPUs = 2
497
+ with self.local_device_mesh(2, nGPUs, backend_type) as device_mesh:
498
+ pg = device_mesh.process_group(("host",))
499
+ host_0_mesh = device_mesh(host=0)
500
+ host_1_mesh = device_mesh(host=1)
501
+ with host_0_mesh.activate():
502
+ to_rank = (device_mesh.rank("host") + 1) * nGPUs + device_mesh.rank(
503
+ "gpu"
504
+ )
505
+ t0 = torch.ones(1, device="cuda")
506
+ finished0 = isend(t0, to_rank, group=pg)
507
+ with host_1_mesh.activate():
508
+ from_rank = (device_mesh.rank("host") - 1) * nGPUs + device_mesh.rank(
509
+ "gpu"
510
+ )
511
+ t1 = torch.zeros(1, device="cuda")
512
+ finished1 = irecv(t1, from_rank, group=pg)
513
+
514
+ with host_0_mesh.activate():
515
+ local_finished_0 = inspect(finished0)
516
+ with host_1_mesh.activate():
517
+ local_finished_1 = inspect(finished1)
518
+ assert local_finished_0.item() == 1.0
519
+ assert local_finished_1.item() == 1.0
520
+
521
+ def test_distributed_error(self, backend_type):
522
+ with self.local_device_mesh(2, 2, backend_type) as _:
523
+ x = torch.rand(3, 4).cuda()
524
+ y = torch.rand(3, 4).cuda()
525
+ # z is broken on rank 1 but not others
526
+ z = do_bogus_tensor_work(x, y, fail_rank=1)
527
+ # test that rank 1 is still doing work despite z failing
528
+ a = (x + y).reduce("gpu")
529
+ fetch_shard(a).result()
530
+ # but z itself should fail, even if we do not fetch it from rank 1
531
+ # (since fetch shard says we first want to assert the whole tensor is correct)
532
+ with pytest.raises(RemoteException, match="do_bogus_tensor_work"):
533
+ fetch_shard(z).result()
534
+ # try to reduce z, which should fail, but ranks that are not 1 do not
535
+ # know about the failure. Rank 1 should still participate in the reduce
536
+ # to unblock work.
537
+ rz = z.reduce("gpu")
538
+ # but we should see the error message still retrieving it because it is
539
+ # dependent on an error.
540
+ with pytest.raises(RemoteException, match="do_bogus_tensor_work"):
541
+ fetch_shard(rz).result()
542
+ # however, we should still be able to compute and get a result back
543
+ # from host 1, signaling that the reduction didn't get cuda compute stuck.
544
+ fetch_shard(2 * x, gpu=1, host=0).result()
545
+
546
+ def test_pipe(self, backend_type):
547
+ with self.local_device_mesh(2, 2, backend_type):
548
+ p = example_echo_add()
549
+ for _i in range(10):
550
+ x = torch.rand(3, 4)
551
+ p.send(x)
552
+ y = p.recv()
553
+ x, y = fetch_shard((x, y)).result()
554
+ with no_mesh.activate():
555
+ assert torch.allclose(x + 1, y)
556
+
557
+ def test_loader(self, backend_type):
558
+ with self.local_device_mesh(2, 2, backend_type):
559
+ p = example_data_loader(3, 7)
560
+ for i in range(3, 7):
561
+ x = fetch_shard(p.recv()).result()
562
+ with no_mesh.activate():
563
+ assert x.item() == i
564
+
565
+ def test_loader_blocks_with_small_pipe(self, backend_type):
566
+ with self.local_device_mesh(2, 2, backend_type):
567
+ iters = 10
568
+ p = example_data_loader_small_pipe(iters, (1000, 1000))
569
+ # timeout should proc on pipe process
570
+ sleep(0.6)
571
+ # it takes a few iters of reasonably sized tensors to fill up OS buffer
572
+ # max_messages (SNDHWM) only affects the zmq buffer
573
+ for _ in range(iters - 1):
574
+ p.recv()
575
+ t = fetch_shard(p.recv()).result()
576
+ assert t[0][0].item() == -1.0
577
+
578
+ def test_streams_run_parallel(self, backend_type):
579
+ with self.local_device_mesh(2, 2, backend_type):
580
+ # test that these two streams do in fact run in parallel
581
+ # on the worker by having each stream wait on a barrier.
582
+ # The Tensor t is just used as a data-dependency so that
583
+ # we can make sure new_barrier_hackery is called before
584
+ # the wait on 'other'.
585
+ other = Stream("other")
586
+ t = new_barrier_hackery(2)
587
+ t_other, borrow = other.borrow(t)
588
+ with borrow:
589
+ with other.activate():
590
+ wait_barrier_hackery(t_other)
591
+ wait_barrier_hackery(t)
592
+ fetch_shard(t).result()
593
+
594
+ def test_debug(self, backend_type):
595
+ gonna_pdb = remote(
596
+ "monarch.worker._testing_function.gonna_pdb", propagate="inspect"
597
+ )
598
+
599
+ with self.local_device_mesh(2, 2, backend_type):
600
+ writes = []
601
+
602
+ def dw(s):
603
+ writes.append(s)
604
+
605
+ def dr(n):
606
+ buffer = "".join(["print(x)\n", "c\n"]).encode()
607
+ assert len(buffer) <= n
608
+ return buffer
609
+
610
+ if backend_type == BackendType.RS:
611
+ patch_read = patch(
612
+ "monarch.controller.rust_backend.controller.debugger_read", new=dr
613
+ )
614
+ patch_write = patch(
615
+ "monarch.controller.rust_backend.controller.debugger_write", new=dw
616
+ )
617
+ else:
618
+ patch_read = patch("monarch.controller.debugger.read", new=dr)
619
+ patch_write = patch("monarch.controller.debugger.write", new=dw)
620
+ with patch_read, patch_write:
621
+ gonna_pdb()
622
+ # xxx: we do not process messages from workers
623
+ # unless fetching a result
624
+ fetch_shard(None).result()
625
+ assert "".join(writes).count("7\n") == 4
626
+
627
+ def test_fetch_preprocess(self, backend_type):
628
+ with self.local_device_mesh(2, 2, backend_type):
629
+ assert (
630
+ "an argument processed"
631
+ == remote("monarch.worker._testing_function.do_some_processing")
632
+ .call_on_shard_and_fetch(
633
+ "an argument",
634
+ )
635
+ .result()
636
+ )
637
+
638
+ def test_cached_remote_function(self, backend_type):
639
+ fn = remote("monarch.worker._testing_function.how_many_of_these_do_you_want")
640
+ start_hits = remote_module._hit
641
+ with self.local_device_mesh(2, 2, backend_type):
642
+ x = torch.ones(3, 4)
643
+ y = torch.rand(3, 4)
644
+
645
+ a, _, _ = fn(3, x)
646
+ b, _, _ = fn(3, x)
647
+ assert len(a._aliases.aliases) == 1
648
+ assert len(b._aliases.aliases) == 1
649
+ _, _, _ = fn(3, y)
650
+ t0, t1 = fn(2, x)
651
+ t0.add(t1)
652
+ local_a = fetch_shard(a).result()
653
+ with no_mesh.activate():
654
+ assert torch.all(local_a == 1.0)
655
+
656
+ end_hits = remote_module._hit
657
+ assert end_hits - start_hits == 2
658
+
659
+ def test_remote_autograd_function(self, backend_type):
660
+ from monarch.worker import _testing_function
661
+
662
+ remote_fn = remote_autograd_function(
663
+ _testing_function.TestRemoteAutogradFunction
664
+ )
665
+
666
+ with self.local_device_mesh(1, 1, backend_type):
667
+ x = torch.ones(1, requires_grad=True)
668
+ y = torch.ones_like(x).requires_grad_(True)
669
+ outs = remote_fn.apply(x, y)
670
+ assert outs[3] == 4
671
+ local_0 = fetch_shard(outs[0]).result()
672
+ local_1 = fetch_shard(outs[1]).result()
673
+ (outs[0] + outs[1]).sum().backward()
674
+ # unfortunately, grad_fn of local tensor is always None
675
+ # regardless of whether we set `no_grad` on the worker
676
+ # so we can test only requires_grad
677
+ for ll in (local_0, local_1):
678
+ assert not ll.requires_grad
679
+ grad_local_0 = fetch_shard(x.grad).result()
680
+ grad_local_1 = fetch_shard(x.grad).result()
681
+ x = x.detach()
682
+ x.grad = None
683
+ y.grad = None
684
+ outs = remote_fn.apply(x, y)
685
+ local_0_f = fetch_shard(outs[0]).result()
686
+ (outs[0] + outs[1]).sum().backward()
687
+ assert x.grad is None
688
+ grad_local_1_f = fetch_shard(y.grad).result()
689
+
690
+ assert torch.equal(local_0_f, torch.full_like(local_0_f, 2))
691
+ assert torch.equal(local_0, torch.ones_like(local_0))
692
+ assert torch.equal(grad_local_0, torch.ones_like(local_0))
693
+ assert torch.equal(grad_local_1, torch.ones_like(local_0))
694
+ assert torch.equal(grad_local_1_f, torch.ones_like(local_0))
695
+
696
+ def test_cached_remote_aliases(self, backend_type):
697
+ fn = remote("monarch.worker._testing_function.remote_chunk")
698
+ with self.local_device_mesh(1, 1, backend_type):
699
+ x = torch.randn(16, 5, device="cuda")
700
+ outs = fn(x)
701
+ aliases = outs[0]._aliases.aliases
702
+ # x and 4 results of x.chunk(4)
703
+ assert len(aliases) == 5
704
+ assert outs[2]._fake.storage_offset() == 40
705
+
706
+ def test_live_function(self, backend_type):
707
+ def bar(x, y):
708
+ return (
709
+ a_function_called_by_a_live_function(x)
710
+ + a_live_function_call_by_a_live_function(y)
711
+ + math.pi
712
+ )
713
+
714
+ @remote
715
+ def check(x):
716
+ return torch.allclose(x, torch.zeros(()) + math.pi + 5)
717
+
718
+ y = 7
719
+
720
+ @monarch.remote
721
+ def close():
722
+ return y
723
+
724
+ @monarch.remote
725
+ def cuda_works(x):
726
+ return x.cuda()
727
+
728
+ with self.local_device_mesh(2, 2, backend_type):
729
+ a = torch.ones(())
730
+ assert check.call_on_shard_and_fetch(bar(a, a)).result()
731
+ # ensure we do not attempt to pickle closures
732
+ close()
733
+
734
+ b = cuda_works(a)
735
+ fetch_shard(b).result()
736
+
737
+ @monarch.remote
738
+ def something_else():
739
+ raise Exception("No") # this line appears
740
+
741
+ # check that the stack trace has correct line numbers
742
+ with pytest.raises(Exception, match=r"this line appears"):
743
+ something_else()
744
+
745
+ def test_setting_random_seed(self, backend_type):
746
+ with self.local_device_mesh(2, 2, backend_type):
747
+ set_manual_seed_remote(12345)
748
+ t = torch.randn(3, 4)
749
+ t_d = torch.randn(3, 4, device="cuda")
750
+ ref = fetch_shard(t).result()
751
+ ref_d = fetch_shard(t_d).result()
752
+ vals = {
753
+ (h, d): fetch_shard(t, {"host": h, "gpu": d}).result()
754
+ for h, d in itertools.product(range(2), repeat=2)
755
+ }
756
+
757
+ vals_d = {
758
+ (h, d): fetch_shard(t_d, {"host": h, "gpu": d}).result()
759
+ for h, d in itertools.product(range(2), repeat=2)
760
+ }
761
+
762
+ for v, v_d in zip(vals.values(), vals_d.values()):
763
+ assert torch.equal(v, ref)
764
+ assert torch.equal(v_d, ref_d)
765
+
766
+ def test_return_exception(self, backend_type):
767
+ @monarch.remote
768
+ def simple():
769
+ return Exception("is a valid value to return")
770
+
771
+ with self.local_device_mesh(1, 1, backend_type):
772
+ # This should be a valid return than an exception to raise
773
+ simple.call_on_shard_and_fetch().result()
774
+
775
+ def test_opaque_object(self, backend_type):
776
+ with self.local_device_mesh(2, 2, backend_type):
777
+
778
+ class Foo(OpaqueObject):
779
+ @opaque_method
780
+ def add(self, x: torch.Tensor):
781
+ return x + x
782
+
783
+ f = Foo("monarch.worker._testing_function.WorkerFoo", 4.0)
784
+
785
+ result = monarch.inspect(f.add(torch.ones(3, 4)))
786
+ with monarch.no_mesh.activate():
787
+ assert torch.allclose(torch.full((3, 4), 5.0), result)
788
+
789
+ f.hi = 4
790
+ assert f.hi == 4
791
+
792
+ def test_opaqueRef_setup_state_and_iteration(self, backend_type):
793
+ with self.local_device_mesh(1, 2, backend_type) as mesh:
794
+ pg = mesh.process_group(("gpu",))
795
+ model, dataloader, criterion, optimizer = setup_state()
796
+ num_epochs = 5
797
+ for _ in range(num_epochs):
798
+ loss = iteration(model, dataloader, criterion, optimizer, pg)
799
+ assert inspect(loss).item() > 0
800
+
801
+ def test_opaqueRef_key_deleted(self, backend_type):
802
+ with self.local_device_mesh(1, 1, backend_type):
803
+ ref = create_opaque_ref()
804
+ assert inspect(opaque_ref_key_table_length()).item() == 1
805
+ del ref
806
+ assert inspect(opaque_ref_key_table_length()).item() == 0
807
+
808
+ def test_opaque_module(self, backend_type):
809
+ with self.local_device_mesh(2, 2, backend_type):
810
+ linear = OpaqueModule("torch.nn.Linear", 3, 3, device="cuda")
811
+ with torch.no_grad():
812
+ for p in linear.parameters():
813
+ p.zero_()
814
+ input_ = torch.rand(4, 3, device="cuda")
815
+ # we should have been able to clear the parameters and have that result
816
+ # affect how the linear works.
817
+ output = linear.call_method("forward", lambda self, x: x.clone(), input_)
818
+ assert monarch.inspect(output.sum()).item() == 0
819
+
820
+ def test_opaque_module_autograd(self, backend_type):
821
+ with self.local_device_mesh(2, 2, backend_type):
822
+ input_ = torch.rand(3, 3, device="cuda", requires_grad=True)
823
+
824
+ linear = OpaqueModule("torch.nn.Linear", 3, 3, device="cuda")
825
+ output = linear(input_, propagator=lambda self, x: x.clone())
826
+ r = output.sum()
827
+ with torch.no_grad():
828
+ r.backward()
829
+
830
+ weight, bias = linear.parameters()
831
+ ig0, wg0, bg0 = monarch.inspect((input_.grad, weight.grad, bias.grad))
832
+
833
+ input_.grad = None
834
+ weight.grad = None
835
+ bias.grad = None
836
+
837
+ (input_ @ weight.T + bias).sum().backward()
838
+
839
+ ig1, wg1, bg1 = monarch.inspect((input_.grad, weight.grad, bias.grad))
840
+
841
+ with monarch.no_mesh.activate():
842
+ assert torch.allclose(ig0, ig1)
843
+ assert torch.allclose(wg0, wg1)
844
+ assert torch.allclose(bg0, bg1)
845
+
846
+ def test_remote_function_reduce_scatter_tensor_sum(self, backend_type):
847
+ self.do_test_reduce_scatter_tensor(
848
+ backend_type,
849
+ torch.distributed.ReduceOp.SUM,
850
+ (
851
+ torch.arange(0, 8, dtype=float).reshape(1, 4, 2).repeat(4, 1, 1)
852
+ * torch.arange(4, dtype=float).unsqueeze(-1).unsqueeze(-1)
853
+ ).sum(0),
854
+ )
855
+
856
+ def test_remote_function_reduce_scatter_tensor_subgroup_sum(
857
+ self, backend_type: BackendType
858
+ ) -> None:
859
+ self.do_test_reduce_scatter_tensor_subgroup(
860
+ backend_type,
861
+ torch.distributed.ReduceOp.SUM,
862
+ expected_tensor_host_group=torch.tensor(
863
+ [[0, 2, 4, 6], [0, 4, 8, 12], [8, 10, 12, 14], [16, 20, 24, 28]],
864
+ dtype=torch.float32,
865
+ ),
866
+ expected_tensor_gpu_group=torch.tensor(
867
+ [[0, 1, 2, 3], [4, 5, 6, 7], [0, 5, 10, 15], [20, 25, 30, 35]],
868
+ dtype=torch.float32,
869
+ ),
870
+ )
871
+
872
+ def test_remote_function_reduce_scatter_tensor_avg(self, backend_type):
873
+ self.do_test_reduce_scatter_tensor(
874
+ backend_type,
875
+ torch.distributed.ReduceOp.AVG,
876
+ (
877
+ torch.arange(0, 8, dtype=float).reshape(1, 4, 2).repeat(4, 1, 1)
878
+ * torch.arange(4, dtype=float).unsqueeze(-1).unsqueeze(-1)
879
+ ).mean(0),
880
+ )
881
+
882
+ def test_remote_function_reduce_scatter_sum(
883
+ self, backend_type: BackendType
884
+ ) -> None:
885
+ self.do_test_reduce_scatter(
886
+ backend_type,
887
+ torch.distributed.ReduceOp.SUM,
888
+ (
889
+ torch.arange(0, 8, dtype=torch.float32).reshape(1, 4, 2).repeat(4, 1, 1)
890
+ * torch.arange(4, dtype=torch.float32).unsqueeze(-1).unsqueeze(-1)
891
+ ).sum(0),
892
+ )
893
+
894
+ def test_remote_function_reduce_scatter_avg(
895
+ self, backend_type: BackendType
896
+ ) -> None:
897
+ self.do_test_reduce_scatter(
898
+ backend_type,
899
+ torch.distributed.ReduceOp.AVG,
900
+ (
901
+ torch.arange(0, 8, dtype=torch.float32).reshape(1, 4, 2).repeat(4, 1, 1)
902
+ * torch.arange(4, dtype=torch.float32).unsqueeze(-1).unsqueeze(-1)
903
+ ).mean(0),
904
+ )
905
+
906
+ def test_remote_function_all_reduce_sum(self, backend_type):
907
+ self.do_test_all_reduce(
908
+ backend_type,
909
+ torch.distributed.ReduceOp.SUM,
910
+ (
911
+ torch.arange(0, 8, dtype=float).reshape(1, 4, 2).repeat(4, 1, 1)
912
+ * torch.arange(4, dtype=float).unsqueeze(-1).unsqueeze(-1)
913
+ ).sum(0),
914
+ )
915
+
916
+ def test_remote_function_all_reduce_avg(self, backend_type):
917
+ self.do_test_all_reduce(
918
+ backend_type,
919
+ torch.distributed.ReduceOp.AVG,
920
+ (
921
+ torch.arange(0, 8, dtype=float).reshape(1, 4, 2).repeat(4, 1, 1)
922
+ * torch.arange(4, dtype=float).unsqueeze(-1).unsqueeze(-1)
923
+ ).mean(0),
924
+ )
925
+
926
+ def test_remote_function_all_reduce_max(self, backend_type):
927
+ self.do_test_all_reduce(
928
+ backend_type,
929
+ torch.distributed.ReduceOp.MAX,
930
+ (
931
+ torch.arange(0, 8, dtype=float).reshape(1, 4, 2).repeat(4, 1, 1)
932
+ * torch.arange(4, dtype=float).unsqueeze(-1).unsqueeze(-1)
933
+ ).max(0)[0],
934
+ )
935
+
936
+ def test_remote_function_all_reduce_min(self, backend_type):
937
+ self.do_test_all_reduce(
938
+ backend_type,
939
+ torch.distributed.ReduceOp.MIN,
940
+ (
941
+ torch.arange(0, 8, dtype=float).reshape(1, 4, 2).repeat(4, 1, 1)
942
+ * torch.arange(4, dtype=float).unsqueeze(-1).unsqueeze(-1)
943
+ ).min(0)[0],
944
+ )
945
+
946
+ def test_remote_function_failure_message_contains_traceback(self, backend_type):
947
+ with self.local_device_mesh(2, 2, backend_type):
948
+ x = outer_remote_function_that_calls_inner()
949
+ try:
950
+ inspect(x)
951
+ except RemoteException as e:
952
+ backtrace = "\n".join([frame.name for frame in e.worker_frames])
953
+ assert "outer_remote_function" in backtrace
954
+ assert "inner_remote_function" in backtrace
955
+
956
+ def test_remote_function_broadcast(self, backend_type):
957
+ with self.local_device_mesh(2, 2, backend_type) as device_mesh:
958
+ pg = device_mesh.process_group(("host", "gpu"))
959
+ for i in range(4):
960
+ rank = 2 * device_mesh.rank("host") + device_mesh.rank("gpu")
961
+ rank = rank.cuda()
962
+ broadcast(rank, src=i, group=pg)
963
+ for host in range(2):
964
+ for gpu in range(2):
965
+ with no_mesh.activate():
966
+ assert inspect(rank, {"host": host, "gpu": gpu}).item() == i
967
+
968
+ def test_remote_function_all_to_all_single(self, backend_type):
969
+ with self.local_device_mesh(2, 2, backend_type) as device_mesh:
970
+ pg = device_mesh.process_group(("host", "gpu"))
971
+ tensor_in = torch.arange(4, device="cuda", dtype=float)
972
+ tensor_out = torch.empty(4, device="cuda", dtype=float)
973
+ all_to_all_single(tensor_out, tensor_in, group=pg)
974
+ for host in range(2):
975
+ for gpu in range(2):
976
+ rank = 2 * host + gpu
977
+ with no_mesh.activate():
978
+ assert torch.equal(
979
+ inspect(tensor_out, {"host": host, "gpu": gpu}),
980
+ rank * torch.ones(4),
981
+ )
982
+
983
+ def test_remote_function_all_to_all(self, backend_type: BackendType) -> None:
984
+ world_size = 2
985
+ n_gpus = 2
986
+ size = world_size * n_gpus
987
+ expected_tensors = [
988
+ torch.tensor([0, 4, 8, 12], dtype=torch.float32),
989
+ torch.tensor([1, 5, 9, 13], dtype=torch.float32),
990
+ torch.tensor([2, 6, 10, 14], dtype=torch.float32),
991
+ torch.tensor([3, 7, 11, 15], dtype=torch.float32),
992
+ ]
993
+
994
+ with self.local_device_mesh(world_size, n_gpus, backend_type) as device_mesh:
995
+ pg = device_mesh.process_group(("host", "gpu"))
996
+ rank = n_gpus * device_mesh.rank("host") + device_mesh.rank("gpu")
997
+ in_tensors = list(
998
+ torch.chunk(
999
+ torch.arange(size, device="cuda", dtype=torch.float32)
1000
+ + (rank * size),
1001
+ size,
1002
+ )
1003
+ )
1004
+ # These values will be replaced, just used for shape.
1005
+ out_tensors = list(torch.zeros(size, device="cuda").chunk(size))
1006
+ out_tensors = all_to_all(out_tensors, in_tensors, group=pg)
1007
+ for host in range(world_size):
1008
+ for gpu in range(n_gpus):
1009
+ local_tensor_out = inspect(out_tensors, {"host": host, "gpu": gpu})
1010
+ rank = host * n_gpus + gpu
1011
+ with no_mesh.activate():
1012
+ # Combine the tensor list together for a better comparison
1013
+ # message.
1014
+ local_tensor_out = torch.cat(local_tensor_out)
1015
+ assert torch.equal(
1016
+ local_tensor_out, expected_tensors[rank]
1017
+ ), f"For {rank=}, {host=}, {gpu=}"
1018
+
1019
+
1020
+ @pytest.mark.skipif(
1021
+ torch.cuda.device_count() < 2,
1022
+ reason="Not enough GPUs, this test requires at least 2 GPUs",
1023
+ )
1024
+ # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
1025
+ # out is not counted as a failure, so we set a more restrictive timeout to
1026
+ # ensure we see a hard failure in CI.
1027
+ @pytest.mark.timeout(120)
1028
+ @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
1029
+ class TestComm(RemoteFunctionsTestBase):
1030
+ N_GPUS: int = 2
1031
+ N_HOSTS: int = 2
1032
+
1033
+ @property
1034
+ def world_size(self) -> int:
1035
+ return self.N_GPUS * self.N_HOSTS
1036
+
1037
+ @property
1038
+ def device(self):
1039
+ self.fail("test subclass didn't override device")
1040
+
1041
+ def _test_tensor_dtype_complex(self, backend_type: BackendType) -> None:
1042
+ with self.local_device_mesh(
1043
+ self.N_HOSTS, self.N_GPUS, backend_type
1044
+ ) as device_mesh:
1045
+ group = device_mesh.process_group(("host", "gpu"))
1046
+ tensor = torch.rand(2, device="cuda")
1047
+ tensor_c = torch.view_as_complex(tensor)
1048
+ tensor_list = [
1049
+ torch.rand(2, device="cuda") for _ in range(self.N_HOSTS * self.N_GPUS)
1050
+ ]
1051
+ tensor_list_c = list(tensor_list)
1052
+ tensor_list_c[1] = torch.view_as_complex(tensor_list_c[1])
1053
+
1054
+ inspect(all_gather(tensor_list, tensor, group=group))
1055
+ inspect(all_gather(tensor_list, tensor_c, group=group))
1056
+ inspect(all_gather(tensor_list_c, tensor, group=group))
1057
+ inspect(all_gather(tensor_list_c, tensor_c, group=group))
1058
+
1059
+ def test_nccl_barrier(self, backend_type: BackendType) -> None:
1060
+ with self.local_device_mesh(
1061
+ self.N_HOSTS, self.N_GPUS, backend_type
1062
+ ) as device_mesh:
1063
+ pg = device_mesh.process_group(("host", "gpu"))
1064
+ rank = device_mesh.rank(("host", "gpu"))
1065
+ t = torch.tensor([1] * 10, device="cuda") + rank
1066
+ all_reduce(t, group=pg)
1067
+
1068
+ for host in range(self.N_HOSTS):
1069
+ for gpu in range(self.N_GPUS):
1070
+ rank = 2 * host + gpu
1071
+ with no_mesh.activate():
1072
+ # all reduce will sum rank + 1 across all ranks.
1073
+ expected_tensor = torch.tensor(
1074
+ [sum(range(1, self.world_size + 1))] * 10
1075
+ )
1076
+ assert torch.equal(
1077
+ expected_tensor,
1078
+ inspect(t, {"host": host, "gpu": gpu}),
1079
+ )
1080
+
1081
+ def test_tensor_dtype_complex(self, backend_type: BackendType) -> None:
1082
+ self._test_tensor_dtype_complex(backend_type)
1083
+
1084
+ def test_reduce_scatter_base_k(self, backend_type: BackendType) -> None:
1085
+ expected_tensor = (
1086
+ torch.arange(self.N_HOSTS * self.N_GPUS * 2, dtype=torch.float32)
1087
+ .reshape(1, self.N_HOSTS * self.N_GPUS, 2)
1088
+ .repeat(self.N_HOSTS * self.N_GPUS, 1, 1)
1089
+ ).sum(0)
1090
+ with self.local_device_mesh(
1091
+ self.N_HOSTS, self.N_GPUS, backend_type
1092
+ ) as device_mesh:
1093
+ pg = device_mesh.process_group(("host", "gpu"))
1094
+ output_tensor = torch.zeros(2, dtype=torch.int64, device="cuda")
1095
+ input_tensors = torch.arange(
1096
+ self.N_HOSTS * self.N_GPUS * 2, dtype=torch.int64, device="cuda"
1097
+ )
1098
+ input_tensors = torch.reshape(
1099
+ input_tensors, (self.N_HOSTS * self.N_GPUS, 2)
1100
+ )
1101
+ # Input is [[0, 1], [2, 3], [4, 5], [6, 7]] across 4 ranks.
1102
+ # After reduce + scatter, output_tensor should be [0 * 4, 1 * 4] on the 0th rank
1103
+ # and [2 * 4, 3 * 4] on the 1st rank, and so on
1104
+ reduce_scatter_tensor(output_tensor, input_tensors, group=pg)
1105
+
1106
+ for host in range(self.N_HOSTS):
1107
+ for gpu in range(self.N_GPUS):
1108
+ rank = 2 * host + gpu
1109
+ output_tensor_local = inspect(
1110
+ output_tensor, {"host": host, "gpu": gpu}
1111
+ )
1112
+ with no_mesh.activate():
1113
+ assert torch.equal(output_tensor_local, expected_tensor[rank])
1114
+
1115
+
1116
+ @pytest.mark.skipif(
1117
+ torch.cuda.device_count() < 2,
1118
+ reason="Not enough GPUs, this test requires at least 2 GPUs",
1119
+ )
1120
+ # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
1121
+ # out is not counted as a failure, so we set a more restrictive timeout to
1122
+ # ensure we see a hard failure in CI.
1123
+ @pytest.mark.timeout(120)
1124
+ @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
1125
+ class TestNcclProcessGroupWithDispatchedCollectives(RemoteFunctionsTestBase):
1126
+ """This test is copied from test_c10d_nccl.py::NcclProcessGroupWithDispatchedCollectivesTests
1127
+ in torch, but modified to setup a Monarch device mesh and use remote functions"""
1128
+
1129
+ N_GPUS: int = 2
1130
+ N_HOSTS: int = 2
1131
+
1132
+ def _call_collective_with_varying_tensors(
1133
+ self,
1134
+ world_size: int,
1135
+ # pyre-fixme[24]: Incorrect ParamsSpec annotation.
1136
+ collective: Remote[..., torch.Tensor],
1137
+ *args,
1138
+ **kwargs,
1139
+ ) -> None:
1140
+ # call collective with varying tensors to ensure that the tensors are
1141
+ # correctly dispatched
1142
+
1143
+ # ensure supported devices (cpu, cuda) succeeds during dispatch call
1144
+ tensor = torch.zeros(2, 2, device=torch.device("cuda"))
1145
+ # multi tensor collectives
1146
+ if collective == barrier:
1147
+ fetch_shard(collective(*args, **kwargs)).result()
1148
+ elif collective == all_gather:
1149
+ output_list = list(
1150
+ torch.zeros(world_size * 2, 2, device=torch.device("cuda")).chunk(
1151
+ world_size
1152
+ )
1153
+ )
1154
+ fetch_shard(collective(output_list, tensor, *args, **kwargs)).result()
1155
+ elif collective == reduce_scatter:
1156
+ fetch_shard(
1157
+ collective(tensor, [tensor] * world_size, *args, **kwargs)
1158
+ ).result()
1159
+ elif collective == gather:
1160
+ gather_list = list(
1161
+ torch.zeros(world_size * 2, 2, device=torch.device("cuda")).chunk(
1162
+ world_size
1163
+ )
1164
+ )
1165
+ fetch_shard(collective(tensor, gather_list, *args, **kwargs)).result()
1166
+ elif collective == scatter:
1167
+ fetch_shard(
1168
+ collective(tensor, [tensor] * world_size, *args, **kwargs)
1169
+ ).result()
1170
+ elif collective == all_to_all:
1171
+ fetch_shard(
1172
+ collective(
1173
+ [tensor] * world_size, [tensor] * world_size, *args, **kwargs
1174
+ )
1175
+ ).result()
1176
+ else:
1177
+ fetch_shard(collective(tensor, *args, **kwargs)).result()
1178
+
1179
+ @pytest.mark.parametrize(
1180
+ "collective",
1181
+ [
1182
+ reduce,
1183
+ broadcast,
1184
+ all_reduce,
1185
+ all_gather,
1186
+ reduce_scatter,
1187
+ barrier,
1188
+ all_to_all,
1189
+ gather,
1190
+ scatter,
1191
+ ],
1192
+ ids=[
1193
+ "reduce",
1194
+ "broadcast",
1195
+ "all_reduce",
1196
+ "all_gather",
1197
+ "reduce_scatter",
1198
+ "barrier",
1199
+ "all_to_all",
1200
+ "gather",
1201
+ "scatter",
1202
+ ],
1203
+ )
1204
+ def test_collectives(
1205
+ self, backend_type: BackendType, collective: Callable[..., torch.Tensor]
1206
+ ) -> None:
1207
+ world_size = self.N_HOSTS * self.N_GPUS
1208
+ with self.local_device_mesh(
1209
+ self.N_HOSTS, self.N_GPUS, backend_type
1210
+ ) as device_mesh:
1211
+ rank = device_mesh.rank(("host", "gpu"))
1212
+ pg = device_mesh.process_group(("host", "gpu"))
1213
+
1214
+ kwargs: dict[str, object] = {"group": pg}
1215
+ if collective == reduce:
1216
+ kwargs["group_dst"] = 0
1217
+ elif collective == broadcast:
1218
+ kwargs["group_src"] = rank
1219
+ elif collective == gather:
1220
+ kwargs["group_dst"] = 0
1221
+ elif collective == scatter:
1222
+ kwargs["group_src"] = 0
1223
+ self._call_collective_with_varying_tensors(world_size, collective, **kwargs)
1224
+
1225
+ def test_all_to_all_single(self, backend_type: BackendType) -> None:
1226
+ with self.local_device_mesh(
1227
+ self.N_HOSTS, self.N_GPUS, backend_type
1228
+ ) as device_mesh:
1229
+ pg = device_mesh.process_group(("host", "gpu"))
1230
+ # test alltoall_base
1231
+ tensor_in = torch.arange(4, device="cuda", dtype=torch.float32)
1232
+ tensor_out = torch.empty(4, device="cuda", dtype=torch.float32)
1233
+ all_to_all_single(tensor_out, tensor_in, group=pg)
1234
+
1235
+ for host in range(self.N_HOSTS):
1236
+ for gpu in range(self.N_GPUS):
1237
+ rank = 2 * host + gpu
1238
+ with no_mesh.activate():
1239
+ assert torch.equal(
1240
+ inspect(tensor_out, {"host": host, "gpu": gpu}),
1241
+ rank * torch.ones(4),
1242
+ )
1243
+
1244
+ def test_allgather_base(self, backend_type: BackendType) -> None:
1245
+ with self.local_device_mesh(
1246
+ self.N_HOSTS, self.N_GPUS, backend_type
1247
+ ) as device_mesh:
1248
+ pg = device_mesh.process_group(("host", "gpu"))
1249
+ rank = (
1250
+ (device_mesh.rank("host") + 1) * self.N_GPUS
1251
+ + device_mesh.rank("gpu")
1252
+ + 1
1253
+ )
1254
+ tensor_in = torch.arange(2, device="cuda") * rank
1255
+ tensor_out = torch.zeros(
1256
+ self.N_HOSTS * self.N_GPUS * 2, dtype=torch.int64, device="cuda"
1257
+ )
1258
+ all_gather_into_tensor(tensor_out, tensor_in, group=pg)
1259
+ local_tensor_out = inspect(tensor_out)
1260
+ with no_mesh.activate():
1261
+ assert torch.equal(
1262
+ local_tensor_out, torch.tensor([0, 3, 0, 4, 0, 5, 0, 6])
1263
+ )
1264
+
1265
+
1266
+ def a_function_called_by_a_live_function(x):
1267
+ return 2 * x
1268
+
1269
+
1270
+ def a_live_function_call_by_a_live_function(x):
1271
+ return 3 * x
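
Taken together, the collective tests above follow one pattern: build a process group from named mesh dimensions, run a wrapped collective, then inspect individual shards by mesh coordinate. A condensed sketch of that flow, assuming the same TestingContext fixture and helper imports used at the top of the file, the tests' 2-host x 2-GPU local mesh (so at least 2 GPUs are needed), and all_reduce's default SUM reduction; the backend flag (rust=) is left at its default here:

import torch
from monarch import inspect, no_mesh
from monarch._testing import TestingContext
from monarch.worker._testing_function import all_reduce

with TestingContext() as local:
    # (num_hosts, gpu_per_host, activate) as in RemoteFunctionsTestBase above.
    with local.local_device_mesh(2, 2, True) as device_mesh:
        pg = device_mesh.process_group(("host", "gpu"))
        rank = device_mesh.rank(("host", "gpu"))
        t = torch.ones(4, device="cuda") + rank   # rank-dependent input: 1..4
        reduced = all_reduce(t, group=pg)         # sums across all 4 ranks
        for host in range(2):
            for gpu in range(2):
                shard = inspect(reduced, {"host": host, "gpu": gpu})
                with no_mesh.activate():
                    # 1 + 2 + 3 + 4 = 10 on every shard after the reduction
                    assert torch.equal(shard, torch.full((4,), 10.0))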