torchmonarch-nightly 2025.6.30__cp310-cp310-manylinux2014_x86_64.whl → 2025.7.25__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. monarch/__init__.py +13 -9
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
  4. monarch/_src/actor/actor_mesh.py +874 -0
  5. monarch/{allocator.py → _src/actor/allocator.py} +26 -17
  6. monarch/_src/actor/bootstrap_main.py +73 -0
  7. monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
  8. monarch/_src/actor/code_sync/auto_reload.py +223 -0
  9. monarch/_src/actor/debugger.py +565 -0
  10. monarch/_src/actor/endpoint.py +270 -0
  11. monarch/_src/actor/event_loop.py +97 -0
  12. monarch/_src/actor/future.py +100 -0
  13. monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
  14. monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
  15. monarch/_src/actor/proc_mesh.py +500 -0
  16. monarch/_src/actor/sync_state.py +18 -0
  17. monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
  18. monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
  19. monarch/_src/actor/tensor_engine_shim.py +56 -0
  20. monarch/_src/tensor_engine/rdma.py +180 -0
  21. monarch/_testing.py +3 -2
  22. monarch/actor/__init__.py +51 -0
  23. monarch/actor_mesh.py +6 -752
  24. monarch/bootstrap_main.py +8 -47
  25. monarch/common/client.py +1 -1
  26. monarch/common/controller_api.py +2 -1
  27. monarch/common/device_mesh.py +12 -2
  28. monarch/common/messages.py +12 -1
  29. monarch/common/recording.py +4 -3
  30. monarch/common/remote.py +135 -52
  31. monarch/common/tensor.py +2 -1
  32. monarch/controller/backend.py +2 -2
  33. monarch/controller/controller.py +2 -1
  34. monarch/controller/rust_backend/controller.py +2 -1
  35. monarch/fetch.py +3 -5
  36. monarch/mesh_controller.py +201 -139
  37. monarch/monarch_controller +0 -0
  38. monarch/opaque_module.py +4 -6
  39. monarch/opaque_object.py +3 -3
  40. monarch/proc_mesh.py +6 -309
  41. monarch/python_local_mesh.py +1 -1
  42. monarch/rust_backend_mesh.py +2 -1
  43. monarch/rust_local_mesh.py +4 -2
  44. monarch/sim_mesh.py +10 -19
  45. monarch/simulator/command_history.py +1 -1
  46. monarch/simulator/interface.py +2 -1
  47. monarch/simulator/mock_controller.py +1 -1
  48. monarch/simulator/simulator.py +1 -1
  49. monarch/tensor_engine/__init__.py +23 -0
  50. monarch/tensor_worker_main.py +3 -1
  51. monarch/tools/cli.py +3 -1
  52. monarch/tools/commands.py +95 -35
  53. monarch/tools/mesh_spec.py +55 -0
  54. monarch/tools/utils.py +38 -0
  55. monarch/worker/worker.py +1 -1
  56. monarch/world_mesh.py +2 -1
  57. monarch_supervisor/python_executable.py +6 -3
  58. tests/error_test_binary.py +75 -9
  59. tests/test_actor_error.py +370 -21
  60. tests/test_alloc.py +1 -1
  61. tests/test_allocator.py +373 -17
  62. tests/test_controller.py +2 -0
  63. tests/test_debugger.py +416 -0
  64. tests/test_env_before_cuda.py +162 -0
  65. tests/test_python_actors.py +184 -332
  66. tests/test_rdma.py +198 -0
  67. tests/test_remote_functions.py +40 -12
  68. tests/test_rust_backend.py +7 -5
  69. tests/test_sim_backend.py +1 -4
  70. tests/test_tensor_engine.py +55 -1
  71. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
  72. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
  73. torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
  74. monarch/_monarch/hyperactor/__init__.py +0 -58
  75. monarch/_monarch/worker/debugger.py +0 -117
  76. monarch/_monarch/worker/logging.py +0 -107
  77. monarch/debugger.py +0 -379
  78. monarch/future.py +0 -76
  79. monarch/rdma.py +0 -162
  80. torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
  81. /monarch/{_monarch/worker → _src}/__init__.py +0 -0
  82. /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
  83. /monarch/{common → _src/actor}/shape.py +0 -0
  84. /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
  85. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
  86. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
  87. {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
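
The bulk of the moves above relocate the actor runtime into monarch/_src/actor/ and expose it through the new monarch/actor package (entry 22), while the old top-level modules (actor_mesh.py, proc_mesh.py, rdma.py, debugger.py, future.py) shrink to thin shims or disappear. A minimal sketch of the new import surface, assuming only the API exercised by the added tests below (the Echo actor and its method are illustrative, not part of the package):

from monarch.actor import Actor, endpoint, proc_mesh


class Echo(Actor):
    # illustrative actor: any subclass of Actor with @endpoint methods works
    @endpoint
    async def echo(self, x):
        return x


async def main():
    procs = await proc_mesh(gpus=1)           # allocate a process mesh
    echo = await procs.spawn("echo", Echo)    # spawn the actor on the mesh
    assert await echo.echo.call_one("hi") == "hi"   # invoke a single instance
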
tests/test_rdma.py ADDED
@@ -0,0 +1,198 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import pytest
+
+ import torch
+ from monarch.actor import Actor, current_rank, endpoint, proc_mesh
+ from monarch.tensor_engine import is_available as rdma_available, RDMABuffer
+
+
+ needs_cuda = pytest.mark.skipif(
+     not torch.cuda.is_available(),
+     reason="CUDA not available",
+ )
+ needs_rdma = pytest.mark.skipif(
+     not rdma_available(),
+     reason="RDMA not available",
+ )
+
+
+ class ParameterServer(Actor):
+     def __init__(self):
+         self.params = torch.rand(10, 10)
+         self.grad_buffer = torch.rand(10, 10)
+
+     @endpoint
+     async def grad_handle(self) -> RDMABuffer:
+         byte_tensor = self.grad_buffer.view(torch.uint8).flatten()
+         buffer = RDMABuffer(byte_tensor)
+         return buffer
+
+     @endpoint
+     async def update(self):
+         self.params += 0.01 * self.grad_buffer
+
+     @endpoint
+     async def get_grad_buffer(self) -> torch.Tensor:
+         # just used for testing
+         return self.grad_buffer
+
+
+ class ParameterClient(Actor):
+     def __init__(self, server, buffer):
+         self.server = server
+         byte_tensor = buffer.view(torch.uint8).flatten()
+         self.buffer = byte_tensor
+
+     @endpoint
+     async def upload(self, tensor):
+         gh = await self.server.grad_handle.call_one()
+         await gh.write_from(tensor)
+
+     @endpoint
+     async def download(self):
+         gh = await self.server.grad_handle.call_one()
+         await gh.read_into(self.buffer)
+
+     @endpoint
+     async def get_buffer(self):
+         return self.buffer
+
+
+ @needs_rdma
+ @needs_cuda
+ async def test_proc_mesh_rdma():
+     proc = await proc_mesh(gpus=1)
+     server = await proc.spawn("server", ParameterServer)
+
+     # --- CPU TESTS ---
+     client_cpu = await proc.spawn(
+         "client_cpu", ParameterClient, server, torch.ones(10, 10)
+     )
+     x = await client_cpu.get_buffer.call_one()
+     assert torch.sum(x.view(torch.float32).view(10, 10)) == 100
+     zeros = torch.zeros(10, 10)
+     await client_cpu.upload.call_one(zeros.view(torch.uint8).flatten())
+     await client_cpu.download.call_one()
+     x = await client_cpu.get_buffer.call_one()
+     assert torch.sum(x.view(torch.float32).view(10, 10)) == 0
+
+     # --- Modify server's backing buffer directly ---
+     await server.update.call_one()
+
+     # Should reflect updated values
+     await client_cpu.download.call_one()
+
+     buffer = await client_cpu.get_buffer.call_one()
+     remote_grad = await server.get_grad_buffer.call_one()
+     assert torch.allclose(buffer.view(torch.float32).view(10, 10), remote_grad)
+
+     # --- GPU TESTS ---
+     client_gpu = await proc.spawn(
+         "client_gpu", ParameterClient, server, torch.ones(10, 10, device="cuda")
+     )
+     x = await client_gpu.get_buffer.call_one()
+     buffer = x.view(torch.float32).view(10, 10)
+     assert torch.sum(buffer) == 100
+     zeros = torch.zeros(10, 10, device="cuda")
+     await client_gpu.upload.call_one(zeros.view(torch.uint8).flatten())
+     await client_gpu.download.call_one()
+     x = await client_gpu.get_buffer.call_one()
+     buffer_gpu = x.view(torch.float32).view(10, 10)
+     assert torch.sum(buffer_gpu) == 0
+     # copying a tensor across hosts moves it to CPU
+     assert buffer_gpu.device.type == "cpu"
+
+     # Modify server state again
+     await server.update.call_one()
+     await client_gpu.download.call_one()
+     x = await client_gpu.get_buffer.call_one()
+     buffer_gpu = x.view(torch.float32).view(10, 10)
+     remote_grad = await server.get_grad_buffer.call_one()
+     assert torch.allclose(buffer_gpu.cpu(), remote_grad)
+
+
+ class TrainerActor(Actor):
+     def __init__(self):
+         super().__init__()
+         # TODO - switch to CUDA once GPU support is added
+         self.trainer = torch.nn.Linear(10, 10).to("cpu")
+         self.trainer.weight.data.zero_()
+
+     @endpoint
+     async def init(self, gen):
+         ranks = current_rank()
+         self.gen = gen.slice(**ranks)
+
+     @endpoint
+     async def exchange_metadata(self):
+         byte_tensor = self.trainer.weight.data.view(torch.uint8).flatten()
+         self.handle = RDMABuffer(byte_tensor)
+         await self.gen.attach_weight_buffer.call(self.handle)
+
+     @endpoint
+     async def weights_ready(self):
+         self.trainer.weight.data.add_(1.0)
+
+
+ class GeneratorActor(Actor):
+     def __init__(self):
+         super().__init__()
+         self.generator = torch.nn.Linear(10, 10).to("cuda")
+         self.step = 0
+
+     @endpoint
+     async def init(self, trainer):
+         ranks = current_rank()
+         self.trainer = trainer.slice(**ranks)
+
+     @endpoint
+     async def attach_weight_buffer(self, handle):
+         self.handle = handle
+
+     @endpoint
+     async def update_weights(self):
+         self.step += 1
+         byte_tensor = self.generator.weight.data.view(torch.uint8).flatten()
+         await self.handle.read_into(byte_tensor)
+         assert (
+             torch.sum(self.generator.weight.data) == self.step * 100
+         ), f"{torch.sum(self.generator.weight.data)=}, {self.step=}"
+
+
+ @needs_rdma
+ @needs_cuda
+ async def test_gpu_trainer_generator():
+     trainer_proc = await proc_mesh(gpus=1)
+     gen_proc = await proc_mesh(gpus=1)
+     trainer = await trainer_proc.spawn("trainer", TrainerActor)
+     generator = await gen_proc.spawn("gen", GeneratorActor)
+
+     await generator.init.call(trainer)
+     await trainer.init.call(generator)
+     await trainer.exchange_metadata.call()
+
+     for _ in range(3):
+         await trainer.weights_ready.call()
+         await generator.update_weights.call()
+
+
+ @needs_rdma
+ @needs_cuda
+ def test_gpu_trainer_generator_sync() -> None:
+     trainer_proc = proc_mesh(gpus=1).get()
+     gen_proc = proc_mesh(gpus=1).get()
+     trainer = trainer_proc.spawn("trainer", TrainerActor).get()
+     generator = gen_proc.spawn("gen", GeneratorActor).get()
+
+     generator.init.call(trainer).get()
+     trainer.init.call(generator).get()
+     trainer.exchange_metadata.call().get()
+
+     for _ in range(1):
+         trainer.weights_ready.call().get()
+         generator.update_weights.call().get()
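
The pattern the new test exercises: the owning actor wraps a flattened uint8 view of its tensor in an RDMABuffer, hands the handle to a peer over an endpoint call, and the peer moves bytes with read_into / write_from. A condensed sketch of that handshake, assuming the same API as the test (variable names are illustrative):

# owner side: register the gradient bytes for RDMA and return the handle
grad_buffer = torch.rand(10, 10)
handle = RDMABuffer(grad_buffer.view(torch.uint8).flatten())

# peer side: given `handle` received from an endpoint call
local = torch.zeros(10, 10).view(torch.uint8).flatten()
await handle.read_into(local)    # pull the owner's current bytes
await handle.write_from(local)   # push bytes back into the owner's buffer
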
tests/test_remote_functions.py CHANGED
@@ -9,7 +9,6 @@ import itertools
  import math
  import sys
  import traceback
- from enum import Enum
  from typing import Callable, ContextManager, Tuple
  from unittest.mock import patch

@@ -25,16 +24,18 @@ from monarch import (
      Pipe,
      remote,
      remote_generator,
-     RemoteException,
+     RemoteException as OldRemoteException,
      Stream,
  )
+
  from monarch._testing import BackendType, TestingContext
  from monarch.builtins.log import log_remote
  from monarch.builtins.random import set_manual_seed_remote
  from monarch.cached_remote_function import remote_autograd_function
  from monarch.common import remote as remote_module
  from monarch.common.device_mesh import DeviceMesh
- from monarch.common.remote import Remote
+ from monarch.common.remote import call_on_shard_and_fetch, Remote
+ from monarch.mesh_controller import RemoteException as NewRemoteException

  from monarch.opaque_module import OpaqueModule
  from monarch.opaque_object import opaque_method, OpaqueObject
@@ -57,6 +58,8 @@ from monarch.worker._testing_function import (
  from monarch_supervisor.logging import fix_exception_lines
  from torch.distributed import ReduceOp

+ RemoteException = (NewRemoteException, OldRemoteException)
+

  def custom_excepthook(exc_type, exc_value, exc_traceback):
      tb_lines = fix_exception_lines(
@@ -181,7 +184,9 @@ class RemoteFunctionsTestBase:
  # out is not counted as a failure, so we set a more restrictive timeout to
  # ensure we see a hard failure in CI.
  @pytest.mark.timeout(120)
- @pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+ @pytest.mark.parametrize(
+     "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+ )
  class TestRemoteFunctions(RemoteFunctionsTestBase):
      @classmethod
      def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor):
@@ -326,7 +331,7 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):
              _ = fetch_shard(a).result(timeout=40)

      def test_set_device_inside_udf_fails_with_explanation(self, backend_type):
-         if backend_type == BackendType.PY:
+         if backend_type != BackendType.RS:
              pytest.skip("Python support not planned for this test")
          with self.local_device_mesh(2, 2, backend_type):
              t = set_device_udf(2)
@@ -628,11 +633,10 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):
          with self.local_device_mesh(2, 2, backend_type):
              assert (
                  "an argument processed"
-                 == remote("monarch.worker._testing_function.do_some_processing")
-                 .call_on_shard_and_fetch(
+                 == call_on_shard_and_fetch(
+                     remote("monarch.worker._testing_function.do_some_processing"),
                      "an argument",
-                 )
-                 .result()
+                 ).result()
              )

      def test_cached_remote_function(self, backend_type):
@@ -727,7 +731,7 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):

          with self.local_device_mesh(2, 2, backend_type):
              a = torch.ones(())
-             assert check.call_on_shard_and_fetch(bar(a, a)).result()
+             assert call_on_shard_and_fetch(check, bar(a, a)).result()
              # ensure we do not attempt to pickle closures
              close()

@@ -770,7 +774,7 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):

          with self.local_device_mesh(1, 1, backend_type):
              # This should be a valid return than an exception to raise
-             simple.call_on_shard_and_fetch().result()
+             call_on_shard_and_fetch(simple).result()

      def test_opaque_object(self, backend_type):
          with self.local_device_mesh(2, 2, backend_type):
@@ -948,10 +952,13 @@ class TestRemoteFunctions(RemoteFunctionsTestBase):
              x = outer_remote_function_that_calls_inner()
              try:
                  inspect(x)
-             except RemoteException as e:
+             except OldRemoteException as e:
                  backtrace = "\n".join([frame.name for frame in e.worker_frames])
                  assert "outer_remote_function" in backtrace
                  assert "inner_remote_function" in backtrace
+             except NewRemoteException as e:
+                 assert "outer_remote_function" in e.worker_error_string
+                 assert "inner_remote_function" in e.worker_error_string

      def test_remote_function_broadcast(self, backend_type):
          with self.local_device_mesh(2, 2, backend_type) as device_mesh:
@@ -1269,3 +1276,24 @@ def a_function_called_by_a_live_function(x):

  def a_live_function_call_by_a_live_function(x):
      return 3 * x
+
+
+ @remote
+ def return_them(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     return (x, y)
+
+
+ @pytest.mark.skipif(
+     torch.cuda.device_count() < 2,
+     reason="Not enough GPUs, this test requires at least 2 GPUs",
+ )
+ class TestMeshSpecific(RemoteFunctionsTestBase):
+     def test_value_mesh(self):
+         with self.local_device_mesh(2, 2, "mesh") as device_mesh:
+             x = device_mesh.rank("host")
+             y = device_mesh.rank("gpu")
+             r = return_them.call(x, y).get()
+
+             for p, (h, g) in r:
+                 assert p["host"] == h.item()
+                 assert p["gpu"] == g.item()
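
The recurring edit in the hunks above is an API move: call_on_shard_and_fetch is no longer a method on a remote function but a free function imported from monarch.common.remote that takes the remote as its first argument. A before/after sketch, assuming only the signatures visible in this diff:

from monarch import remote
from monarch.common.remote import call_on_shard_and_fetch

do_it = remote("monarch.worker._testing_function.do_some_processing")

# 2025.6.30 style (removed):
#   result = do_it.call_on_shard_and_fetch("an argument").result()
# 2025.7.25 style:
result = call_on_shard_and_fetch(do_it, "an argument").result()
# keyword arguments such as shard={"host": 0, "gpu": 0} pass through unchanged
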
tests/test_rust_backend.py CHANGED
@@ -17,6 +17,7 @@ import torch
  import torch.utils._python_dispatch
  from monarch import fetch_shard, no_mesh, remote, Stream
  from monarch.common.device_mesh import DeviceMesh
+ from monarch.common.remote import call_on_shard_and_fetch
  from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
  from torch.nn.attention import sdpa_kernel, SDPBackend
  from torch.nn.functional import scaled_dot_product_attention
@@ -111,9 +112,10 @@ class TestRustBackend(TestCase):
          with local_mesh():
              assert (
                  "an argument processed"
-                 == remote("monarch.worker._testing_function.do_some_processing")
-                 .call_on_shard_and_fetch("an argument")
-                 .result()
+                 == call_on_shard_and_fetch(
+                     remote("monarch.worker._testing_function.do_some_processing"),
+                     "an argument",
+                 ).result()
              )

      def test_brutal_shutdown(self):
@@ -143,8 +145,8 @@ class TestRustBackend(TestCase):
              return torch.isnan(t).any().item()

          t = torch.rand(3, 4)
-         res = has_nan.call_on_shard_and_fetch(
-             t, shard={"host": 0, "gpu": 0}
+         res = call_on_shard_and_fetch(
+             has_nan, t, shard={"host": 0, "gpu": 0}
          ).result()

          self.assertFalse(res)
tests/test_sim_backend.py CHANGED
@@ -24,11 +24,8 @@ def local_sim_mesh(
      # TODO: support multiple gpus in a mesh.
      gpu_per_host: int = 1,
      activate: bool = True,
-     proxy_addr: Optional[str] = None,
  ) -> Generator[DeviceMesh, None, None]:
-     dms = sim_mesh(
-         n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host, proxy_addr=proxy_addr
-     )
+     dms = sim_mesh(n_meshes=1, hosts=hosts, gpus_per_host=gpu_per_host)
      dm = dms[0]
      try:
          if activate:
tests/test_tensor_engine.py CHANGED
@@ -7,8 +7,9 @@
  import monarch
  import pytest
  import torch
+ from monarch import remote
+ from monarch.actor import Actor, endpoint, proc_mesh
  from monarch.mesh_controller import spawn_tensor_engine
- from monarch.proc_mesh import proc_mesh


  two_gpu = pytest.mark.skipif(
@@ -32,6 +33,14 @@ def test_tensor_engine() -> None:
      assert torch.allclose(torch.zeros(3, 4), r)
      assert torch.allclose(torch.zeros(3, 4), f)

+     @remote(propagate=lambda x: x)
+     def nope(x):
+         raise ValueError("nope")
+
+     with pytest.raises(monarch.mesh_controller.RemoteException):
+         with dm.activate():
+             monarch.inspect(nope(torch.zeros(3, 4)))
+
      dm.exit()


@@ -50,3 +59,48 @@ def test_proc_mesh_tensor_engine() -> None:
      assert a == 0
      assert b == 10
      assert c == 100
+
+
+ class AddWithState(Actor):
+     def __init__(self, state: torch.Tensor):
+         super().__init__()
+         self.state = state
+
+     @endpoint
+     def forward(self, x) -> torch.Tensor:
+         return x + self.state
+
+
+ @two_gpu
+ def test_actor_with_tensors() -> None:
+     pm = proc_mesh(gpus=1).get()
+     with pm.activate():
+         x = pm.spawn("adder", AddWithState, torch.ones(())).get()
+         y = torch.ones(())
+         assert x.forward.call(y).get(timeout=5).item(hosts=0, gpus=0).item() == 2
+
+
+ class Counter(Actor):
+     def __init__(self):
+         super().__init__()
+         self.c = 0
+
+     @endpoint
+     def incr(self, x) -> int:
+         self.c += 1
+         return self.c - 1
+
+
+ @two_gpu
+ def test_actor_tensor_ordering() -> None:
+     pm = proc_mesh(gpus=1).get()
+     with pm.activate():
+         counter = pm.spawn("a", Counter).get()
+         results = []
+         for _ in range(0, 10, 2):
+             # tensor engine call
+             results.append(counter.incr.call(torch.ones(())))
+             # non-tensor engine call
+             results.append(counter.incr.call(1))
+
+         assert list(range(10)) == [r.get().item(hosts=0, gpus=0) for r in results]
{torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torchmonarch-nightly
- Version: 2025.6.30
+ Version: 2025.7.25
  Summary: Monarch: Single controller library
  Author: Meta
  Author-email: oncall+monarch@xmail.facebook.com
@@ -15,6 +15,8 @@ Requires-Dist: numpy
  Requires-Dist: pyre-extensions
  Requires-Dist: cloudpickle
  Requires-Dist: torchx-nightly
+ Requires-Dist: lark
+ Requires-Dist: tabulate
  Dynamic: author
  Dynamic: author-email
  Dynamic: description
@@ -69,6 +71,9 @@ sudo dnf install clang-devel libnccl-devel
  conda install -c conda-forge clangdev nccl
  conda update -n monarchenv --all -c conda-forge -y

+ # If you are building with RDMA support, build monarch with `USE_TENSOR_ENGINE=1 pip install --no-build-isolation .` and dnf install the following packages
+ sudo dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+
  # Install build dependencies
  pip install -r build-requirements.txt
  # Install test dependencies