PyPI - torchmonarch-nightly - Versions diffs - 2025.6.9__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.6.9__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

monarch/_rust_bindings.so CHANGED Viewed

Binary file

monarch/actor_mesh.py CHANGED Viewed

@@ -7,6 +7,7 @@
 import asyncio
 import collections
 import contextvars
+import functools
 import inspect
 import itertools
@@ -38,6 +39,7 @@ from typing import (
 import monarch
 from monarch import ActorFuture as Future
+from monarch._rust_bindings.hyperactor_extension.telemetry import enter_span, exit_span
 from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
 from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
@@ -49,6 +51,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
 )
 from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
 from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
 from monarch.common.pickle_flatten import flatten, unflatten
 from monarch.common.shape import MeshTrait, NDSlice
@@ -83,7 +86,7 @@ class MonarchContext:
 _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
-    "monarch.service._context"
+    "monarch.actor_mesh._context"
 )
@@ -492,13 +495,29 @@ class _Actor:
                 return None
             else:
                 the_method = getattr(self.instance, message.method)._method
-                result = the_method(self.instance, *args, **kwargs)
                 if not inspect.iscoroutinefunction(the_method):
+                    enter_span(
+                        the_method.__module__, message.method, str(ctx.mailbox.actor_id)
+                    )
+                    result = the_method(self.instance, *args, **kwargs)
+                    exit_span()
                     if port is not None:
                         port.send("result", result)
                     return None
-                return self.run_async(ctx, self.run_task(port, result, panic_flag))
+                async def instrumented():
+                    enter_span(
+                        the_method.__module__, message.method, str(ctx.mailbox.actor_id)
+                    )
+                    result = await the_method(self.instance, *args, **kwargs)
+                    exit_span()
+                    return result
+                return self.run_async(
+                    ctx,
+                    self.run_task(port, instrumented(), panic_flag),
+                )
         except Exception as e:
             traceback.print_exc()
             s = ActorError(e)
@@ -510,7 +529,11 @@ class _Actor:
             else:
                 raise s from None
-    async def run_async(self, ctx, coroutine):
+    async def run_async(
+        self,
+        ctx: MonarchContext,
+        coroutine: Coroutine[Any, None, Any],
+    ) -> None:
         _context.set(ctx)
         if self.complete_task is None:
             self.complete_task = asyncio.create_task(self._complete())
@@ -564,6 +587,12 @@ def _unpickle(data: bytes, mailbox: Mailbox) -> Any:
 class Actor(MeshTrait):
+    @functools.cached_property
+    def logger(cls) -> logging.Logger:
+        lgr = logging.getLogger(cls.__class__.__name__)
+        lgr.setLevel(logging.DEBUG)
+        return lgr
     @property
     def _ndslice(self) -> NDSlice:
         raise NotImplementedError(
@@ -677,7 +706,7 @@ class ActorError(Exception):
     def __init__(
         self,
         exception: Exception,
-        message: str = "A remote service call has failed asynchronously.",
+        message: str = "A remote actor call has failed asynchronously.",
     ) -> None:
         self.exception = exception
         self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
@@ -688,7 +717,7 @@ class ActorError(Exception):
         actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
         return (
             f"{self.message}\n"
-            f"Traceback of where the service call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
+            f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
         )

monarch/allocator.py CHANGED Viewed

@@ -4,6 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-strict
+import abc
 from typing import final
 from monarch import ActorFuture as Future
@@ -15,6 +18,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//mon
 from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     LocalAllocatorBase,
     ProcessAllocatorBase,
+    RemoteAllocatorBase,
 )
@@ -60,3 +64,66 @@ class LocalAllocator(LocalAllocatorBase):
             lambda: self.allocate_nonblocking(spec),
             lambda: self.allocate_blocking(spec),
         )
+class RemoteAllocInitializer(abc.ABC):
+    """Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
+    NOTE: changes to method signatures of this class must be made to the call-site at
+    `PyRemoteProcessAllocInitializer.py_initialize_alloc()` in `monarch/monarch_hyperactor/src/alloc.rs`
+    """
+    @abc.abstractmethod
+    async def initialize_alloc(self) -> list[str]:
+        """
+        Return the addresses of the servers that should be used to allocate processes
+        for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
+        Each address is of the form `{transport}!{addr}(:{port})`.
+        This is the string form of `hyperactor::channel::ChannelAddr` (Rust).
+        For example, `tcp!127.0.0.1:1234`.
+        NOTE: Currently, all the addresses must have the same transport type and port
+        NOTE: Although this method is currently called once at the initialization of the Allocator,
+            in the future this method can be called multiple times and should return the current set of
+            addresses that are eligible to handle allocation requests.
+        """
+        ...
+class StaticRemoteAllocInitializer(RemoteAllocInitializer):
+    """
+    Returns the static list of server addresses that this initializer
+    was constructed with on each `initialize_alloc()` call.
+    """
+    def __init__(self, *addrs: str) -> None:
+        super().__init__()
+        self.addrs: list[str] = list(addrs)
+    async def initialize_alloc(self) -> list[str]:
+        return list(self.addrs)
+@final
+class RemoteAllocator(RemoteAllocatorBase):
+    """
+    An allocator that allocates by spawning actors on a remote host.
+    The remote host must be running hyperactor's remote-process-allocator.
+    """
+    def allocate(self, spec: AllocSpec) -> Future[Alloc]:
+        """
+        Allocate a process according to the provided spec.
+        Arguments:
+        - `spec`: The spec to allocate according to.
+        Returns:
+        - A future that will be fulfilled when the requested allocation is fulfilled.
+        """
+        return Future(
+            lambda: self.allocate_nonblocking(spec),
+            lambda: self.allocate_blocking(spec),
+        )

monarch/bootstrap_main.py CHANGED Viewed

@@ -58,7 +58,7 @@ def invoke_main():
     # forward logs to rust tracing. Defaults to on.
     if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
-        logging.root.addHandler(TracingForwarder())
+        logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
     try:
         with (

monarch/mesh_controller.py ADDED Viewed

@@ -0,0 +1,209 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import traceback
+from collections import deque
+from logging import Logger
+from typing import List, NamedTuple, Optional, Union
+import torch.utils._python_dispatch
+from monarch import NDSlice
+from monarch._rust_bindings.monarch_extension import client, debugger
+from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monarch/monarch_extension:monarch_extension
+    WorldState,
+)
+from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
+from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
+    ActorId,
+)
+from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
+from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
+from monarch.common.client import Client
+from monarch.common.controller_api import LogMessage, MessageResult
+from monarch.common.device_mesh import DeviceMesh, no_mesh
+from monarch.common.invocation import DeviceException, RemoteException
+from monarch.controller.debugger import read as debugger_read, write as debugger_write
+from monarch.proc_mesh import ProcMesh
+from pyre_extensions import none_throws
+logger: Logger = logging.getLogger(__name__)
+class Controller(_Controller):
+    def __init__(self, workers: HyProcMesh) -> None:
+        super().__init__()
+        # Buffer for messages unrelated to debugging that are received while a
+        # debugger session is active.
+        self._non_debugger_pending_messages: deque[
+            Optional[client.LogMessage | client.WorkerResponse]
+        ] = deque()
+        self._pending_debugger_sessions: deque[ActorId] = deque()
+    def next_message(
+        self, timeout: Optional[float]
+    ) -> Optional[LogMessage | MessageResult]:
+        if self._non_debugger_pending_messages:
+            msg = self._non_debugger_pending_messages.popleft()
+        else:
+            msg = self._get_next_message(timeout_msec=int((timeout or 0.0) * 1000.0))
+        if msg is None:
+            return None
+        if isinstance(msg, client.WorkerResponse):
+            return _worker_response_to_result(msg)
+        elif isinstance(msg, client.LogMessage):
+            return LogMessage(msg.level, msg.message)
+        elif isinstance(msg, client.DebuggerMessage):
+            self._run_debugger_loop(msg)
+    def send(
+        self,
+        ranks: Union[NDSlice, List[NDSlice]],
+        msg: NamedTuple,
+    ) -> None:
+        with torch.utils._python_dispatch._disable_current_modes():
+            return super().send(ranks, msg)
+    def drain_and_stop(
+        self,
+    ) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
+        logger.info("rust controller shutting down")
+        results = []
+        for msg in self._drain_and_stop():
+            if isinstance(msg, client.WorkerResponse):
+                results.append(_worker_response_to_result(msg))
+            elif isinstance(msg, client.LogMessage):
+                results.append(LogMessage(msg.level, msg.message))
+            elif isinstance(msg, client.DebuggerMessage):
+                results.append(msg)
+            else:
+                raise RuntimeError(f"Unexpected message type {type(msg)}")
+        return results
+    def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
+        if not isinstance(message.action, DebuggerAction.Paused):
+            raise RuntimeError(
+                f"Unexpected debugger message {message} when no debugger session is running"
+            )
+        self._pending_debugger_sessions.append(message.debugger_actor_id)
+        while self._pending_debugger_sessions:
+            debugger_actor_id = self._pending_debugger_sessions.popleft()
+            rank = debugger_actor_id.rank
+            proc_id = debugger_actor_id.proc_id
+            debugger_write(
+                f"pdb attached to proc {proc_id} with rank {rank}, debugger actor {debugger_actor_id} \n"
+            )
+            self._debugger_attach(debugger_actor_id)
+            while True:
+                # TODO: Add appropriate timeout.
+                msg = self._get_next_message(timeout_msec=None)
+                if not isinstance(msg, client.DebuggerMessage):
+                    self._non_debugger_pending_messages.append(msg)
+                    continue
+                if msg.debugger_actor_id != debugger_actor_id:
+                    if isinstance(msg.action, DebuggerAction.Paused):
+                        self._pending_debugger_sessions.append(msg.debugger_actor_id)
+                        continue
+                    else:
+                        raise RuntimeError(
+                            f"unexpected debugger message {msg} from rank {msg.debugger_actor_id.rank} "
+                            f"when debugging rank {debugger_actor_id.rank}"
+                        )
+                action = msg.action
+                if isinstance(action, DebuggerAction.Detach):
+                    break
+                elif isinstance(action, DebuggerAction.Read):
+                    self._debugger_write(
+                        debugger_actor_id, debugger_read(action.requested_size)
+                    )
+                elif isinstance(action, DebuggerAction.Write):
+                    debugger_write(
+                        debugger.get_bytes_from_write_action(action).decode()
+                    )
+                else:
+                    raise RuntimeError(
+                        f"unexpected debugger message {msg} when debugging rank {debugger_actor_id.rank}"
+                    )
+    def worker_world_state(self) -> WorldState:
+        raise NotImplementedError("worker world state")
+    def stop_mesh(self):
+        # I think this is a noop?
+        pass
+# TODO: Handling conversion of the response can move to a separate module over time
+# especially as we have structured error messages.
+def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
+    if not result.is_exception():
+        # The result of the message needs to be unwrapped on a real device.
+        # Staying as a fake tensor will fail the tensor deserialization.
+        with no_mesh.activate():
+            return MessageResult(result.seq, result.result(), None)
+    exc = none_throws(result.exception())
+    if isinstance(exc, client.Error):
+        worker_frames = [
+            traceback.FrameSummary("<unknown>", None, frame)
+            for frame in exc.backtrace.split("\\n")
+        ]
+        logger.error(f"Worker {exc.actor_id} failed")
+        return MessageResult(
+            seq=result.seq,
+            result=None,
+            error=RemoteException(
+                seq=exc.caused_by_seq,
+                exception=RuntimeError(exc.backtrace),
+                controller_frame_index=0,  # TODO: T225205291 fix this once we have recording support in rust
+                controller_frames=None,
+                worker_frames=worker_frames,
+                source_actor_id=exc.actor_id,
+                message=f"Worker {exc.actor_id} failed",
+            ),
+        )
+    elif isinstance(exc, client.Failure):
+        frames = [
+            traceback.FrameSummary("<unknown>", None, frame)
+            for frame in exc.backtrace.split("\n")
+        ]
+        reason = f"Actor {exc.actor_id} crashed on {exc.address}, check the host log for details"
+        logger.error(reason)
+        return MessageResult(
+            seq=0,  # seq is not consumed for DeviceException; it will be directly thrown by the client
+            result=None,
+            error=DeviceException(
+                exception=RuntimeError(reason),
+                frames=frames,
+                source_actor_id=exc.actor_id,
+                message=reason,
+            ),
+        )
+    else:
+        raise RuntimeError(f"Unknown exception type: {type(exc)}")
+def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
+    # This argument to Controller
+    # is currently only used for debug printing. It should be fixed to
+    # report the proc ID instead of the rank it currently does.
+    gpus = proc_mesh.sizes.get("gpus", 1)
+    backend_ctrl = Controller(proc_mesh._proc_mesh)
+    client = Client(backend_ctrl, proc_mesh.size(), gpus)
+    dm = DeviceMesh(
+        client,
+        NDSlice.new_row_major(list(proc_mesh.sizes.values())),
+        tuple(proc_mesh.sizes.keys()),
+    )
+    dm.exit = lambda: client.shutdown()
+    return dm

monarch/monarch_controller CHANGED Viewed

Binary file

tests/test_allocator.py ADDED Viewed

@@ -0,0 +1,216 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+import contextlib
+import importlib.resources
+import math
+import os
+import subprocess
+import sys
+import unittest
+from datetime import timedelta
+from typing import Generator
+import cloudpickle
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from monarch._rust_bindings.hyperactor_extension.alloc import (
+    AllocConstraints,
+    AllocSpec,
+)
+from monarch._rust_bindings.monarch_hyperactor.channel import (
+    ChannelAddr,
+    ChannelTransport,
+)
+from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
+from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
+from monarch.proc_mesh import ProcMesh
+from torch.distributed.elastic.utils.distributed import get_free_port
+_100_MILLISECONDS = timedelta(milliseconds=100)
+class TestActor(Actor):
+    """Silly actor that computes the world size by all-reducing rank-hot tensors"""
+    def __init__(self) -> None:
+        self.rank: int = current_rank().rank
+        self.world_size: int = math.prod(current_size().values())
+    @endpoint
+    async def compute_world_size(self, master_addr: str, master_port: int) -> int:
+        os.environ["MASTER_ADDR"] = master_addr
+        os.environ["MASTER_PORT"] = str(master_port)
+        dist.init_process_group("gloo", rank=self.rank, world_size=self.world_size)
+        try:
+            t = F.one_hot(torch.tensor(self.rank), num_classes=dist.get_world_size())
+            dist.all_reduce(t)
+            return int(torch.sum(t).item())
+        finally:
+            dist.destroy_process_group()
+@contextlib.contextmanager
+def remote_process_allocator() -> Generator[str, None, None]:
+    with importlib.resources.path(__package__, "") as package_path:
+        addr = ChannelAddr.any(ChannelTransport.Unix)
+        process_allocator = subprocess.Popen(
+            args=[
+                "process_allocator",
+                f"--addr={addr}",
+            ],
+            env={
+                # prefix PATH with this test module's directory to
+                # give 'process_allocator' and 'monarch_bootstrap' binary resources
+                # in this test module's directory precedence over the installed ones
+                # useful in BUCK where these binaries are added as 'resources' of this test target
+                "PATH": f"{package_path}:{os.getenv('PATH', '')}",
+                "RUST_LOG": "debug",
+            },
+        )
+        try:
+            yield addr
+        finally:
+            process_allocator.terminate()
+            try:
+                five_seconds = 5
+                process_allocator.wait(timeout=five_seconds)
+            except subprocess.TimeoutExpired:
+                process_allocator.kill()
+class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
+    def assert_computed_world_size(
+        self, computed: ValueMesh[int], expected_world_size: int
+    ) -> None:
+        expected_world_sizes = {
+            rank: expected_world_size for rank in range(0, expected_world_size)
+        }
+        computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
+        self.assertDictEqual(expected_world_sizes, computed_world_sizes)
+    async def test_call_allocate_twice(self) -> None:
+        class DeletingAllocInitializer(StaticRemoteAllocInitializer):
+            """test initializer that removes the last address from the list each time initialize_alloc() is called
+            used to test that the state of the initializer is preserved across calls to allocate()
+            """
+            async def initialize_alloc(self) -> list[str]:
+                alloc = await super().initialize_alloc()
+                self.addrs.pop(-1)
+                return alloc
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            initializer = DeletingAllocInitializer(host1, host2)
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=initializer,
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
+            await allocator.allocate(spec)
+            self.assertEqual([host1], initializer.addrs)
+            await allocator.allocate(spec)
+            self.assertEqual([], initializer.addrs)
+    async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
+        class EmptyAllocInitializer(StaticRemoteAllocInitializer):
+            """test initializer that returns an empty list of addresses"""
+            async def initialize_alloc(self) -> list[str]:
+                return []
+        empty_initializer = EmptyAllocInitializer()
+        with self.assertRaisesRegex(
+            RuntimeError, r"initializer must return non-empty list of addresses"
+        ):
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=empty_initializer,
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
+    async def test_allocate_2d_mesh(self) -> None:
+        hosts = 2
+        gpus = 4
+        world_size = hosts * gpus
+        spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            actor = await proc_mesh.spawn("test_actor", TestActor)
+            values = await actor.compute_world_size.call(
+                master_addr="::",
+                master_port=get_free_port(),
+            )
+            self.assert_computed_world_size(values, world_size)
+    async def test_stacked_1d_meshes(self) -> None:
+        # create two stacked actor meshes on the same host
+        # each actor mesh running on separate process-allocators
+        with remote_process_allocator() as host1_a, remote_process_allocator() as host1_b:
+            allocator_a = RemoteAllocator(
+                world_id="a",
+                initializer=StaticRemoteAllocInitializer(host1_a),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            allocator_b = RemoteAllocator(
+                world_id="b",
+                initializer=StaticRemoteAllocInitializer(host1_b),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
+            spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
+            proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
+            proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
+            actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
+            actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
+            results_a = await actor_a.compute_world_size.call(
+                master_addr="::", master_port=get_free_port()
+            )
+            results_b = await actor_b.compute_world_size.call(
+                master_addr="::", master_port=get_free_port()
+            )
+            self.assert_computed_world_size(results_a, 2)  # a is a 1x2 mesh
+            self.assert_computed_world_size(results_b, 6)  # b is a 1x6 mesh

tests/test_python_actors.py CHANGED Viewed

@@ -7,7 +7,12 @@
 import operator
 from types import ModuleType
+import monarch
+import pytest
 import torch
 from monarch.actor_mesh import (
     Accumulator,
     Actor,
@@ -17,6 +22,8 @@ from monarch.actor_mesh import (
     endpoint,
 )
+from monarch.mesh_controller import spawn_tensor_engine
 from monarch.proc_mesh import local_proc_mesh, proc_mesh
 from monarch.rdma import RDMABuffer
@@ -375,3 +382,20 @@ def test_rust_binding_modules_correct() -> None:
                 assert value.__module__ == path
     check(bindings, "monarch._rust_bindings")
+def test_tensor_engine() -> None:
+    pm = proc_mesh(gpus=2).get()
+    dm = spawn_tensor_engine(pm)
+    with dm.activate():
+        r = monarch.inspect(2 * torch.zeros(3, 4))
+    fm = dm.flatten("all")
+    with fm.activate():
+        f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
+    assert torch.allclose(torch.zeros(3, 4), r)
+    assert torch.allclose(torch.zeros(3, 4), f)
+    dm.exit()

tests/test_rust_backend.py CHANGED Viewed

@@ -14,6 +14,7 @@ import monarch
 import pytest
 import torch
+import torch.utils._python_dispatch
 from monarch import fetch_shard, no_mesh, remote, Stream
 from monarch.common.device_mesh import DeviceMesh
 from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
@@ -180,3 +181,37 @@ class TestRustBackend(TestCase):
             self.assertIsNotNone(mesh_info.mesh_labels)
             self.assertEqual(len(mesh_info.devices_labels), 2)
+    def test_ivalue_problems(self) -> None:
+        with local_mesh(hosts=1, gpu_per_host=1):
+            from typing import cast
+            from monarch.common.messages import CallFunction, CommandGroup
+            a = cast(monarch.Tensor, torch.rand(3, 4))
+            result = monarch.Tensor(a._fake, a.mesh, a.stream)
+            msg = CallFunction(
+                0,
+                result,
+                (),
+                monarch.common.function.ResolvableFunctionFromPath(
+                    "torch.ops.aten.mul.Tensor"
+                ),
+                (2, a),
+                {},
+                a.stream._to_ref(a.mesh.client),
+                a.mesh,
+                [],
+            )
+            # Internally, this will call CallFunction(...).to_rust_message().
+            # The 2 arg will be converted to an IValue tensor via rust + C++.
+            # Then when the CommandGroup message gets converted to rust, it
+            # will attempt to clone the rust CallFunction message, which will
+            # attempt to clone the IValue tensor, which will cause a crash.
+            # Upon attempting to clone the IValue tensor, our custom __torch_dispatch__
+            # intercepts the following two calls:
+            #   aten._to_copy.default () (2,) {'dtype': torch.float64, 'device': device(type='cpu')}
+            #   aten.clone.default () (2,) {}
+            with torch.utils._python_dispatch._disable_current_modes():
+                CommandGroup([msg]).to_rust_message()

{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.6.9
+Version: 2025.6.11
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com

{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,16 @@
 monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
-monarch/_rust_bindings.so,sha256=hpE8smD7nqpmTQZsRLg08OMLryvUehI9_0aDbdcsVLQ,39166496
+monarch/_rust_bindings.so,sha256=g2tlum6iqfdR4KRkVhp_BwUmlz0tYUSITNVaJjSNitE,40645720
 monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
-monarch/actor_mesh.py,sha256=5DbU9OrmNk5I9yasmE-rkTgHyO07oiLlAG0jbJBOXgI,23000
-monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
-monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
+monarch/actor_mesh.py,sha256=4I8xp_XIM6KZJY_jXVjJ8tPW2l1J4a6ZhrknU7zKbAk,23947
+monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
+monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
 monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
 monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
 monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
 monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
-monarch/monarch_controller,sha256=TvAJzOeJIiFdC9QPnzrsw5ziCFA9balBWzEStq3O8u8,20395288
+monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
+monarch/monarch_controller,sha256=41B7zLv7M7_CSmChN5bfvVrygi2VeBhMDcNQXlnbVZU,20394376
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
 monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
@@ -131,6 +132,7 @@ tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,48
 tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
 tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
 tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
+tests/test_allocator.py,sha256=dqQbQyOjOX3JgnHIPT0iawT0wMeFztbLCYjK2tl8GcI,8149
 tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
 tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
 tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -139,9 +141,9 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
 tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
 tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
 tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
-tests/test_python_actors.py,sha256=fDvHUIWNZeL3CWnTJMbdh98i1tnH1-LJEG1pIFkGYF8,10898
+tests/test_python_actors.py,sha256=gP6MDN2BL282qInUGP9untlpsqqB2uy1Iq5gUXnXcUo,11387
 tests/test_remote_functions.py,sha256=ExqYlRQWRabpGBuKvNIOa8Hwj-iXuP87Jfb9i5RhaGs,50066
-tests/test_rust_backend.py,sha256=nXSa0ZQ0NniZm4PzvKhrWvVLD-RKvIWYkPXm1BEBXq8,6235
+tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
 tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
 tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
 tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -149,9 +151,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.6.9.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
-torchmonarch_nightly-2025.6.9.dist-info/METADATA,sha256=SFAiEIRUzlpHy2_j-bRjx22U-753WotqxjEp0uwud-w,2771
-torchmonarch_nightly-2025.6.9.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
-torchmonarch_nightly-2025.6.9.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
-torchmonarch_nightly-2025.6.9.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
-torchmonarch_nightly-2025.6.9.dist-info/RECORD,,
+torchmonarch_nightly-2025.6.11.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.6.11.dist-info/METADATA,sha256=SCdAxETtVZ5ESzbLepOp6mf1L4G-HSYVkjdRFT7D0kg,2772
+torchmonarch_nightly-2025.6.11.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+torchmonarch_nightly-2025.6.11.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+torchmonarch_nightly-2025.6.11.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.6.11.dist-info/RECORD,,

{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.9.dist-info → torchmonarch_nightly-2025.6.11.dist-info}/top_level.txt RENAMED Viewed

File without changes