torchmonarch-nightly 2025.7.28__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.30__cp311-cp311-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- monarch/_rust_bindings.so +0 -0
- monarch/_src/actor/actor_mesh.py +9 -5
- monarch/_src/actor/allocator.py +18 -47
- monarch/_src/actor/debugger.py +159 -98
- monarch/_src/actor/endpoint.py +15 -4
- monarch/_src/actor/future.py +79 -32
- monarch/_src/actor/pdb_wrapper.py +10 -4
- monarch/_src/actor/proc_mesh.py +82 -114
- monarch/_src/actor/shape.py +32 -33
- monarch/_src/tensor_engine/rdma.py +12 -6
- monarch/mesh_controller.py +37 -4
- monarch/monarch_controller +0 -0
- monarch/tools/components/hyperactor.py +1 -1
- monarch/tools/config/__init__.py +1 -1
- monarch/tools/config/defaults.py +1 -1
- monarch/tools/utils.py +27 -0
- tests/test_actor_error.py +3 -4
- tests/test_actor_shape.py +114 -0
- tests/test_debugger.py +406 -178
- tests/test_python_actors.py +67 -67
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/METADATA +1 -1
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/RECORD +26 -25
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/entry_points.txt +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.7.28.dist-info → torchmonarch_nightly-2025.7.30.dist-info}/top_level.txt +0 -0
monarch/_src/actor/future.py
CHANGED

@@ -7,7 +7,21 @@
 import asyncio
 import traceback
 from functools import partial
-from typing import
+from typing import (
+    Any,
+    cast,
+    Coroutine,
+    Generator,
+    Generic,
+    Literal,
+    NamedTuple,
+    Optional,
+    TypeVar,
+)
+
+from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
+
+from typing_extensions import Self

 R = TypeVar("R")

@@ -48,43 +62,76 @@ async def _aincomplete(impl, self):
 # loop machinery, this gives it the same throughput as if we ran it synchronously.


-class
-
-        self._aget = partial(_aincomplete, impl)
-        self._requires_loop = requires_loop
+class _Unawaited(NamedTuple):
+    coro: PythonTask

-    def get(self, timeout: Optional[float] = None) -> R:
-        if asyncio._get_running_loop() is not None:
-            raise RuntimeError("get() cannot be called from within an async context")
-        if timeout is not None:
-            return asyncio.run(asyncio.wait_for(self._aget(self), timeout))
-        if not self._requires_loop:
-            try:
-                coro = self._aget(self)
-                next(coro.__await__())
-                tb_str = "".join(traceback.format_stack(coro.cr_frame))
-                raise RuntimeError(
-                    f"a coroutine paused with a future with requires_loop=False cannot block on a python asyncio.Future. Use requires_loop=True.\n{tb_str}"
-                )
-            except StopIteration as e:
-                return e.value
-        return asyncio.run(self._aget(self))

-
-
+class _Complete(NamedTuple):
+    value: Any
+
+
+class _Exception(NamedTuple):
+    exe: Exception
+

-
-
-        return result
+class _Asyncio(NamedTuple):
+    fut: asyncio.Future

-        self._aget = af
-        return result

-
-        async def af(self):
-            raise e
+_Status = _Unawaited | _Complete | _Exception | _Asyncio

-
+
+class Future(Generic[R]):
+    def __init__(self, *, coro: "Coroutine[Any, Any, R] | PythonTask[R]"):
+        self._status: _Status = _Unawaited(
+            coro if isinstance(coro, PythonTask) else PythonTask.from_coroutine(coro)
+        )
+
+    def get(self, timeout: Optional[float] = None) -> R:
+        match self._status:
+            case _Unawaited(coro=coro):
+                try:
+                    if timeout is not None:
+                        coro = coro.with_timeout(timeout)
+                    v = coro.block_on()
+                    self._status = _Complete(v)
+                    return cast("R", v)
+                except Exception as e:
+                    self._status = _Exception(e)
+                    raise e from None
+            case _Asyncio(_):
+                raise ValueError(
+                    "already converted into an asyncio.Future, use 'await' to get the value."
+                )
+            case _Complete(value=value):
+                return cast("R", value)
+            case _Exception(exe=exe):
+                raise exe
+            case _:
+                raise RuntimeError("unknown status")
+
+    def __await__(self) -> Generator[Any, Any, R]:
+        match self._status:
+            case _Unawaited(coro=coro):
+                loop = asyncio.get_running_loop()
+                fut = loop.create_future()
+                self._status = _Asyncio(fut)
+
+                async def mark_complete():
+                    try:
+                        func, value = fut.set_result, await coro
+                    except Exception as e:
+                        func, value = fut.set_exception, e
+                    loop.call_soon_threadsafe(func, value)
+
+                PythonTask.from_coroutine(mark_complete()).spawn()
+                return fut.__await__()
+            case _Asyncio(fut=fut):
+                return fut.__await__()
+            case _:
+                raise ValueError(
+                    "already converted into a synchronous future, use 'get' to get the value."
+                )

 # compatibility with old tensor engine Future objects
 # hopefully we do not need done(), add_callback because
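The rewrite replaces the old impl/requires_loop callable protocol with a small state machine (_Unawaited, then _Complete, _Exception, or _Asyncio) driven by a Rust-backed PythonTask. One consequence visible in the code above: a Future may be consumed synchronously via get() or asynchronously via await, but not both. A minimal usage sketch (the compute coroutine is hypothetical; the semantics follow the class in the diff):

from monarch._src.actor.future import Future

async def compute() -> int:
    return 21 * 2

f = Future(coro=compute())
print(f.get(timeout=5))  # blocks on the tokio runtime, then caches _Complete(42)
print(f.get())           # second call is served from the cached _Complete state

async def consumer() -> None:
    f2 = Future(coro=compute())
    print(await f2)  # flips f2 into _Asyncio; a later f2.get() raises ValueError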
monarch/_src/actor/pdb_wrapper.py
CHANGED

@@ -47,9 +47,12 @@ class PdbWrapper(pdb.Pdb):
         super().__init__(stdout=WriteWrapper(self), stdin=ReadWrapper.create(self))
         self._first = True

-    def set_trace(self, frame):
+    def set_trace(self, frame=None):
         self.client_ref.debugger_session_start.broadcast(
-            self.rank,
+            self.rank,
+            self.coords,
+            socket.getfqdn(socket.gethostname()),
+            self.actor_id.actor_name,
         )
         if self.header:
             self.message(self.header)

@@ -67,7 +70,9 @@ class PdbWrapper(pdb.Pdb):
         super().do_clear(arg)

     def end_debug_session(self):
-        self.client_ref.debugger_session_end.broadcast(
+        self.client_ref.debugger_session_end.broadcast(
+            self.actor_id.actor_name, self.rank
+        )
         # Once the debug client actor is notified of the session being over,
         # we need to prevent any additional requests being sent for the session
         # by redirecting stdin and stdout.

@@ -88,7 +93,7 @@ class ReadWrapper(io.RawIOBase):
     def readinto(self, b):
         with fake_sync_state():
             response = self.session.client_ref.debugger_read.call_one(
-                self.session.rank, len(b)
+                self.session.actor_id.actor_name, self.session.rank, len(b)
             ).get()
             if response == "detach":
                 # this gets injected by the worker event loop to

@@ -124,6 +129,7 @@ class WriteWrapper:
             # pyre-ignore
             lineno = self.session.curframe.f_lineno
         self.session.client_ref.debugger_write.broadcast(
+            self.session.actor_id.actor_name,
             self.session.rank,
             DebuggerWrite(
                 s.encode(),
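Every debugger message now carries the actor name (and, at session start, the coords and host FQDN) alongside the rank, so the debug client can tell apart sessions from different actor meshes that happen to share a rank. A sketch of a registry keyed the same way (DebugSessions and its methods are illustrative, not the actual debugger.py implementation, which is not shown in this diff):

class DebugSessions:
    """Hypothetical registry: sessions keyed by (actor_name, rank), not rank alone."""

    def __init__(self) -> None:
        self._sessions: dict[tuple[str, int], object] = {}

    def start(self, rank: int, coords: dict, hostname: str, actor_name: str, session: object) -> None:
        # two actors debugging on the same rank no longer collide
        self._sessions[(actor_name, rank)] = session

    def end(self, actor_name: str, rank: int) -> None:
        self._sessions.pop((actor_name, rank), None)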
monarch/_src/actor/proc_mesh.py
CHANGED

@@ -37,15 +37,14 @@ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
     ProcMeshMonitor,
 )
 from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
-from monarch._src.actor.actor_mesh import
-    _Actor,
-    _ActorMeshRefImpl,
-    Actor,
-    ActorMeshRef,
-    fake_sync_state,
-)
+from monarch._src.actor.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef

-from monarch._src.actor.allocator import
+from monarch._src.actor.allocator import (
+    AllocateMixin,
+    LocalAllocator,
+    ProcessAllocator,
+    SimAllocator,
+)
 from monarch._src.actor.code_sync import (
     CodeSyncMeshClient,
     RemoteWorkspace,

@@ -111,29 +110,12 @@ except ImportError:
     IN_PAR = False


-async def _allocate_nonblocking(
-    alloc: Alloc, setup: Callable[[], None] | None = None
-) -> "ProcMesh":
-    _proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
-    if setup is None:
-        return ProcMesh(_proc_mesh)
-    # If the user has passed the setup lambda, we need to call
-    # it here before any of the other actors are spawned so that
-    # the environment variables are set up before cuda init.
-    proc_mesh = ProcMesh(_proc_mesh)
-    setup_actor = await proc_mesh.spawn("setup", SetupActor, setup)
-    await setup_actor.setup.call()
-    del setup_actor
-    return proc_mesh
-
-
 class ProcMesh(MeshTrait):
     def __init__(
         self,
         hy_proc_mesh: HyProcMesh,
         _mock_shape: Optional[Shape] = None,
         _device_mesh: Optional["DeviceMesh"] = None,
-        _is_initializing_debugger: bool = False,
     ) -> None:
         self._proc_mesh = hy_proc_mesh
         self._mock_shape: Optional[Shape] = _mock_shape

@@ -146,20 +128,32 @@ class ProcMesh(MeshTrait):
         self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
         self._stopped = False

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _init_manager_actors(
+        self,
+        setup: Callable[[], None] | None = None,
+    ) -> "ProcMesh":
+        _rdma_manager = (
+            # pyre-ignore
+            await _RdmaManager.create_rdma_manager_nonblocking(self._proc_mesh)
+            if HAS_TENSOR_ENGINE
+            else None
+        )
+
+        _debug_manager = await self._spawn_nonblocking(
+            _DEBUG_MANAGER_ACTOR_NAME, DebugManager, await _debug_client()
+        )
+
+        self._debug_manager = _debug_manager
+        self._rdma_manager = _rdma_manager
+
+        if setup is not None:
+            # If the user has passed the setup lambda, we need to call
+            # it here before any of the other actors are spawned so that
+            # the environment variables are set up before cuda init.
+            setup_actor = await self._spawn_nonblocking("setup", SetupActor, setup)
+            # pyre-ignore
+            await setup_actor.setup.call()._status.coro
+        return self

     @property
     def _shape(self) -> Shape:

@@ -184,10 +178,7 @@ class ProcMesh(MeshTrait):
     def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
         if self._mock_shape is not None:
             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
-        return Future(
-            impl=lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
-            requires_loop=False,
-        )
+        return Future(coro=self._spawn_nonblocking(name, Class, *args, **kwargs))

     async def monitor(self) -> ProcMeshMonitor:
         """

@@ -230,8 +221,7 @@ class ProcMesh(MeshTrait):
         ```
         """
         return Future(
-
-            requires_loop=False,
+            coro=_proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors=True)
         )

     def __repr__(self) -> str:

@@ -345,10 +335,7 @@ class ProcMesh(MeshTrait):
             await self._proc_mesh.stop_nonblocking()
             self._stopped = True

-        return Future(
-            impl=lambda: _stop_nonblocking(),
-            requires_loop=False,
-        )
+        return Future(coro=_stop_nonblocking())

     async def __aexit__(
         self, exc_type: object, exc_val: object, exc_tb: object

@@ -370,46 +357,15 @@ class ProcMesh(MeshTrait):
     # Cannot call stop here because it is async.


-async def local_proc_mesh_nonblocking(
-    *,
-    gpus: Optional[int] = None,
-    hosts: int = 1,
-    _is_initializing_debugger: bool = False,
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    allocator = LocalAllocator()
-    alloc = await allocator.allocate(spec)
-    proc_mesh = HyProcMesh.allocate_nonblocking(alloc)
-    return ProcMesh(
-        await proc_mesh,
-        _is_initializing_debugger=_is_initializing_debugger,
-    )
-
-
 def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
     return Future(
-
-        requires_loop=False,
+        coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=LocalAllocator())
     )


-async def sim_proc_mesh_nonblocking(
-    *, gpus: Optional[int] = None, hosts: int = 1
-) -> ProcMesh:
-    if gpus is None:
-        gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
-    allocator = SimAllocator()
-    alloc = await allocator.allocate(spec)
-    return await ProcMesh.from_alloc(alloc)
-
-
 def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
     return Future(
-
-        requires_loop=False,
+        coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=SimAllocator())
     )


@@ -431,33 +387,35 @@ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
     return cmd, args, env


-async def
+async def _proc_mesh_from_alloc_coro(
+    alloc: Alloc,
+    setup: Callable[[], None] | None,
+    init_manager_actors: bool,
+) -> ProcMesh:
+    _hy_proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
+    proc_mesh = ProcMesh(_hy_proc_mesh)
+    if init_manager_actors:
+        await proc_mesh._init_manager_actors(setup)
+    return proc_mesh
+
+
+async def _proc_mesh_coro(
     *,
+    allocator: AllocateMixin,
     gpus: Optional[int] = None,
     hosts: int = 1,
-    env: dict[str, str] | None = None,
     setup: Callable[[], None] | None = None,
+    init_manager_actors: bool = True,
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
     # gpus must come last in this order because
     # test_remote_function_all_gather expects that hosts comes before gpus
     # in the order of the dimensions.
-    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
-
-    # Todo: Deprecate the env field from the ProcessAllocator
-    # The PAR_MAIN_OVERRIDE needs to be passed as an env
-    # to the proc mesh construction in rust, so can not be moved to the
-    # SetupActor yet
-    cmd, args, bootstrap_env = _get_bootstrap_args()
-    env.update(bootstrap_env)
-    allocator = ProcessAllocator(cmd, args, env)
-    alloc = await allocator.allocate(spec)
+    spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+    alloc = await allocator.allocate_nonblocking(spec)

-    return await
-        alloc,
-        setup=setup,
-    )
+    return await _proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors)


 def proc_mesh(

@@ -467,12 +425,22 @@ def proc_mesh(
     env: dict[str, str] | None = None,
     setup: Callable[[], None] | None = None,
 ) -> Future[ProcMesh]:
-
-
-
-
-
+    env = env or {}
+
+    # Todo: Deprecate the env field from the ProcessAllocator
+    # The PAR_MAIN_OVERRIDE needs to be passed as an env
+    # to the proc mesh construction in rust, so can not be moved to the
+    # SetupActor yet
+    cmd, args, bootstrap_env = _get_bootstrap_args()
+    env.update(bootstrap_env)
+    task = _proc_mesh_coro(
+        gpus=gpus,
+        hosts=hosts,
+        setup=setup,
+        allocator=ProcessAllocator(cmd, args, env),
+        init_manager_actors=True,
     )
+    return Future(coro=task)


 _debug_proc_mesh: Optional["ProcMesh"] = None

@@ -482,15 +450,12 @@ _debug_proc_mesh: Optional["ProcMesh"] = None
 # doesn't trigger the debug client to spawn, which could cause confusing
 # logs. This is defined in proc_mesh.py instead of debugger.py for
 # circular import reasons.
-def _get_debug_proc_mesh() -> "ProcMesh":
+async def _get_debug_proc_mesh() -> "ProcMesh":
     global _debug_proc_mesh
     if _debug_proc_mesh is None:
-        _debug_proc_mesh =
-
-
-        ),
-            requires_loop=False,
-        ).get()
+        _debug_proc_mesh = await _proc_mesh_coro(
+            gpus=1, hosts=1, allocator=LocalAllocator(), init_manager_actors=False
+        )
     return _debug_proc_mesh


@@ -499,10 +464,13 @@ _debug_client_mesh: Optional[DebugClient] = None

 # Lazy init for the same reason as above. This is defined in proc_mesh.py
 # instead of debugger.py for circular import reasons.
-def
+async def _debug_client() -> DebugClient:
     global _debug_client_mesh
     if _debug_client_mesh is None:
-
-
-        )
+        mesh = await _get_debug_proc_mesh()
+        _debug_client_mesh = await mesh._spawn_nonblocking("debug_client", DebugClient)
     return _debug_client_mesh
+
+
+def debug_client() -> DebugClient:
+    return Future(coro=_debug_client()).get()
monarch/_src/actor/shape.py
CHANGED

@@ -31,6 +31,32 @@ def iter_ranks(ranks: Slices) -> Generator[int, None, None]:
     yield from ranks


+class ShapeExt:
+    """Extension methods for Shape that add higher-level
+    functionality."""
+
+    @staticmethod
+    def slice(shape: Shape, **kwargs) -> Shape:
+        """Select along named dimensions. Integer values remove
+        dimensions, slice objects keep dimensions but restrict them.
+
+        Examples: ShapeExt.slice(shape, batch=3, gpu=slice(2, 6))
+        """
+        for label, selector in kwargs.items():
+            if label not in shape.labels:
+                raise TypeError(f"Shape does not have dimension labeled {label!r}")
+            if isinstance(selector, slice):
+                shape = shape.select(label, selector)
+            else:
+                if (
+                    selector < 0
+                    or selector >= shape.ndslice.sizes[shape.labels.index(label)]
+                ):
+                    raise IndexError("index out of range")
+                shape = shape.at(label, selector)
+        return shape
+
+
 class MeshTrait(ABC):
     """
     Mesh interface. Implemented via Shape.

@@ -51,40 +77,13 @@ class MeshTrait(ABC):
     def _new_with_shape(self, shape: Shape) -> Self: ...

     def slice(self, **kwargs) -> Self:
-        """
-
-        """
-        ndslice = self._ndslice
-        labels = self._labels
-        offset = ndslice.offset
-        names = []
-        sizes = []
-        strides = []
-        for name, size, stride in zip(labels, ndslice.sizes, ndslice.strides):
-            if name in kwargs:
-                e = kwargs.pop(name)
-                if isinstance(e, slice):
-                    start, stop, slice_stride = e.indices(size)
-                    offset += start * stride
-                    names.append(name)
-                    sizes.append((stop - start) // slice_stride)
-                    strides.append(slice_stride * stride)
-                else:
-                    if e >= size or e < 0:
-                        raise IndexError("index out of range")
-                    offset += e * stride
-            else:
-                names.append(name)
-                sizes.append(size)
-                strides.append(stride)
-
-        if kwargs:
-            raise TypeError(
-                f"{self} does not have dimension(s) named {tuple(kwargs.keys())}"
-            )
+        """Select along named dimensions. Integer values remove
+        dimensions, slice objects keep dimensions but restrict them.

-
-
+        Examples: mesh.slice(batch=3, gpu=slice(2, 6))
+        """
+        shape = Shape(list(self._labels), self._ndslice)
+        return self._new_with_shape(ShapeExt.slice(shape, **kwargs))

     def split(self, **kwargs) -> Self:
         """
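The open-coded stride arithmetic in MeshTrait.slice is replaced by ShapeExt.slice, which delegates to the Rust-side Shape.at (integer selector: dimension removed) and Shape.select (slice selector: dimension kept but restricted). For a hypothetical mesh labeled ("hosts", "gpus") with sizes (2, 4), the semantics sketch out as:

sub = mesh.slice(hosts=0)              # "hosts" dropped -> labels ("gpus",), sizes (4,)
window = mesh.slice(gpus=slice(1, 3))  # "gpus" kept     -> labels ("hosts", "gpus"), sizes (2, 2)

mesh.slice(batch=0)  # TypeError: Shape does not have dimension labeled 'batch'
mesh.slice(gpus=7)   # IndexError: index out of range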
monarch/_src/tensor_engine/rdma.py
CHANGED

@@ -120,12 +120,15 @@ class RDMABuffer:
                 f"offset + size ({offset + size}) must be <= dst.numel() ({dst.numel()})"
             )

+        local_proc_id = MonarchContext.get().proc_id
+        client = MonarchContext.get().mailbox
+
         async def read_into_nonblocking() -> Optional[int]:
             res = await self._buffer.read_into(
                 addr=addr,
                 size=size,
-                local_proc_id=
-                client=
+                local_proc_id=local_proc_id,
+                client=client,
                 timeout=timeout,
             )
             # TODO - remove this once GPU support is added.

@@ -133,7 +136,7 @@ class RDMABuffer:
             dst_gpu.copy_(dst)
             return res

-        return Future(
+        return Future(coro=read_into_nonblocking())

     def write_from(
         self, src: torch.Tensor, offset: int = 0, timeout: int = 3

@@ -164,12 +167,15 @@ class RDMABuffer:
                 f"size + offset ({size + offset}) must be <= src.numel() ({src.numel()})"
             )

+        local_proc_id = MonarchContext.get().proc_id
+        client = MonarchContext.get().mailbox
+
         async def write_from_nonblocking() -> None:
             res = await self._buffer.write_from(
                 addr=addr,
                 size=size,
-                local_proc_id=
-                client=
+                local_proc_id=local_proc_id,
+                client=client,
                 timeout=timeout,
             )
             # TODO - remove this once GPU support is added.

@@ -177,4 +183,4 @@ class RDMABuffer:
             src_gpu.copy_(src)
             return res

-        return Future(
+        return Future(coro=write_from_nonblocking())
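Besides the Future(coro=...) migration, the notable change is that MonarchContext.get() is now resolved eagerly, before the async closure is built: the coroutine now runs on the tokio runtime rather than the caller's thread, where a context lookup would presumably no longer resolve. The general capture-before-spawn pattern, as a self-contained sketch using contextvars:

import asyncio
import contextvars

current_proc = contextvars.ContextVar("current_proc")

def make_read_task():
    proc_id = current_proc.get()  # capture on the caller's thread, where the var is set

    async def read() -> str:
        # no ContextVar lookup in here, so it is safe to run on another thread/loop
        return f"reading as {proc_id}"

    return read()

current_proc.set("proc-0")
print(asyncio.run(make_read_task()))  # -> "reading as proc-0"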
monarch/mesh_controller.py
CHANGED

@@ -11,6 +11,7 @@ import os
 import pdb  # noqa
 import traceback
 from collections import deque
+from functools import partial
 from logging import Logger
 from typing import (
     Any,

@@ -32,6 +33,7 @@ from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monar
 from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
 from monarch._rust_bindings.monarch_extension.tensor_worker import Ref
 from monarch._rust_bindings.monarch_hyperactor.actor import (
+    MethodSpecifier,
     PythonMessage,
     PythonMessageKind,
     UnflattenArg,

@@ -40,6 +42,7 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
 from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
     ActorId,
 )
+from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask
 from monarch._src.actor.actor_mesh import ActorEndpoint, Port, PortTuple
 from monarch._src.actor.endpoint import Selection
 from monarch._src.actor.shape import NDSlice

@@ -48,7 +51,7 @@ from monarch.common.controller_api import TController
 from monarch.common.function import ResolvableFunction
 from monarch.common.invocation import Seq
 from monarch.common.messages import Referenceable, SendResultOfActorCall
-from monarch.common.stream import StreamRef
+from monarch.common.stream import Stream, StreamRef
 from monarch.common.tensor import dtensor_check, InputChecker, Tensor
 from monarch.common.tree import flatten
 from monarch.tensor_worker_main import _set_trace

@@ -322,9 +325,39 @@ def actor_send(

     client = cast(MeshClient, checker.mesh.client)

-
+    rest = partial(
+        _actor_send,
+        endpoint,
+        args_kwargs_tuple,
+        refs,
+        port,
+        selection,
+        client,
+        checker.mesh,
+        tensors,
+        chosen_stream,
+    )
+    if isinstance(endpoint._name, MethodSpecifier.Init):
+        # Init runs within the tokio loop, but creating a node blocks the loop sending actor messages, so
+        # we offload to a blocking thread
+        PythonTask.spawn_blocking(rest)
+    else:
+        rest()

-
+
+def _actor_send(
+    endpoint: ActorEndpoint,
+    args_kwargs_tuple: bytes,
+    refs: Sequence[Any],
+    port: Optional[Port[Any]],
+    selection: Selection,
+    client: MeshClient,
+    mesh: DeviceMesh,
+    tensors: List[Tensor],
+    chosen_stream: Stream,
+):
+    stream_ref = chosen_stream._to_ref(client)
+    fut = (port, mesh._ndslice) if port is not None else None

     ident = client.new_node([], tensors, cast("OldFuture", fut))

@@ -340,7 +373,7 @@ def actor_send(
         endpoint, selection, client, ident, args_kwargs_tuple, refs
     )
     worker_msg = SendResultOfActorCall(ident, broker_id, tensors, [], stream_ref)
-    client.send(
+    client.send(mesh._ndslice, worker_msg)
     # we have to ask for status updates
     # from workers to be sure they have finished
     # enough work to count this future as finished,
monarch/monarch_controller
CHANGED
Binary file