PyPI - torchmonarch-nightly - Versions diffs - 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.12__cp310-cp310-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.12__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

monarch/_monarch/hyperactor/__init__.py CHANGED Viewed

@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 # pyre-strict
-import abc
 from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage
@@ -29,21 +28,6 @@ from monarch._rust_bindings.monarch_hyperactor.shape import (  # @manual=//monar
     Shape,
 )
-class Actor(abc.ABC):
-    @abc.abstractmethod
-    async def handle(self, mailbox: Mailbox, message: PythonMessage) -> None: ...
-    async def handle_cast(
-        self,
-        mailbox: Mailbox,
-        rank: int,
-        coordinates: list[tuple[str, int]],
-        message: PythonMessage,
-    ) -> None:
-        await self.handle(mailbox, message)
 __all__ = [
     "init_proc",
     "Actor",

monarch/_rust_bindings.so CHANGED Viewed

Binary file

monarch/actor_mesh.py CHANGED Viewed

@@ -4,9 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-unsafe
 import asyncio
 import collections
 import contextvars
+import functools
 import inspect
 import itertools
@@ -19,6 +22,7 @@ from traceback import extract_tb, StackSummary
 from typing import (
     Any,
     AsyncGenerator,
+    Awaitable,
     Callable,
     cast,
     Concatenate,
@@ -38,6 +42,7 @@ from typing import (
 import monarch
 from monarch import ActorFuture as Future
+from monarch._rust_bindings.hyperactor_extension.telemetry import enter_span, exit_span
 from monarch._rust_bindings.monarch_hyperactor.actor import PanicFlag, PythonMessage
 from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
@@ -49,10 +54,11 @@ from monarch._rust_bindings.monarch_hyperactor.mailbox import (
 )
 from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
 from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
 from monarch.common.pickle_flatten import flatten, unflatten
 from monarch.common.shape import MeshTrait, NDSlice
-logger = logging.getLogger(__name__)
+logger: logging.Logger = logging.getLogger(__name__)
 Allocator = monarch.ProcessAllocator | monarch.LocalAllocator
@@ -89,7 +95,7 @@ _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
 # this was implemented in python 3.12 as an argument to task
 # but I have to backport to 3.10/3.11.
-def create_eager_task(coro: Coroutine[Any, None, Any]) -> asyncio.Future:
+def create_eager_task(coro: Awaitable[None]) -> asyncio.Future:
     iter = coro.__await__()
     try:
         first_yield = next(iter)
@@ -232,7 +238,7 @@ class Endpoint(Generic[P, R]):
         self,
         actor_mesh_ref: _ActorMeshRefImpl,
         name: str,
-        impl: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]],
+        impl: Callable[Concatenate[Any, P], Awaitable[R]],
         mailbox: Mailbox,
     ) -> None:
         self._actor_mesh = actor_mesh_ref
@@ -264,14 +270,16 @@ class Endpoint(Generic[P, R]):
         return self.choose(*args, **kwargs)
     def call(self, *args: P.args, **kwargs: P.kwargs) -> "Future[ValueMesh[R]]":
+        p: PortId
+        r: PortReceiver[R]
         p, r = port(self)
         # pyre-ignore
         send(self, args, kwargs, port=p, rank_in_response=True)
-        async def process():
-            results = [None] * len(self._actor_mesh)
+        async def process() -> ValueMesh[R]:
+            results: List[R] = [None] * len(self._actor_mesh)  # pyre-fixme[9]
             for _ in range(len(self._actor_mesh)):
-                rank, value = await r.recv()
+                rank, value = await r.recv()  # pyre-fixme[23]
                 results[rank] = value
             call_shape = Shape(
                 self._actor_mesh._shape.labels,
@@ -309,15 +317,15 @@ class Endpoint(Generic[P, R]):
 class Accumulator(Generic[P, R, A]):
     def __init__(
         self, endpoint: Endpoint[P, R], identity: A, combine: Callable[[A, R], A]
-    ):
-        self._endpoint = endpoint
-        self._identity = identity
-        self._combine = combine
+    ) -> None:
+        self._endpoint: Endpoint[P, R] = endpoint
+        self._identity: A = identity
+        self._combine: Callable[[A, R], A] = combine
     def accumulate(self, *args: P.args, **kwargs: P.kwargs) -> "Future[A]":
-        gen = self._endpoint.stream(*args, **kwargs)
+        gen: AsyncGenerator[R, R] = self._endpoint.stream(*args, **kwargs)
-        async def impl():
+        async def impl() -> A:
             value = self._identity
             async for x in gen:
                 value = self._combine(value, x)
@@ -334,7 +342,7 @@ class ValueMesh(MeshTrait, Generic[R]):
     def _new_with_shape(self, shape: Shape) -> "ValueMesh[R]":
         return ValueMesh(shape, self._values)
-    def item(self, **kwargs):
+    def item(self, **kwargs) -> R:
         coordinates = [kwargs.pop(label) for label in self._labels]
         if kwargs:
             raise KeyError(f"item has extra dimensions: {list(kwargs.keys())}")
@@ -345,7 +353,7 @@ class ValueMesh(MeshTrait, Generic[R]):
         for rank in self._shape.ranks():
             yield Point(rank, self._shape), self._values[rank]
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self._shape)
     @property
@@ -378,7 +386,7 @@ def send(
 class EndpointProperty(Generic[P, R]):
-    def __init__(self, method: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]]):
+    def __init__(self, method: Callable[Concatenate[Any, P], Awaitable[R]]) -> None:
         self._method = method
     def __get__(self, instance, owner) -> Endpoint[P, R]:
@@ -389,7 +397,7 @@ class EndpointProperty(Generic[P, R]):
 def endpoint(
-    method: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]],
+    method: Callable[Concatenate[Any, P], Awaitable[R]],
 ) -> EndpointProperty[P, R]:
     return EndpointProperty(method)
@@ -412,7 +420,9 @@ class Port:
 # advanced lower-level API for sending messages. This is intentionally
 # not part of the Endpoint API because the way it accepts arguments
 # and handles concerns is different.
-def port(endpoint: Endpoint[P, R], once=False) -> Tuple["PortId", "PortReceiver[R]"]:
+def port(
+    endpoint: Endpoint[P, R], once: bool = False
+) -> Tuple["PortId", "PortReceiver[R]"]:
     handle, receiver = (
         endpoint._mailbox.open_once_port() if once else endpoint._mailbox.open_port()
     )
@@ -425,9 +435,9 @@ class PortReceiver(Generic[R]):
         self,
         mailbox: Mailbox,
         receiver: HyPortReceiver | OncePortReceiver,
-    ):
-        self._mailbox = mailbox
-        self._receiver = receiver
+    ) -> None:
+        self._mailbox: Mailbox = mailbox
+        self._receiver: HyPortReceiver | OncePortReceiver = receiver
     async def _recv(self) -> R:
         return self._process(await self._receiver.recv())
@@ -435,7 +445,7 @@ class PortReceiver(Generic[R]):
     def _blocking_recv(self) -> R:
         return self._process(self._receiver.blocking_recv())
-    def _process(self, msg: PythonMessage):
+    def _process(self, msg: PythonMessage) -> R:
         # TODO: Try to do something more structured than a cast here
         payload = cast(R, _unpickle(msg.message, self._mailbox))
         if msg.method == "result":
@@ -482,7 +492,9 @@ class _Actor:
             else None
         )
         try:
-            ctx = MonarchContext(mailbox, mailbox.actor_id.proc_id, Point(rank, shape))
+            ctx: MonarchContext = MonarchContext(
+                mailbox, mailbox.actor_id.proc_id, Point(rank, shape)
+            )
             _context.set(ctx)
             args, kwargs = _unpickle(message.message, mailbox)
@@ -492,13 +504,29 @@ class _Actor:
                 return None
             else:
                 the_method = getattr(self.instance, message.method)._method
-                result = the_method(self.instance, *args, **kwargs)
                 if not inspect.iscoroutinefunction(the_method):
+                    enter_span(
+                        the_method.__module__, message.method, str(ctx.mailbox.actor_id)
+                    )
+                    result = the_method(self.instance, *args, **kwargs)
+                    exit_span()
                     if port is not None:
                         port.send("result", result)
                     return None
-                return self.run_async(ctx, self.run_task(port, result, panic_flag))
+                async def instrumented():
+                    enter_span(
+                        the_method.__module__, message.method, str(ctx.mailbox.actor_id)
+                    )
+                    result = await the_method(self.instance, *args, **kwargs)
+                    exit_span()
+                    return result
+                return self.run_async(
+                    ctx,
+                    self.run_task(port, instrumented(), panic_flag),
+                )
         except Exception as e:
             traceback.print_exc()
             s = ActorError(e)
@@ -510,13 +538,22 @@ class _Actor:
             else:
                 raise s from None
-    async def run_async(self, ctx, coroutine):
+    async def run_async(
+        self,
+        ctx: MonarchContext,
+        coroutine: Awaitable[None],
+    ) -> None:
         _context.set(ctx)
         if self.complete_task is None:
             self.complete_task = asyncio.create_task(self._complete())
         await self.active_requests.put(create_eager_task(coroutine))
-    async def run_task(self, port, coroutine, panic_flag):
+    async def run_task(
+        self,
+        port: Port | None,
+        coroutine: Awaitable[Any],
+        panic_flag: PanicFlag,
+    ) -> None:
         try:
             result = await coroutine
             if port is not None:
@@ -564,6 +601,12 @@ def _unpickle(data: bytes, mailbox: Mailbox) -> Any:
 class Actor(MeshTrait):
+    @functools.cached_property
+    def logger(cls) -> logging.Logger:
+        lgr = logging.getLogger(cls.__class__.__name__)
+        lgr.setLevel(logging.DEBUG)
+        return lgr
     @property
     def _ndslice(self) -> NDSlice:
         raise NotImplementedError(
@@ -586,10 +629,10 @@ class ActorMeshRef(MeshTrait):
     def __init__(
         self, Class: Type[T], actor_mesh_ref: _ActorMeshRefImpl, mailbox: Mailbox
     ) -> None:
-        self.__name__ = Class.__name__
-        self._class = Class
-        self._actor_mesh_ref = actor_mesh_ref
-        self._mailbox = mailbox
+        self.__name__: str = Class.__name__
+        self._class: Type[T] = Class
+        self._actor_mesh_ref: _ActorMeshRefImpl = actor_mesh_ref
+        self._mailbox: Mailbox = mailbox
         for attr_name in dir(self._class):
             attr_value = getattr(self._class, attr_name, None)
             if isinstance(attr_value, EndpointProperty):
@@ -630,7 +673,11 @@ class ActorMeshRef(MeshTrait):
             f"'{self.__class__.__name__}' object has no attribute '{name}'"
         )
-    def _create(self, args: Iterable[Any], kwargs: Dict[str, Any]) -> None:
+    def _create(
+        self,
+        args: Iterable[Any],
+        kwargs: Dict[str, Any],
+    ) -> None:
         async def null_func(*_args: Iterable[Any], **_kwargs: Dict[str, Any]) -> None:
             return None

monarch/allocator.py CHANGED Viewed

@@ -4,6 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-strict
+import abc
 from typing import final
 from monarch import ActorFuture as Future
@@ -15,6 +18,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//mon
 from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     LocalAllocatorBase,
     ProcessAllocatorBase,
+    RemoteAllocatorBase,
 )
@@ -60,3 +64,66 @@ class LocalAllocator(LocalAllocatorBase):
             lambda: self.allocate_nonblocking(spec),
             lambda: self.allocate_blocking(spec),
         )
+class RemoteAllocInitializer(abc.ABC):
+    """Subclass-able Python interface for `hyperactor_mesh::alloc::remoteprocess:RemoteProcessAllocInitializer`.
+    NOTE: changes to method signatures of this class must be made to the call-site at
+    `PyRemoteProcessAllocInitializer.py_initialize_alloc()` in `monarch/monarch_hyperactor/src/alloc.rs`
+    """
+    @abc.abstractmethod
+    async def initialize_alloc(self) -> list[str]:
+        """
+        Return the addresses of the servers that should be used to allocate processes
+        for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
+        Each address is of the form `{transport}!{addr}(:{port})`.
+        This is the string form of `hyperactor::channel::ChannelAddr` (Rust).
+        For example, `tcp!127.0.0.1:1234`.
+        NOTE: Currently, all the addresses must have the same transport type and port
+        NOTE: Although this method is currently called once at the initialization of the Allocator,
+            in the future this method can be called multiple times and should return the current set of
+            addresses that are eligible to handle allocation requests.
+        """
+        ...
+class StaticRemoteAllocInitializer(RemoteAllocInitializer):
+    """
+    Returns the static list of server addresses that this initializer
+    was constructed with on each `initialize_alloc()` call.
+    """
+    def __init__(self, *addrs: str) -> None:
+        super().__init__()
+        self.addrs: list[str] = list(addrs)
+    async def initialize_alloc(self) -> list[str]:
+        return list(self.addrs)
+@final
+class RemoteAllocator(RemoteAllocatorBase):
+    """
+    An allocator that allocates by spawning actors on a remote host.
+    The remote host must be running hyperactor's remote-process-allocator.
+    """
+    def allocate(self, spec: AllocSpec) -> Future[Alloc]:
+        """
+        Allocate a process according to the provided spec.
+        Arguments:
+        - `spec`: The spec to allocate according to.
+        Returns:
+        - A future that will be fulfilled when the requested allocation is fulfilled.
+        """
+        return Future(
+            lambda: self.allocate_nonblocking(spec),
+            lambda: self.allocate_blocking(spec),
+        )

monarch/bootstrap_main.py CHANGED Viewed

@@ -58,7 +58,7 @@ def invoke_main():
     # forward logs to rust tracing. Defaults to on.
     if os.environ.get("MONARCH_PYTHON_LOG_TRACING", "1") == "1":
-        logging.root.addHandler(TracingForwarder())
+        logging.root.addHandler(TracingForwarder(level=logging.DEBUG))
     try:
         with (

monarch/common/client.py CHANGED Viewed

@@ -302,7 +302,7 @@ class Client:
         self.last_processed_seq = max(self.last_processed_seq, seq)
         if error is not None:
-            logging.error("Received error for seq %s: %s", seq, error)
+            logging.info("Received error for seq %s: %s", seq, error)
             # We should not have set result if we have an error.
             assert result is None
             if not isinstance(error, RemoteException):
@@ -332,9 +332,7 @@ class Client:
         elif error is not None:
             # errors get reported as results even if they
             # do not have futures attached.
-            logger.warning(
-                f"Error encountered for this instruction {seq}. Proceeding forward because error is unused and unhandled. Error details:\n{error}."
-            )
+            pass
         # We can safely delete the seq as tracebacks have been saved to the remote failure itself.
         del self.pending_results[seq]

monarch/common/stream.py CHANGED Viewed

@@ -82,6 +82,9 @@ class StreamRef(Referenceable):
             messages.CreateStream(self, self.default),
         )
+    def __repr__(self):
+        return f"<StreamRef {repr(self.name)} {self.ref}>"
     def delete_ref(self, ref):
         client = self.client()
         if client is not None and not client._shutdown:

monarch/mesh_controller.py CHANGED Viewed

@@ -158,7 +158,6 @@ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
             traceback.FrameSummary("<unknown>", None, frame)
             for frame in exc.backtrace.split("\\n")
         ]
-        logger.error(f"Worker {exc.actor_id} failed")
         return MessageResult(
             seq=result.seq,
             result=None,
@@ -169,7 +168,7 @@ def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
                 controller_frames=None,
                 worker_frames=worker_frames,
                 source_actor_id=exc.actor_id,
-                message=f"Worker {exc.actor_id} failed",
+                message=f"Remote function in {exc.actor_id} errored.",
             ),
         )
     elif isinstance(exc, client.Failure):

monarch/monarch_controller CHANGED Viewed

Binary file

monarch/proc_mesh.py CHANGED Viewed

@@ -4,9 +4,11 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-strict
 import sys
-from typing import Any, cast, Optional, Type, TypeVar
+from typing import Any, cast, List, Optional, Type, TypeVar
 import monarch
 from monarch import ActorFuture as Future
@@ -18,7 +20,7 @@ from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//mon
 )
 from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
 from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
-from monarch._rust_bindings.monarch_hyperactor.shape import Shape
+from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
 from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
 from monarch.common._device_utils import _local_device_count
@@ -46,14 +48,16 @@ class ProcMesh(MeshTrait):
     def __init__(self, hy_proc_mesh: HyProcMesh) -> None:
         self._proc_mesh = hy_proc_mesh
         self._mailbox: Mailbox = self._proc_mesh.client
-        self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
+        self._rdma_manager: RDMAManager = self._spawn_blocking(
+            "rdma_manager", RDMAManager
+        )
     @property
-    def _ndslice(self):
+    def _ndslice(self) -> Slice:
         return self._proc_mesh.shape.ndslice
     @property
-    def _labels(self):
+    def _labels(self) -> List[str]:
         return self._proc_mesh.shape.labels
     def _new_with_shape(self, shape: Shape) -> "ProcMesh":

tests/test_allocator.py ADDED Viewed

@@ -0,0 +1,216 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+import contextlib
+import importlib.resources
+import math
+import os
+import subprocess
+import sys
+import unittest
+from datetime import timedelta
+from typing import Generator
+import cloudpickle
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from monarch._rust_bindings.hyperactor_extension.alloc import (
+    AllocConstraints,
+    AllocSpec,
+)
+from monarch._rust_bindings.monarch_hyperactor.channel import (
+    ChannelAddr,
+    ChannelTransport,
+)
+from monarch.actor_mesh import Actor, current_rank, current_size, endpoint, ValueMesh
+from monarch.allocator import RemoteAllocator, StaticRemoteAllocInitializer
+from monarch.proc_mesh import ProcMesh
+from torch.distributed.elastic.utils.distributed import get_free_port
+_100_MILLISECONDS = timedelta(milliseconds=100)
+class TestActor(Actor):
+    """Silly actor that computes the world size by all-reducing rank-hot tensors"""
+    def __init__(self) -> None:
+        self.rank: int = current_rank().rank
+        self.world_size: int = math.prod(current_size().values())
+    @endpoint
+    async def compute_world_size(self, master_addr: str, master_port: int) -> int:
+        os.environ["MASTER_ADDR"] = master_addr
+        os.environ["MASTER_PORT"] = str(master_port)
+        dist.init_process_group("gloo", rank=self.rank, world_size=self.world_size)
+        try:
+            t = F.one_hot(torch.tensor(self.rank), num_classes=dist.get_world_size())
+            dist.all_reduce(t)
+            return int(torch.sum(t).item())
+        finally:
+            dist.destroy_process_group()
+@contextlib.contextmanager
+def remote_process_allocator() -> Generator[str, None, None]:
+    with importlib.resources.path(__package__, "") as package_path:
+        addr = ChannelAddr.any(ChannelTransport.Unix)
+        process_allocator = subprocess.Popen(
+            args=[
+                "process_allocator",
+                f"--addr={addr}",
+            ],
+            env={
+                # Prefix PATH with this test module's directory to
+                # give the 'process_allocator' and 'monarch_bootstrap' binary resources
+                # in this test module's directory precedence over the installed ones.
+                # This is useful in BUCK, where these binaries are added as 'resources' of this test target.
+                "PATH": f"{package_path}:{os.getenv('PATH', '')}",
+                "RUST_LOG": "debug",
+            },
+        )
+        try:
+            yield addr
+        finally:
+            process_allocator.terminate()
+            try:
+                five_seconds = 5
+                process_allocator.wait(timeout=five_seconds)
+            except subprocess.TimeoutExpired:
+                process_allocator.kill()
+class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        cloudpickle.register_pickle_by_value(sys.modules[TestActor.__module__])
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cloudpickle.unregister_pickle_by_value(sys.modules[TestActor.__module__])
+    def assert_computed_world_size(
+        self, computed: ValueMesh[int], expected_world_size: int
+    ) -> None:
+        expected_world_sizes = {
+            rank: expected_world_size for rank in range(0, expected_world_size)
+        }
+        computed_world_sizes = {p.rank: v for p, v in list(computed.flatten("rank"))}
+        self.assertDictEqual(expected_world_sizes, computed_world_sizes)
+    async def test_call_allocate_twice(self) -> None:
+        class DeletingAllocInitializer(StaticRemoteAllocInitializer):
+            """test initializer that removes the last address from the list each time initialize_alloc() is called
+            used to test that the state of the initializer is preserved across calls to allocate()
+            """
+            async def initialize_alloc(self) -> list[str]:
+                alloc = await super().initialize_alloc()
+                self.addrs.pop(-1)
+                return alloc
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            initializer = DeletingAllocInitializer(host1, host2)
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=initializer,
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec = AllocSpec(AllocConstraints(), host=1, gpu=1)
+            await allocator.allocate(spec)
+            self.assertEqual([host1], initializer.addrs)
+            await allocator.allocate(spec)
+            self.assertEqual([], initializer.addrs)
+    async def test_throws_when_initializer_returns_empty_addrs(self) -> None:
+        class EmptyAllocInitializer(StaticRemoteAllocInitializer):
+            """test initializer that returns an empty list of addresses"""
+            async def initialize_alloc(self) -> list[str]:
+                return []
+        empty_initializer = EmptyAllocInitializer()
+        with self.assertRaisesRegex(
+            RuntimeError, r"initializer must return non-empty list of addresses"
+        ):
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=empty_initializer,
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            await allocator.allocate(AllocSpec(AllocConstraints(), host=1, gpu=1))
+    async def test_allocate_2d_mesh(self) -> None:
+        hosts = 2
+        gpus = 4
+        world_size = hosts * gpus
+        spec = AllocSpec(AllocConstraints(), host=hosts, gpu=gpus)
+        # create 2x process-allocators (on their own bind addresses) to simulate 2 hosts
+        with remote_process_allocator() as host1, remote_process_allocator() as host2:
+            allocator = RemoteAllocator(
+                world_id="test_remote_allocator",
+                initializer=StaticRemoteAllocInitializer(host1, host2),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            alloc = await allocator.allocate(spec)
+            proc_mesh = await ProcMesh.from_alloc(alloc)
+            actor = await proc_mesh.spawn("test_actor", TestActor)
+            values = await actor.compute_world_size.call(
+                master_addr="0.0.0.0",
+                master_port=get_free_port(),
+            )
+            self.assert_computed_world_size(values, world_size)
+    async def test_stacked_1d_meshes(self) -> None:
+        # create two stacked actor meshes on the same host
+        # each actor mesh running on separate process-allocators
+        with remote_process_allocator() as host1_a, remote_process_allocator() as host1_b:
+            allocator_a = RemoteAllocator(
+                world_id="a",
+                initializer=StaticRemoteAllocInitializer(host1_a),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            allocator_b = RemoteAllocator(
+                world_id="b",
+                initializer=StaticRemoteAllocInitializer(host1_b),
+                heartbeat_interval=_100_MILLISECONDS,
+            )
+            spec_a = AllocSpec(AllocConstraints(), host=1, gpu=2)
+            spec_b = AllocSpec(AllocConstraints(), host=1, gpu=6)
+            proc_mesh_a = await ProcMesh.from_alloc(await allocator_a.allocate(spec_a))
+            proc_mesh_b = await ProcMesh.from_alloc(await allocator_b.allocate(spec_b))
+            actor_a = await proc_mesh_a.spawn("actor_a", TestActor)
+            actor_b = await proc_mesh_b.spawn("actor_b", TestActor)
+            results_a = await actor_a.compute_world_size.call(
+                master_addr="0.0.0.0", master_port=get_free_port()
+            )
+            results_b = await actor_b.compute_world_size.call(
+                master_addr="0.0.0.0", master_port=get_free_port()
+            )
+            self.assert_computed_world_size(results_a, 2)  # a is a 1x2 mesh
+            self.assert_computed_world_size(results_b, 6)  # b is a 1x6 mesh

{torchmonarch_nightly-2025.6.10.dist-info → torchmonarch_nightly-2025.6.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.6.10
+Version: 2025.6.12
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com

{torchmonarch_nightly-2025.6.10.dist-info → torchmonarch_nightly-2025.6.12.dist-info}/RECORD RENAMED Viewed

@@ -1,20 +1,20 @@
 monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
-monarch/_rust_bindings.so,sha256=0-svsKnUJboaOBd5i-LOfpHiRRAgVLX_1Hq_YYREQi8,39756680
+monarch/_rust_bindings.so,sha256=VPU8MhCnz10umRwSqv99QvwFkr2q0N0DiOTpZ37Ecl0,40645344
 monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
-monarch/actor_mesh.py,sha256=AKdjPg3FM6Yt35uFPBnP7fNVEu6busu5BXVWLwjU2A4,23000
-monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
-monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
+monarch/actor_mesh.py,sha256=ovi5RBxobGEcg7zKkzhRc83n82KOD6ermhuloHKbuFs,24420
+monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
+monarch/bootstrap_main.py,sha256=EYaTMA1lxy2213L_04drTKlJvZQjzNdD3jeUHiqSBJc,2578
 monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
 monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
 monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
 monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
-monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
-monarch/monarch_controller,sha256=Q1eR_EVJqDQLrJZ_6p1ldxVDAU1OmN5lSSuctDcaAFY,20396832
+monarch/mesh_controller.py,sha256=Rr4VNUNN0pJdThbPmbCoaPWid4QpTNHya9xYpmjTkW0,8575
+monarch/monarch_controller,sha256=MECcriPRnSdI_NpAG6y-GiK2-DqnDsLBfyOHVdqewRU,20397992
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
 monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
-monarch/proc_mesh.py,sha256=pVN0BLnjGaty6-UGn1U81rNdmfiDvD4gO1c4bISHtqs,6807
+monarch/proc_mesh.py,sha256=xoaReM9Ab9TWkesxedWSyyk4TMD0HLV88dQ8CQcbqTI,6892
 monarch/profiler.py,sha256=TQ9fnVM8H7smBWtYdB_6Irtzz8DBOmcp7U1T3wlUmco,4911
 monarch/python_local_mesh.py,sha256=YsureIzR9uGlNVrKd4vRghxOXBeYabkt9lICRErfRAI,3536
 monarch/random.py,sha256=f9QR7Esu4Vxqxs-KCf5QYyVqlWvXJ3-UtG90L_h4j40,1527
@@ -27,7 +27,7 @@ monarch/tensor_worker_main.py,sha256=Nbarl2sJKIddLeaRFsaUnqOerLHjzggUr9SqCr2_GYI
 monarch/tensorboard.py,sha256=MnLgH5lbqeUJauEuirEgR6L_qYl2NGdtwZOWIAuOZao,2587
 monarch/world_mesh.py,sha256=GqZpFoVNJPxYa70rLYgv0vu8Vg1nXqx_GYERRb1E9Pc,975
 monarch/_monarch/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
-monarch/_monarch/hyperactor/__init__.py,sha256=H-9w80ejck1lBVfpqOLikT-mPLMLpi7ZZfqrmprMxL0,1748
+monarch/_monarch/hyperactor/__init__.py,sha256=JLnB2_-bKHLqAcZwehKvPkbwbxF-gCq5LODJiWGU_b8,1384
 monarch/_monarch/selection/__init__.py,sha256=47arOElvlK0uYcTNrd__1BwXSfsMosnVw4_tgu2hA-I,381
 monarch/_monarch/worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 monarch/_monarch/worker/debugger.py,sha256=JJZwRPTgQO2emz-hrMelkOSxJFIR3dV4ZA6e7ftYUKA,3614
@@ -43,7 +43,7 @@ monarch/common/_device_utils.py,sha256=gBpl23wMjppVAEzzj8U9HyX-B7Bs2_3ftiMAkzUS4
 monarch/common/_tensor_to_table.py,sha256=yRjCNwvtl188Z1Dwkx3ZU-Bh2mwYnQ0Lnue2RAztwvc,5753
 monarch/common/base_tensor.py,sha256=ujRzR6lWaeCdPv2JX0vCR-VsCWn-3SHaJIkZH1Sw9FQ,1159
 monarch/common/borrows.py,sha256=7KR62xoUat1T6FyADsdHsxVAVIJDvfJWUnPO-xx277U,5307
-monarch/common/client.py,sha256=wOAnoaLmabrcv7mK_z_HVnk_ivGe5igPy3iWZI4LVZc,24517
+monarch/common/client.py,sha256=BaBhOzQaNsqTa-BGy7_IknQxpnpK0j4C5QsNyFHZHW4,24343
 monarch/common/constants.py,sha256=ohvsVYMpfeWopv3KXDAeHWDFLukwc-OY37VRxpKNBE8,300
 monarch/common/context_manager.py,sha256=GOeyaFbyCqvQmkJ0oI7q6IxRd8_0mVyYKZRccI8iaug,1067
 monarch/common/controller_api.py,sha256=djGkK5aSd-V6pBkr3uBCXbfJv3OKf2o2VbBXJgFF2WI,3202
@@ -65,7 +65,7 @@ monarch/common/reference.py,sha256=O26lkzEeVwj0S1xEy-OLqdHVnACmmlbQCUmXRrW4n1Q,9
 monarch/common/remote.py,sha256=qZWXkShX20l07TseQSpVECh2yXZaVKYUvQXkeEM-zvY,9220
 monarch/common/selection.py,sha256=lpWFbZs3ArYy29e-53eoAVAjQFksf1RvZz9NvM0CUW4,308
 monarch/common/shape.py,sha256=k6-0S0U19PmrfP62SMb9Ihx6_I4QQFUGErloZn8GcZ0,8144
-monarch/common/stream.py,sha256=J9UCqhSXSbKYFGtbKaqAq1Vgmg6DJcLzsXXm-tsBQ-w,3499
+monarch/common/stream.py,sha256=_ejoxafHtdD10lLzznRCXKwrkZ_ZH9k_VTgiA5yfBrI,3583
 monarch/common/tensor.py,sha256=mSXiHoD0Up4m2RLdQcsbesaz2N4QCFS34UNNX3Dbldk,28842
 monarch/common/tensor_factory.py,sha256=qm8NZx-5ezMAFjNLiXQvb66okm5XgdboB_GRarGOdN0,801
 monarch/common/tree.py,sha256=1DG3siiE7ixBV6v5cwN8RT_17aJhYZTE-L3i7wZe2_c,2282
@@ -132,6 +132,7 @@ tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,48
 tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
 tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
 tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
+tests/test_allocator.py,sha256=P11sQ95ADjzC_-CfPs3CEP80nP8sn7wW8vVPsmpSVoM,8164
 tests/test_coalescing.py,sha256=-KtAWzTaeXbyzltplfojavx0iFeeZnvej-tFTlu2p5k,15616
 tests/test_controller.py,sha256=yxuVp2DG3TDKJlwuE3cFm9dbWMlbrYtG1uHfvVWRYbw,30935
 tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -150,9 +151,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.6.10.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
-torchmonarch_nightly-2025.6.10.dist-info/METADATA,sha256=DR1GtSFqtqsjhKWi38uGcvhw2p3ycHYSOwDmsErwLj0,2772
-torchmonarch_nightly-2025.6.10.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
-torchmonarch_nightly-2025.6.10.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
-torchmonarch_nightly-2025.6.10.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
-torchmonarch_nightly-2025.6.10.dist-info/RECORD,,
+torchmonarch_nightly-2025.6.12.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.6.12.dist-info/METADATA,sha256=mBsDu66W3vkM2SdaxX7hw8_B6kl_XgQZT7nQKZhVkMk,2772
+torchmonarch_nightly-2025.6.12.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+torchmonarch_nightly-2025.6.12.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+torchmonarch_nightly-2025.6.12.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.6.12.dist-info/RECORD,,

{torchmonarch_nightly-2025.6.10.dist-info → torchmonarch_nightly-2025.6.12.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.10.dist-info → torchmonarch_nightly-2025.6.12.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.10.dist-info → torchmonarch_nightly-2025.6.12.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.10.dist-info → torchmonarch_nightly-2025.6.12.dist-info}/top_level.txt RENAMED Viewed

File without changes