torchmonarch-nightly 2025.6.17__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.19__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
monarch/actor_mesh.py CHANGED
@@ -6,7 +6,6 @@

  # pyre-unsafe

- import asyncio
  import collections
  import contextvars
  import functools
@@ -27,9 +26,7 @@ from typing import (
  Callable,
  cast,
  Concatenate,
- Coroutine,
  Dict,
- Generator,
  Generic,
  Iterable,
  List,
@@ -51,8 +48,9 @@ from monarch._rust_bindings.monarch_hyperactor.actor_mesh import PythonActorMesh
  from monarch._rust_bindings.monarch_hyperactor.mailbox import (
  Mailbox,
  OncePortReceiver,
- PortId,
+ OncePortRef,
  PortReceiver as HyPortReceiver,
+ PortRef,
  )
  from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
  from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
@@ -99,39 +97,6 @@ _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
  )


- # this was implemented in python 3.12 as an argument to task
- # but I have to backport to 3.10/3.11.
- def create_eager_task(coro: Awaitable[None]) -> asyncio.Future:
- iter = coro.__await__()
- try:
- first_yield = next(iter)
- return asyncio.create_task(RestOfCoroutine(first_yield, iter).run())
- except StopIteration as e:
- t = asyncio.Future()
- t.set_result(e.value)
- return t
-
-
- class RestOfCoroutine(Generic[T1, T2]):
- def __init__(self, first_yield: T1, iter: Generator[T2, None, T2]) -> None:
- self.first_yield: T1 | None = first_yield
- self.iter: Generator[T2, None, T2] = iter
-
- def __await__(self) -> Generator[T1, None, T1] | Generator[T2, None, T2]:
- first_yield = self.first_yield
- assert first_yield is not None
- yield first_yield
- self.first_yield = None
- while True:
- try:
- yield next(self.iter)
- except StopIteration as e:
- return e.value
-
- async def run(self) -> T1 | T2:
- return await self
-
-
  T = TypeVar("T")
  P = ParamSpec("P")
  R = TypeVar("R")
@@ -263,6 +228,8 @@ class Endpoint(Generic[P, R]):

  Load balanced RPC-style entrypoint for request/response messaging.
  """
+ p: Port[R]
+ r: PortReceiver[R]
  p, r = port(self, once=True)
  # pyre-ignore
  send(self, args, kwargs, port=p, selection="choose")
@@ -285,7 +252,18 @@ class Endpoint(Generic[P, R]):
  async def process() -> ValueMesh[R]:
  results: List[R] = [None] * len(self._actor_mesh) # pyre-fixme[9]
  for _ in range(len(self._actor_mesh)):
- rank, value = await r.recv() # pyre-fixme[23]
+ rank, value = await r.recv()
+ results[rank] = value
+ call_shape = Shape(
+ self._actor_mesh._shape.labels,
+ NDSlice.new_row_major(self._actor_mesh._shape.ndslice.sizes),
+ )
+ return ValueMesh(call_shape, results)
+
+ def process_blocking() -> ValueMesh[R]:
+ results: List[R] = [None] * len(self._actor_mesh) # pyre-fixme[9]
+ for _ in range(len(self._actor_mesh)):
+ rank, value = r.recv().get()
  results[rank] = value
  call_shape = Shape(
  self._actor_mesh._shape.labels,
@@ -293,7 +271,7 @@ class Endpoint(Generic[P, R]):
  )
  return ValueMesh(call_shape, results)

- return Future(process)
+ return Future(process, process_blocking)

  async def stream(self, *args: P.args, **kwargs: P.kwargs) -> AsyncGenerator[R, R]:
  """
@@ -362,6 +340,9 @@ class ValueMesh(MeshTrait, Generic[R]):
  def __len__(self) -> int:
  return len(self._shape)

+ def __repr__(self) -> str:
+ return f"ValueMesh({self._shape})"
+
  @property
  def _ndslice(self) -> NDSlice:
  return self._shape.ndslice
@@ -387,7 +368,7 @@ def send(
  message = PythonMessage(
  endpoint._name,
  _pickle((args, kwargs)),
- None if port is None else port._port,
+ None if port is None else port._port_ref,
  None,
  )
  endpoint._actor_mesh.cast(message, selection)
@@ -411,14 +392,16 @@ def endpoint(


  class Port(Generic[R]):
- def __init__(self, port: PortId, mailbox: Mailbox, rank: Optional[int]) -> None:
- self._port = port
+ def __init__(
+ self, port_ref: PortRef | OncePortRef, mailbox: Mailbox, rank: Optional[int]
+ ) -> None:
+ self._port_ref = port_ref
  self._mailbox = mailbox
  self._rank = rank

  def send(self, method: str, obj: R) -> None:
- self._mailbox.post(
- self._port,
+ self._port_ref.send(
+ self._mailbox,
  PythonMessage(method, _pickle(obj), None, self._rank),
  )

@@ -432,8 +415,8 @@ def port(
  handle, receiver = (
  endpoint._mailbox.open_once_port() if once else endpoint._mailbox.open_port()
  )
- port_id: PortId = handle.bind()
- return Port(port_id, endpoint._mailbox, rank=None), PortReceiver(
+ port_ref: PortRef | OncePortRef = handle.bind()
+ return Port(port_ref, endpoint._mailbox, rank=None), PortReceiver(
  endpoint._mailbox, receiver
  )

@@ -485,24 +468,36 @@ singleton_shape = Shape([], NDSlice(offset=0, sizes=[], strides=[]))


  class _Actor:
+ """
+ This is the message handling implementation of a Python actor.
+
+ The layering goes:
+ Rust `PythonActor` -> `_Actor` -> user-provided `Actor` instance
+
+ Messages are received from the Rust backend, and forwarded to the `handle`
+ methods on this class.
+
+ This class wraps the actual `Actor` instance provided by the user, and
+ routes messages to it, managing argument serialization/deserialization and
+ error handling.
+ """
+
  def __init__(self) -> None:
  self.instance: object | None = None
- self.active_requests: asyncio.Queue[asyncio.Future[object]] = asyncio.Queue()
- self.complete_task: asyncio.Task | None = None

- def handle(
+ async def handle(
  self, mailbox: Mailbox, message: PythonMessage, panic_flag: PanicFlag
- ) -> Optional[Coroutine[Any, Any, Any]]:
- return self.handle_cast(mailbox, 0, singleton_shape, message, panic_flag)
+ ) -> None:
+ return await self.handle_cast(mailbox, 0, singleton_shape, message, panic_flag)

- def handle_cast(
+ async def handle_cast(
  self,
  mailbox: Mailbox,
  rank: int,
  shape: Shape,
  message: PythonMessage,
  panic_flag: PanicFlag,
- ) -> Optional[Coroutine[Any, Any, Any]]:
+ ) -> None:
  port = (
  Port(message.response_port, mailbox, rank)
  if message.response_port
@@ -515,26 +510,21 @@ class _Actor:
  _context.set(ctx)

  args, kwargs = _unpickle(message.message, mailbox)
+
  if message.method == "__init__":
  Class, *args = args
  self.instance = Class(*args, **kwargs)
  return None
- else:
- the_method = getattr(self.instance, message.method)._method

- if not inspect.iscoroutinefunction(the_method):
- enter_span(
- the_method.__module__, message.method, str(ctx.mailbox.actor_id)
- )
- result = the_method(self.instance, *args, **kwargs)
- exit_span()
- if port is not None:
- port.send("result", result)
- return None
+ the_method = getattr(self.instance, message.method)._method
+
+ if inspect.iscoroutinefunction(the_method):

  async def instrumented():
  enter_span(
- the_method.__module__, message.method, str(ctx.mailbox.actor_id)
+ the_method.__module__,
+ message.method,
+ str(ctx.mailbox.actor_id),
  )
  try:
  result = await the_method(self.instance, *args, **kwargs)
@@ -547,39 +537,14 @@
  exit_span()
  return result

- return self.run_async(
- ctx,
- self.run_task(port, instrumented(), panic_flag),
- )
- except Exception as e:
- traceback.print_exc()
- s = ActorError(e)
-
- # The exception is delivered to exactly one of:
- # (1) our caller, (2) our supervisor
- if port is not None:
- port.send("exception", s)
+ result = await instrumented()
  else:
- raise s from None
-
- async def run_async(
- self,
- ctx: MonarchContext,
- coroutine: Awaitable[None],
- ) -> None:
- _context.set(ctx)
- if self.complete_task is None:
- self.complete_task = asyncio.create_task(self._complete())
- await self.active_requests.put(create_eager_task(coroutine))
+ enter_span(
+ the_method.__module__, message.method, str(ctx.mailbox.actor_id)
+ )
+ result = the_method(self.instance, *args, **kwargs)
+ exit_span()

- async def run_task(
- self,
- port: Port | None,
- coroutine: Awaitable[Any],
- panic_flag: PanicFlag,
- ) -> None:
- try:
- result = await coroutine
  if port is not None:
  port.send("result", result)
  except Exception as e:
@@ -603,11 +568,6 @@ class _Actor:
  pass
  raise

- async def _complete(self) -> None:
- while True:
- task = await self.active_requests.get()
- await task
-

  def _is_mailbox(x: object) -> bool:
  return isinstance(x, Mailbox)
@@ -648,8 +608,8 @@ class Actor(MeshTrait):
  "actor implementations are not meshes, but we can't convince the typechecker of it..."
  )

- @endpoint
- async def _set_debug_client(self, client: "DebugClient") -> None:
+ @endpoint # pyre-ignore
+ def _set_debug_client(self, client: "DebugClient") -> None:
  point = MonarchContext.get().point
  # For some reason, using a lambda instead of functools.partial
  # confuses the pdb wrapper implementation.
@@ -750,6 +710,9 @@ class ActorMeshRef(MeshTrait):
  self._mailbox,
  )

+ def __repr__(self) -> str:
+ return f"ActorMeshRef(class={self._class}, shape={self._actor_mesh_ref._shape})"
+

  class ActorError(Exception):
  """
monarch/allocator.py CHANGED
@@ -74,7 +74,7 @@ class RemoteAllocInitializer(abc.ABC):
  """

  @abc.abstractmethod
- async def initialize_alloc(self) -> list[str]:
+ async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
  """
  Return the addresses of the servers that should be used to allocate processes
  for the proc mesh. The addresses should be running hyperactor's RemoteProcessAllocator.
@@ -88,6 +88,10 @@ class RemoteAllocInitializer(abc.ABC):
  in the future this method can be called multiple times and should return the current set of
  addresses that are eligible to handle allocation requests.

+ Arguments:
+ - `match_labels`: The match labels specified in `AllocSpec.AllocConstraints`. Initializer implementations
+ can read specific labels for matching a set of hosts that will service `allocate()` requests.
+
  """
  ...

@@ -102,7 +106,8 @@ class StaticRemoteAllocInitializer(RemoteAllocInitializer):
  super().__init__()
  self.addrs: list[str] = list(addrs)

- async def initialize_alloc(self) -> list[str]:
+ async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
+ _ = match_labels # Suppress unused variable warning
  return list(self.addrs)

 
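A sketch of a custom initializer consuming the new `match_labels` argument; the `pool` label key and the `LabelFilteringInitializer` class are hypothetical, and real keys would come from `AllocSpec.AllocConstraints` as described in the docstring above:

```python
from monarch.allocator import RemoteAllocInitializer


class LabelFilteringInitializer(RemoteAllocInitializer):
    """Hypothetical initializer that picks hosts based on a match label."""

    def __init__(self, hosts_by_pool: dict[str, list[str]]) -> None:
        super().__init__()
        self.hosts_by_pool = hosts_by_pool

    async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
        # "pool" is an illustrative label key; allocate() requests carrying
        # that label are served by the matching host list.
        pool = match_labels.get("pool", "default")
        return self.hosts_by_pool.get(pool, [])
```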
monarch/common/messages.py CHANGED
@@ -25,7 +25,6 @@ from monarch._rust_bindings.monarch_extension import tensor_worker
  from monarch.common.function import ResolvableFromCloudpickle, ResolvableFunction
  from monarch.common.invocation import DeviceException, RemoteException
  from monarch.common.reference import Referenceable
- from monarch.common.stream import StreamRef
  from monarch.common.tree import flattener
  from pyre_extensions import none_throws

@@ -33,6 +32,8 @@ from .shape import NDSlice
  from .tensor_factory import TensorFactory

  if TYPE_CHECKING:
+ from monarch.common.stream import StreamRef
+
  from .device_mesh import DeviceMesh, RemoteProcessGroup
  from .pipe import Pipe
  from .recording import Recording
@@ -98,7 +99,7 @@ class CreateDeviceMesh(NamedTuple):


  class CreateStream(NamedTuple):
- result: StreamRef
+ result: "StreamRef"
  default: bool

  def to_rust_message(self) -> tensor_worker.WorkerMessage:
@@ -132,7 +133,7 @@ class CallFunction(NamedTuple):
  function: ResolvableFunction
  args: Tuple[object, ...]
  kwargs: Dict[str, object]
- stream: StreamRef
+ stream: "StreamRef"
  device_mesh: DeviceMesh
  remote_process_groups: List[RemoteProcessGroup]

@@ -199,7 +200,7 @@ class RecordingFormal(NamedTuple):
  class RecordingResult(NamedTuple):
  input: Tensor | tensor_worker.Ref
  output_index: int
- stream: StreamRef
+ stream: "StreamRef"

  def to_rust_message(self) -> tensor_worker.WorkerMessage:
  return tensor_worker.RecordingResult(
monarch/monarch_controller CHANGED
Binary file
monarch/tools/cli.py CHANGED
@@ -112,7 +112,7 @@ class InfoCmd:
  file=sys.stderr,
  )
  else:
- json.dump(server_spec.to_json(), fp=sys.stdout)
+ json.dump(server_spec.to_json(), indent=2, fp=sys.stdout)


  class KillCmd:
monarch/tools/commands.py CHANGED
@@ -9,7 +9,10 @@
  import argparse
  import functools
  import inspect
+ import logging
  import os
+ import time
+ from datetime import timedelta
  from typing import Any, Callable, Mapping, Optional, Union

  from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/config/meta:defaults
@@ -18,12 +21,13 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
  )

  from monarch.tools.mesh_spec import mesh_spec_from_metadata, ServerSpec
-
  from torchx.runner import Runner
- from torchx.specs import AppDef, AppDryRunInfo, CfgVal
+ from torchx.specs import AppDef, AppDryRunInfo, AppState, CfgVal
  from torchx.specs.builders import parse_args
  from torchx.util.types import decode, decode_optional

+ logger: logging.Logger = logging.getLogger(__name__)
+

  def torchx_runner() -> Runner:
  # namespace is currently unused so make it empty str
@@ -165,15 +169,73 @@ def info(server_handle: str) -> Optional[ServerSpec]:
  if appdef is None:
  return None

+ # host status grouped by mesh (role) names
+ replica_status = {r.role: r.replicas for r in status.roles}
+
  mesh_specs = []
  for role in appdef.roles:
  spec = mesh_spec_from_metadata(appdef, role.name)
  assert spec is not None, "cannot be 'None' since we iterate over appdef's roles"
+
+ # null-guard since some schedulers do not fill replica_status
+ if host_status := replica_status.get(role.name):
+ spec.hostnames = [h.hostname for h in host_status]
+
  mesh_specs.append(spec)

  return ServerSpec(name=appdef.name, state=status.state, meshes=mesh_specs)


+ _5_SECONDS = timedelta(seconds=5)
+
+
+ async def server_ready(
+ server_handle: str, check_interval: timedelta = _5_SECONDS
+ ) -> Optional[ServerSpec]:
+ """Waits until the server's job is in RUNNING state to returns the server spec.
+ Returns `None` if the server does not exist.
+
+ NOTE: Certain fields such as `hostnames` is only filled (and valid) when the server is RUNNING.
+
+ Usage:
+
+ .. code-block:: python
+
+ server_info = await server_ready("slurm:///123")
+ if not server_info:
+ print(f"Job does not exist")
+ else:
+ if server_info.is_running:
+ for mesh in server_info.meshes:
+ connect_to(mesh.hostnames)
+ else:
+ print(f"Job in {server_info.state} state. Hostnames are not available")
+
+ """
+
+ while True:
+ server_spec = info(server_handle)
+
+ if not server_spec: # server not found
+ return None
+
+ if server_spec.state <= AppState.PENDING: # UNSUBMITTED or SUBMITTED or PENDING
+ # NOTE: TorchX currently does not have async APIs so need to loop-on-interval
+ # TODO maybe inverse exponential backoff instead of constant interval?
+ check_interval_seconds = check_interval.total_seconds()
+ logger.info(
+ "waiting for %s to be %s (current: %s), will check again in %g seconds...",
+ server_handle,
+ AppState.RUNNING,
+ server_spec.state,
+ check_interval_seconds,
+ )
+ time.sleep(check_interval_seconds)
+ continue
+ else:
+ return server_spec
+
+
  def kill(server_handle: str) -> None:
  with torchx_runner() as runner:
  runner.cancel(server_handle)
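A short sketch of driving the new `server_ready` helper from synchronous code; `slurm:///123` is just the placeholder handle from the docstring above:

```python
import asyncio

from monarch.tools.commands import server_ready


async def main() -> None:
    server_info = await server_ready("slurm:///123")
    if server_info is None:
        print("job does not exist")
    elif server_info.is_running:
        for mesh in server_info.meshes:
            print(mesh.name, mesh.hostnames)  # hostnames are filled once RUNNING
    else:
        print(f"job is in {server_info.state} state")


asyncio.run(main())
```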
monarch/tools/mesh_spec.py CHANGED
@@ -6,7 +6,7 @@

  # pyre-strict
  import string
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from typing import Any, Optional

  from torchx import specs
@@ -29,6 +29,7 @@ class MeshSpec:
  host_type: str
  gpus: int
  port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
+ hostnames: list[str] = field(default_factory=list)


  def _tag(mesh_name: str, tag_template: str) -> str:
@@ -84,6 +85,10 @@ class ServerSpec:
  state: specs.AppState
  meshes: list[MeshSpec]

+ @property
+ def is_running(self) -> bool:
+ return self.state == specs.AppState.RUNNING
+
  def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
  for mesh_spec in self.meshes:
  if mesh_spec.name == mesh_name:
@@ -115,6 +120,7 @@ class ServerSpec:
  "host_type": mesh.host_type,
  "hosts": mesh.num_hosts,
  "gpus": mesh.gpus,
+ "hostnames": mesh.hostnames,
  }
  for mesh in self.meshes
  },
monarch/tools/network.py ADDED
@@ -0,0 +1,62 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ import logging
+ import socket
+ from typing import Optional
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+
+ def get_ip_addr(hostname: str) -> str:
+ """Resolves and returns the ip address of the given hostname.
+
+ This function will return an ipv6 address if one that can bind
+ `SOCK_STREAM` (TCP) socket is found. Otherwise it will fall-back
+ to resolving an ipv4 `SOCK_STREAM` address.
+
+ Raises a `RuntimeError` if neither ipv6 or ipv4 ip can be resolved from hostname.
+ """
+
+ def get_sockaddr(family: socket.AddressFamily) -> Optional[str]:
+ try:
+ # patternlint-disable-next-line python-dns-deps (only used for oss)
+ addrs = socket.getaddrinfo(
+ hostname, port=None, family=family, type=socket.SOCK_STREAM
+ ) # tcp
+ if addrs:
+ # socket.getaddrinfo return a list of addr 5-tuple addr infos
+ _, _, _, _, sockaddr = addrs[0] # use the first address
+
+ # sockaddr is a tuple (ipv4) or a 4-tuple (ipv6) where the first element is the ip addr
+ ipaddr = str(sockaddr[0])
+
+ logger.info(
+ "Resolved %s address: `%s` for host: `%s`",
+ family.name,
+ ipaddr,
+ hostname,
+ )
+ return str(ipaddr)
+ else:
+ return None
+ except socket.gaierror as e:
+ logger.info(
+ "No %s address that can bind TCP sockets for host: %s. %s",
+ family.name,
+ hostname,
+ e,
+ )
+ return None
+
+ ipaddr = get_sockaddr(socket.AF_INET6) or get_sockaddr(socket.AF_INET)
+ if not ipaddr:
+ raise RuntimeError(
+ f"Unable to resolve `{hostname}` to ipv6 or ipv4 address that can bind TCP socket."
+ " Check the network configuration on the host."
+ )
+ return ipaddr
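Example use of the new `monarch.tools.network.get_ip_addr` helper; the printed addresses are illustrative and depend on the host's network configuration:

```python
from monarch.tools.network import get_ip_addr

# Prefers an IPv6 address that can bind a TCP socket, falling back to IPv4;
# raises RuntimeError if neither can be resolved.
print(get_ip_addr("localhost"))  # e.g. "::1" or "127.0.0.1"
```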
tests/error_test_binary.py CHANGED
@@ -4,6 +4,7 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.

+ import asyncio
  import ctypes
  import sys

@@ -11,7 +12,7 @@ import click

  from monarch._rust_bindings.monarch_extension.panic import panicking_function

- from monarch.actor_mesh import Actor, endpoint
+ from monarch.actor_mesh import Actor, endpoint, send
  from monarch.proc_mesh import proc_mesh


@@ -35,6 +36,12 @@ class ErrorActor(Actor):
  """Endpoint that calls a Rust function that panics."""
  panicking_function()

+ @endpoint
+ async def await_then_error(self) -> None:
+ await asyncio.sleep(0.1)
+ await asyncio.sleep(0.1)
+ raise RuntimeError("oh noez")
+

  class ErrorActorSync(Actor):
  """An actor that has endpoints cause segfaults."""
@@ -146,5 +153,28 @@ def error_bootstrap():
  proc_mesh(gpus=4, env={"MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING": "1"}).get()


+ async def _error_unmonitored():
+ print("I actually ran")
+ sys.stdout.flush()
+
+ proc = await proc_mesh(gpus=1)
+ actor = await proc.spawn("error_actor", ErrorActor)
+
+ # fire and forget
+ send(actor.await_then_error, (), {}, None, "all")
+
+ # Wait. Eventually a supervision event will get propagated and the process
+ # will exit.
+ #
+ # If an event is not delivered, the test will time out before this sleep
+ # finishes.
+ await asyncio.sleep(300)
+
+
+ @main.command("error-unmonitored")
+ def error_unmonitored():
+ asyncio.run(_error_unmonitored())
+
+
  if __name__ == "__main__":
  main()
tests/test_actor_error.py CHANGED
@@ -4,11 +4,12 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.

+ import asyncio
  import importlib.resources
  import subprocess

  import pytest
- from monarch.actor_mesh import Actor, ActorError, endpoint
+ from monarch.actor_mesh import Actor, ActorError, endpoint, send

  from monarch.proc_mesh import proc_mesh

@@ -128,6 +129,7 @@ def test_actor_supervision(num_procs, sync_endpoint, sync_test_impl, endpoint_name
  f"--endpoint-name={endpoint_name}",
  ]
  try:
+ print("running cmd", " ".join(cmd))
  process = subprocess.run(cmd, capture_output=True, timeout=180)
  except subprocess.TimeoutExpired as e:
  print("timeout expired")
@@ -157,6 +159,7 @@ def test_proc_mesh_bootstrap_error():
  "error-bootstrap",
  ]
  try:
+ print("running cmd", " ".join(cmd))
  process = subprocess.run(cmd, capture_output=True, timeout=180)
  except subprocess.TimeoutExpired as e:
  print("timeout expired")
@@ -208,3 +211,30 @@ async def test_broken_pickle_class(raise_on_getstate, raise_on_setstate, num_pro
  await exception_actor.print_value.call_one(broken_obj)
  else:
  await exception_actor.print_value.call(broken_obj)
+
+
+ # oss_skip: importlib not pulling resource correctly in git CI, needs to be revisited
+ @pytest.mark.oss_skip
+ async def test_exception_after_wait_unmonitored():
+ # Run the test in a subprocess
+ test_bin = importlib.resources.files("monarch.python.tests").joinpath("test_bin")
+ cmd = [
+ str(test_bin),
+ "error-unmonitored",
+ ]
+ try:
+ print("running cmd", " ".join(cmd))
+ process = subprocess.run(cmd, capture_output=True, timeout=180)
+ except subprocess.TimeoutExpired as e:
+ print("timeout expired")
+ if e.stdout is not None:
+ print(e.stdout.decode())
+ if e.stderr is not None:
+ print(e.stderr.decode())
+ raise
+
+ # Assert that the subprocess exited with a non-zero code
+ assert "I actually ran" in process.stdout.decode()
+ assert (
+ process.returncode != 0
+ ), f"Expected non-zero exit code, got {process.returncode}"
tests/test_allocator.py CHANGED
@@ -116,8 +116,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
  used to test that the state of the initializer is preserved across calls to allocate()
  """

- async def initialize_alloc(self) -> list[str]:
- alloc = await super().initialize_alloc()
+ async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
+ alloc = await super().initialize_alloc(match_labels)
  self.addrs.pop(-1)
  return alloc

@@ -142,7 +142,8 @@ class TestRemoteAllocator(unittest.IsolatedAsyncioTestCase):
  class EmptyAllocInitializer(StaticRemoteAllocInitializer):
  """test initializer that returns an empty list of addresses"""

- async def initialize_alloc(self) -> list[str]:
+ async def initialize_alloc(self, match_labels: dict[str, str]) -> list[str]:
+ _ = match_labels # Suppress unused variable warning
  return []

  empty_initializer = EmptyAllocInitializer()
tests/test_python_actors.py CHANGED
@@ -9,6 +9,7 @@ import operator
  import os
  import re
  import threading
+ import time
  from types import ModuleType
  from unittest.mock import AsyncMock, patch

@@ -391,6 +392,16 @@ def test_rust_binding_modules_correct() -> None:
  check(bindings, "monarch._rust_bindings")


+ def test_proc_mesh_liveness() -> None:
+ mesh = proc_mesh(gpus=2).get()
+ counter = mesh.spawn("counter", Counter, 1).get()
+ del mesh
+ # Give some time for the mesh to have been shut down.
+ # (It only would if there were a bug.)
+ time.sleep(0.5)
+ counter.value.call().get()
+
+
  two_gpu = pytest.mark.skipif(
  torch.cuda.device_count() < 2,
  reason="Not enough GPUs, this test requires at least 2 GPUs",
@@ -584,16 +595,40 @@ async def test_actor_tls() -> None:
  pm = await proc_mesh(gpus=1)
  am = await pm.spawn("tls", TLSActor)
  await am.increment.call_one()
- # TODO(suo): TLS is NOT preserved across async/sync endpoints, because currently
- # we run async endpoints on a different thread than sync ones.
- # Will fix this in a followup diff.
+ await am.increment_async.call_one()
+ await am.increment.call_one()
+ await am.increment_async.call_one()
+
+ assert 4 == await am.get.call_one()
+ assert 4 == await am.get_async.call_one()
+
+
+ class TLSActorFullSync(Actor):
+ """An actor that manages thread-local state."""
+
+ def __init__(self):
+ self.local = threading.local()
+ self.local.value = 0
+
+ @endpoint
+ def increment(self):
+ self.local.value += 1
+
+ @endpoint
+ def get(self):
+ return self.local.value
+

- # await am.increment_async.call_one()
+ async def test_actor_tls_full_sync() -> None:
+ """Test that thread-local state is respected."""
+ pm = await proc_mesh(gpus=1)
+ am = await pm.spawn("tls", TLSActorFullSync)
+ await am.increment.call_one()
+ await am.increment.call_one()
+ await am.increment.call_one()
  await am.increment.call_one()
- # await am.increment_async.call_one()

- assert 2 == await am.get.call_one()
- # assert 4 == await am.get_async.call_one()
+ assert 4 == await am.get.call_one()


  @two_gpu
@@ -611,3 +646,29 @@ def test_proc_mesh_tensor_engine() -> None:
  assert a == 0
  assert b == 10
  assert c == 100
+
+
+ class AsyncActor(Actor):
+ def __init__(self):
+ self.should_exit = False
+
+ @endpoint
+ async def sleep(self) -> None:
+ while True and not self.should_exit:
+ await asyncio.sleep(1)
+
+ @endpoint
+ async def no_more(self) -> None:
+ self.should_exit = True
+
+
+ @pytest.mark.timeout(15)
+ async def test_async_concurrency():
+ """Test that async endpoints will be processed concurrently."""
+ pm = await proc_mesh(gpus=1)
+ am = await pm.spawn("async", AsyncActor)
+ fut = am.sleep.call()
+ # This call should go through and exit the sleep loop, as long as we are
+ # actually concurrently processing messages.
+ await am.no_more.call()
+ await fut
torchmonarch_nightly-2025.6.17.dist-info/METADATA → torchmonarch_nightly-2025.6.19.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torchmonarch-nightly
- Version: 2025.6.17
+ Version: 2025.6.19
  Summary: Monarch: Single controller library
  Author: Meta
  Author-email: oncall+monarch@xmail.facebook.com
torchmonarch_nightly-2025.6.17.dist-info/RECORD → torchmonarch_nightly-2025.6.19.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
  monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
- monarch/_rust_bindings.so,sha256=BIOc6AH_iVbNSGCnF3de-4l9bp82KlPwWxBWUCMKf40,40709968
+ monarch/_rust_bindings.so,sha256=EUkkinIuX45ihfDu4ot656fOd0CxaepnmaZdUv0cOMY,41044112
  monarch/_testing.py,sha256=jOIOG6jcZBzvEvG_DwSnwCkaMVXvSun6sJAG6nXemww,7859
- monarch/actor_mesh.py,sha256=nAW65WFEWMJWCv8zuH9GSOyTNXwFN8QNqZxMZTuSYxw,25537
- monarch/allocator.py,sha256=ylvYTf31o-PT385cYJPhi17uNbC4yl_RAraqD0fVe4g,4112
+ monarch/actor_mesh.py,sha256=m6QapbZHqYujXya28jW1II2wkBUV_nKGvxmWPSW9lsQ,24327
+ monarch/allocator.py,sha256=UEaVLntH4xQ8Lr84TbgcXusvuK8FhSMJmav-omztUbw,4473
  monarch/bootstrap_main.py,sha256=RCUQhJk07yMFiKp6HzQuqZFUpkgsT9kVEyimiwjn6_E,1827
  monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
  monarch/debugger.py,sha256=AdlvOG3X-9Pw9c1DLQYEy4vjEfh0ZtwtsNJEFLFzN8o,13312
@@ -11,7 +11,7 @@ monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
  monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
  monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
  monarch/mesh_controller.py,sha256=am1QP7dvn0OH1z9ADSKm41APs1HY_dHcBAhOVP-QDmE,10427
- monarch/monarch_controller,sha256=yEs4PlEWgSMnRUSNWyFKvT5LmpkJ9p7GRi6WF-nsdM0,20347496
+ monarch/monarch_controller,sha256=sWOUMClz3JPUjZbppDWgdrPOAjbydygdRPDZ1kaAVC4,20328464
  monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
  monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
  monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
@@ -57,7 +57,7 @@ monarch/common/function_caching.py,sha256=HVdbWtv6Eea7ENMWi8iv36w1G1TaVuUJhkUX_J
  monarch/common/future.py,sha256=D1UJ_8Rvb8-VG9vNE-z7xz2m2otMd2HgB0rnA02nlvA,4681
  monarch/common/invocation.py,sha256=L4mSmzqlHMxo1Tb71hBU_M8aBZCRCOcb6vvPhvvewec,4195
  monarch/common/mast.py,sha256=XTzYljGR0aZ7GjmNMPgU2HyuL4HWSAy4IwE3kEDqdOw,7735
- monarch/common/messages.py,sha256=El7BoGZ2jlP8HyyE-S8wkiG9W8Ciw3_5JERnNrgOYHU,18278
+ monarch/common/messages.py,sha256=OFMd_4yBoMIHjdXcKcJDG88iERfViLG3QxTqzwV4Gnw,18289
  monarch/common/mock_cuda.py,sha256=x6ho1Ton6BbKjBZ5ZxnFOUaQM032X70wnpoUNB7Ci2w,1039
  monarch/common/opaque_ref.py,sha256=tWNvOC6CsjNPKD1JDx-8PSaeXqZC3eermgBExUPKML4,2871
  monarch/common/pickle_flatten.py,sha256=2mc-dPiZy7kRqAstyfMLnPuoGJwsBftYYEHyF_HOZw4,1313
@@ -106,9 +106,10 @@ monarch/timer/example_spmd.py,sha256=p8i3_tO1AmpwSkZryiSjgkh7qaEZ6QXp2Fy1qtPpECA
  monarch/timer/execution_timer.py,sha256=1YsrLIZirdohKOeFAU2H4UcONhQXHuctJbYcoX8I6gY,6985
  monarch/timer/execution_timer_test.py,sha256=CSxTv44fFZQURJlCBmYvysQI1aS_zEGZs_uxl9SOHak,4486
  monarch/tools/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
- monarch/tools/cli.py,sha256=66F7dr90bh27P3kOCmxwJkVmWv2v4wBrkifvwqwUwFE,4967
- monarch/tools/commands.py,sha256=BfmXndJmU_cZP4cMPlknkxGca1NjqYd8_ReDePWksXw,6908
- monarch/tools/mesh_spec.py,sha256=JLykhgy1dClXiNbH1Qsl2fX5MbqplQAhl8LGoragvbo,3702
+ monarch/tools/cli.py,sha256=EIdarsfuFX0WqRCe29_5GNKWJBhxx0lABalw3zPSagw,4977
+ monarch/tools/commands.py,sha256=OuFDVAcl5LvBdBZ-HyemErR0IiDtiMMNgmGPD4MWTHY,8996
+ monarch/tools/mesh_spec.py,sha256=3Qp7Lu3pAa9tfaG-METsCmj-QXECQ6OsrPWiLydWvKc,3914
+ monarch/tools/network.py,sha256=bRj-jOs5qDqnM3BcE9MSXCLS01hiMN4YSWfKZ_d7bc4,2182
  monarch/tools/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
  monarch/tools/components/hyperactor.py,sha256=Ryi1X07VLcaQVlpc4af65JNBbZtOb9IAlKxSKMZ1AW4,2120
  monarch/tools/config/__init__.py,sha256=OPSflEmJB2zxAaRVzzWSWXV5M5vlknLgpulGdW1ze5U,510
@@ -131,11 +132,11 @@ monarch_supervisor/python_executable.py,sha256=WfCiK3wdAvm9Jxx5jgjGF991NgGc9-oHU
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/dispatch_bench.py,sha256=sU_m-8KAjQgYTsxI5khV664NdgLLutidni69Rtowk98,3933
  tests/dispatch_bench_helper.py,sha256=1ORgAMrRgjAjmmWeCHLLQd_bda9mJk0rS2ucEbRu28s,633
- tests/error_test_binary.py,sha256=64H-ucdkQ2i7GD8sidStl227cOy7gyeqvO4kTm1y7Ic,4817
+ tests/error_test_binary.py,sha256=BRj13wAROsUWx4jcxc07HYN2n-xyBNhnnRAhjqah-A0,5582
  tests/sleep_binary.py,sha256=XfLYaAfwm9xgzM-svs8fhAeFhwYIg6SyVEnx4e6wbUw,1009
- tests/test_actor_error.py,sha256=z3Sf4lteUggTryPLOhRKJ55v0MwVK3a7QN7-U2U9iJg,7484
+ tests/test_actor_error.py,sha256=-0UJCEpyzsBh-RdbGhDiG1-sRtu7bJPQWmtjUD0ad48,8526
  tests/test_alloc.py,sha256=D6DdQbtOZEvvnnc7LV-WyWFMk0Xb77eblH6Oz90zJTA,745
- tests/test_allocator.py,sha256=P11sQ95ADjzC_-CfPs3CEP80nP8sn7wW8vVPsmpSVoM,8164
+ tests/test_allocator.py,sha256=jaYWPVEFdcK0XmmEA1Y9uwkeBjhxb2iI1GUL6IZKh4s,8305
  tests/test_coalescing.py,sha256=JZ4YgQNlWWs7N-Z8KCCXQPANcuyyXEKjeHIXYbPnQhk,15606
  tests/test_controller.py,sha256=Rp_kW20zYT8ocsK5LX0Ha3LB9azS2LSKpp8n_dBlzVU,31384
  tests/test_device_mesh.py,sha256=DrbezYOM0thfP9MgLXb5-F0VoLOmSz5GR0GwjR_3bE4,5290
@@ -144,7 +145,7 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
  tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
  tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
  tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
- tests/test_python_actors.py,sha256=YiDJaMFoQ3xPGq602QTuhRM8CsgZo5pttKMKAnLm6io,17773
+ tests/test_python_actors.py,sha256=3ru2JsPQmaO7ppVX3-ls7JcvIeOgEmWWUsYKZCuBXPg,19256
  tests/test_remote_functions.py,sha256=5nxYB8dfA9NT9f9Od9O3htgQtPbiRNiXZ1Kgtn75sOQ,50056
  tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
  tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
@@ -154,9 +155,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
  tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
  tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
  tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
- torchmonarch_nightly-2025.6.17.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
- torchmonarch_nightly-2025.6.17.dist-info/METADATA,sha256=xnYwQ3UlDfJcHRWA86w2X71Fzl0Eddvs4u4UKveyIuo,2772
- torchmonarch_nightly-2025.6.17.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
- torchmonarch_nightly-2025.6.17.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
- torchmonarch_nightly-2025.6.17.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
- torchmonarch_nightly-2025.6.17.dist-info/RECORD,,
+ torchmonarch_nightly-2025.6.19.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+ torchmonarch_nightly-2025.6.19.dist-info/METADATA,sha256=2XYBEhTb9iSTFKhAGmq2Bg_AXwjQvcPj6CQmG4bBiLE,2772
+ torchmonarch_nightly-2025.6.19.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+ torchmonarch_nightly-2025.6.19.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+ torchmonarch_nightly-2025.6.19.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+ torchmonarch_nightly-2025.6.19.dist-info/RECORD,,