PyPI - torchmonarch-nightly - Versions diffs - 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.13__cp310-cp310-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.6.11__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.13__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

monarch/_monarch/hyperactor/__init__.py +0 -16
monarch/_rust_bindings.so +0 -0
monarch/_testing.py +50 -18
monarch/actor_mesh.py +74 -30
monarch/bootstrap_main.py +1 -20
monarch/builtins/random.py +4 -5
monarch/common/client.py +17 -5
monarch/common/stream.py +3 -0
monarch/debugger.py +377 -0
monarch/mesh_controller.py +72 -15
monarch/monarch_controller +0 -0
monarch/pdb_wrapper.py +135 -0
monarch/proc_mesh.py +9 -5
monarch/telemetry.py +19 -0
tests/test_allocator.py +3 -3
tests/test_coalescing.py +1 -1
tests/test_controller.py +12 -2
tests/test_python_actors.py +150 -0
tests/test_remote_functions.py +1 -1
{torchmonarch_nightly-2025.6.11.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/METADATA +1 -1
{torchmonarch_nightly-2025.6.11.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/RECORD +25 -22
{torchmonarch_nightly-2025.6.11.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.6.11.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/entry_points.txt +0 -0
{torchmonarch_nightly-2025.6.11.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.6.11.dist-info → torchmonarch_nightly-2025.6.13.dist-info}/top_level.txt +0 -0

monarch/_monarch/hyperactor/__init__.py CHANGED Viewed

@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 # pyre-strict
-import abc
 from monarch._rust_bindings.monarch_hyperactor.actor import PythonMessage
@@ -29,21 +28,6 @@ from monarch._rust_bindings.monarch_hyperactor.shape import (  # @manual=//monar
     Shape,
 )
-class Actor(abc.ABC):
-    @abc.abstractmethod
-    async def handle(self, mailbox: Mailbox, message: PythonMessage) -> None: ...
-    async def handle_cast(
-        self,
-        mailbox: Mailbox,
-        rank: int,
-        coordinates: list[tuple[str, int]],
-        message: PythonMessage,
-    ) -> None:
-        await self.handle(mailbox, message)
 __all__ = [
     "init_proc",
     "Actor",

monarch/_rust_bindings.so CHANGED Viewed

Binary file

monarch/_testing.py CHANGED Viewed

@@ -10,7 +10,7 @@ import logging
 import tempfile
 import time
 from contextlib import contextmanager, ExitStack
-from typing import Callable, Generator, Optional
+from typing import Any, Callable, Dict, Generator, Literal, Optional
 import monarch_supervisor
 from monarch.common.client import Client
@@ -18,6 +18,8 @@ from monarch.common.device_mesh import DeviceMesh
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.common.shape import NDSlice
 from monarch.controller.backend import ProcessBackend
+from monarch.mesh_controller import spawn_tensor_engine
+from monarch.proc_mesh import proc_mesh, ProcMesh
 from monarch.python_local_mesh import PythonLocalContext
 from monarch.rust_local_mesh import (
     local_mesh,
@@ -50,6 +52,7 @@ class TestingContext:
         self.cleanup = ExitStack()
         self._py_process_cache = {}
         self._rust_process_cache = None
+        self._proc_mesh_cache: Dict[Any, ProcMesh] = {}
     @contextmanager
     def _get_context(self, num_hosts, gpu_per_host):
@@ -75,16 +78,14 @@ class TestingContext:
     @contextmanager
     def local_py_device_mesh(
-        self, num_hosts, gpu_per_host, activate=True
+        self,
+        num_hosts,
+        gpu_per_host,
     ) -> Generator[DeviceMesh, None, None]:
         ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
         dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
         try:
-            if activate:
-                with dm.activate():
-                    yield dm
-            else:
-                yield dm
+            yield dm
             dm.client.shutdown(destroy_pg=False)
         except Exception:
             # abnormal exit, so we just make sure we do not try to communicate in destructors,
@@ -97,7 +98,6 @@ class TestingContext:
         self,
         num_hosts,
         gpu_per_host,
-        activate: bool = True,
         controller_params=None,
     ) -> Generator[DeviceMesh, None, None]:
         # Create a new system and mesh for test.
@@ -115,11 +115,7 @@ class TestingContext:
             controller_params=controller_params,
         ) as dm:
             try:
-                if activate:
-                    with dm.activate():
-                        yield dm
-                else:
-                    yield dm
+                yield dm
                 dm.exit()
             except Exception:
                 dm.client._shutdown = True
@@ -129,21 +125,57 @@ class TestingContext:
                 # pyre-ignore: Undefined attribute
                 dm.client.inner._actor.stop()
+    @contextmanager
+    def local_engine_on_proc_mesh(
+        self,
+        num_hosts,
+        gpu_per_host,
+    ) -> Generator[DeviceMesh, None, None]:
+        key = (num_hosts, gpu_per_host)
+        if key not in self._proc_mesh_cache:
+            self._proc_mesh_cache[key] = proc_mesh(
+                hosts=num_hosts, gpus=gpu_per_host
+            ).get()
+        dm = spawn_tensor_engine(self._proc_mesh_cache[key])
+        dm = dm.rename(hosts="host", gpus="gpu")
+        try:
+            yield dm
+            dm.exit()
+        except Exception as e:
+            # abnormal exit, so we just make sure we do not try to communicate in destructors,
+            # but we do not wait for workers to exit since we do not know what state they are in.
+            dm.client._shutdown = True
+            raise
     @contextmanager
     def local_device_mesh(
-        self, num_hosts, gpu_per_host, activate=True, rust=False, controller_params=None
+        self,
+        num_hosts,
+        gpu_per_host,
+        activate=True,
+        backend: Literal["py", "rs", "mesh"] = "py",
+        controller_params=None,
     ) -> Generator[DeviceMesh, None, None]:
         start = time.time()
-        if rust:
+        if backend == "rs":
             generator = self.local_rust_device_mesh(
-                num_hosts, gpu_per_host, activate, controller_params=controller_params
+                num_hosts, gpu_per_host, controller_params=controller_params
             )
+        elif backend == "py":
+            generator = self.local_py_device_mesh(num_hosts, gpu_per_host)
+        elif backend == "mesh":
+            generator = self.local_engine_on_proc_mesh(num_hosts, gpu_per_host)
         else:
-            generator = self.local_py_device_mesh(num_hosts, gpu_per_host, activate)
+            raise ValueError(f"invalid backend: {backend}")
         with generator as dm:
             end = time.time()
             logging.info("initialized mesh in {:.2f}s".format(end - start))
-            yield dm
+            if activate:
+                with dm.activate():
+                    yield dm
+            else:
+                yield dm
             start = time.time()
         end = time.time()
         logging.info("shutdown mesh in {:.2f}s".format(end - start))

monarch/actor_mesh.py CHANGED Viewed

@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-unsafe
 import asyncio
 import collections
 import contextvars
@@ -13,6 +15,7 @@ import inspect
 import itertools
 import logging
 import random
+import sys
 import traceback
 from dataclasses import dataclass
@@ -20,6 +23,7 @@ from traceback import extract_tb, StackSummary
 from typing import (
     Any,
     AsyncGenerator,
+    Awaitable,
     Callable,
     cast,
     Concatenate,
@@ -34,6 +38,7 @@ from typing import (
     ParamSpec,
     Tuple,
     Type,
+    TYPE_CHECKING,
     TypeVar,
 )
@@ -54,8 +59,12 @@ from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Sh
 from monarch.common.pickle_flatten import flatten, unflatten
 from monarch.common.shape import MeshTrait, NDSlice
+from monarch.pdb_wrapper import remote_breakpointhook
+if TYPE_CHECKING:
+    from monarch.debugger import DebugClient
-logger = logging.getLogger(__name__)
+logger: logging.Logger = logging.getLogger(__name__)
 Allocator = monarch.ProcessAllocator | monarch.LocalAllocator
@@ -92,7 +101,7 @@ _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
 # this was implemented in python 3.12 as an argument to task
 # but I have to backport to 3.10/3.11.
-def create_eager_task(coro: Coroutine[Any, None, Any]) -> asyncio.Future:
+def create_eager_task(coro: Awaitable[None]) -> asyncio.Future:
     iter = coro.__await__()
     try:
         first_yield = next(iter)
@@ -235,7 +244,7 @@ class Endpoint(Generic[P, R]):
         self,
         actor_mesh_ref: _ActorMeshRefImpl,
         name: str,
-        impl: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]],
+        impl: Callable[Concatenate[Any, P], Awaitable[R]],
         mailbox: Mailbox,
     ) -> None:
         self._actor_mesh = actor_mesh_ref
@@ -267,14 +276,16 @@ class Endpoint(Generic[P, R]):
         return self.choose(*args, **kwargs)
     def call(self, *args: P.args, **kwargs: P.kwargs) -> "Future[ValueMesh[R]]":
+        p: PortId
+        r: PortReceiver[R]
         p, r = port(self)
         # pyre-ignore
         send(self, args, kwargs, port=p, rank_in_response=True)
-        async def process():
-            results = [None] * len(self._actor_mesh)
+        async def process() -> ValueMesh[R]:
+            results: List[R] = [None] * len(self._actor_mesh)  # pyre-fixme[9]
             for _ in range(len(self._actor_mesh)):
-                rank, value = await r.recv()
+                rank, value = await r.recv()  # pyre-fixme[23]
                 results[rank] = value
             call_shape = Shape(
                 self._actor_mesh._shape.labels,
@@ -312,15 +323,15 @@ class Endpoint(Generic[P, R]):
 class Accumulator(Generic[P, R, A]):
     def __init__(
         self, endpoint: Endpoint[P, R], identity: A, combine: Callable[[A, R], A]
-    ):
-        self._endpoint = endpoint
-        self._identity = identity
-        self._combine = combine
+    ) -> None:
+        self._endpoint: Endpoint[P, R] = endpoint
+        self._identity: A = identity
+        self._combine: Callable[[A, R], A] = combine
     def accumulate(self, *args: P.args, **kwargs: P.kwargs) -> "Future[A]":
-        gen = self._endpoint.stream(*args, **kwargs)
+        gen: AsyncGenerator[R, R] = self._endpoint.stream(*args, **kwargs)
-        async def impl():
+        async def impl() -> A:
             value = self._identity
             async for x in gen:
                 value = self._combine(value, x)
@@ -337,7 +348,7 @@ class ValueMesh(MeshTrait, Generic[R]):
     def _new_with_shape(self, shape: Shape) -> "ValueMesh[R]":
         return ValueMesh(shape, self._values)
-    def item(self, **kwargs):
+    def item(self, **kwargs) -> R:
         coordinates = [kwargs.pop(label) for label in self._labels]
         if kwargs:
             raise KeyError(f"item has extra dimensions: {list(kwargs.keys())}")
@@ -348,7 +359,7 @@ class ValueMesh(MeshTrait, Generic[R]):
         for rank in self._shape.ranks():
             yield Point(rank, self._shape), self._values[rank]
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self._shape)
     @property
@@ -381,7 +392,7 @@ def send(
 class EndpointProperty(Generic[P, R]):
-    def __init__(self, method: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]]):
+    def __init__(self, method: Callable[Concatenate[Any, P], Awaitable[R]]) -> None:
         self._method = method
     def __get__(self, instance, owner) -> Endpoint[P, R]:
@@ -392,7 +403,7 @@ class EndpointProperty(Generic[P, R]):
 def endpoint(
-    method: Callable[Concatenate[Any, P], Coroutine[Any, Any, R]],
+    method: Callable[Concatenate[Any, P], Awaitable[R]],
 ) -> EndpointProperty[P, R]:
     return EndpointProperty(method)
@@ -415,7 +426,9 @@ class Port:
 # advanced lower-level API for sending messages. This is intentionally
 # not part of the Endpoint API because the way it accepts arguments
 # and handles concerns is different.
-def port(endpoint: Endpoint[P, R], once=False) -> Tuple["PortId", "PortReceiver[R]"]:
+def port(
+    endpoint: Endpoint[P, R], once: bool = False
+) -> Tuple["PortId", "PortReceiver[R]"]:
     handle, receiver = (
         endpoint._mailbox.open_once_port() if once else endpoint._mailbox.open_port()
     )
@@ -428,9 +441,9 @@ class PortReceiver(Generic[R]):
         self,
         mailbox: Mailbox,
         receiver: HyPortReceiver | OncePortReceiver,
-    ):
-        self._mailbox = mailbox
-        self._receiver = receiver
+    ) -> None:
+        self._mailbox: Mailbox = mailbox
+        self._receiver: HyPortReceiver | OncePortReceiver = receiver
     async def _recv(self) -> R:
         return self._process(await self._receiver.recv())
@@ -438,7 +451,7 @@ class PortReceiver(Generic[R]):
     def _blocking_recv(self) -> R:
         return self._process(self._receiver.blocking_recv())
-    def _process(self, msg: PythonMessage):
+    def _process(self, msg: PythonMessage) -> R:
         # TODO: Try to do something more structured than a cast here
         payload = cast(R, _unpickle(msg.message, self._mailbox))
         if msg.method == "result":
@@ -485,7 +498,9 @@ class _Actor:
             else None
         )
         try:
-            ctx = MonarchContext(mailbox, mailbox.actor_id.proc_id, Point(rank, shape))
+            ctx: MonarchContext = MonarchContext(
+                mailbox, mailbox.actor_id.proc_id, Point(rank, shape)
+            )
             _context.set(ctx)
             args, kwargs = _unpickle(message.message, mailbox)
@@ -510,7 +525,14 @@ class _Actor:
                     enter_span(
                         the_method.__module__, message.method, str(ctx.mailbox.actor_id)
                     )
-                    result = await the_method(self.instance, *args, **kwargs)
+                    try:
+                        result = await the_method(self.instance, *args, **kwargs)
+                    except Exception as e:
+                        logging.critical(
+                            "Unahndled exception in actor endpoint",
+                            exc_info=e,
+                        )
+                        raise e
                     exit_span()
                     return result
@@ -532,14 +554,19 @@ class _Actor:
     async def run_async(
         self,
         ctx: MonarchContext,
-        coroutine: Coroutine[Any, None, Any],
+        coroutine: Awaitable[None],
     ) -> None:
         _context.set(ctx)
         if self.complete_task is None:
             self.complete_task = asyncio.create_task(self._complete())
         await self.active_requests.put(create_eager_task(coroutine))
-    async def run_task(self, port, coroutine, panic_flag):
+    async def run_task(
+        self,
+        port: Port | None,
+        coroutine: Awaitable[Any],
+        panic_flag: PanicFlag,
+    ) -> None:
         try:
             result = await coroutine
             if port is not None:
@@ -610,15 +637,28 @@ class Actor(MeshTrait):
             "actor implementations are not meshes, but we can't convince the typechecker of it..."
         )
+    @endpoint
+    async def _set_debug_client(self, client: "DebugClient") -> None:
+        point = MonarchContext.get().point
+        # For some reason, using a lambda instead of functools.partial
+        # confuses the pdb wrapper implementation.
+        sys.breakpointhook = functools.partial(  # pyre-ignore
+            remote_breakpointhook,
+            point.rank,
+            point.shape.coordinates(point.rank),
+            MonarchContext.get().mailbox.actor_id,
+            client,
+        )
 class ActorMeshRef(MeshTrait):
     def __init__(
         self, Class: Type[T], actor_mesh_ref: _ActorMeshRefImpl, mailbox: Mailbox
     ) -> None:
-        self.__name__ = Class.__name__
-        self._class = Class
-        self._actor_mesh_ref = actor_mesh_ref
-        self._mailbox = mailbox
+        self.__name__: str = Class.__name__
+        self._class: Type[T] = Class
+        self._actor_mesh_ref: _ActorMeshRefImpl = actor_mesh_ref
+        self._mailbox: Mailbox = mailbox
         for attr_name in dir(self._class):
             attr_value = getattr(self._class, attr_name, None)
             if isinstance(attr_value, EndpointProperty):
@@ -659,7 +699,11 @@ class ActorMeshRef(MeshTrait):
             f"'{self.__class__.__name__}' object has no attribute '{name}'"
         )
-    def _create(self, args: Iterable[Any], kwargs: Dict[str, Any]) -> None:
+    def _create(
+        self,
+        args: Iterable[Any],
+        kwargs: Dict[str, Any],
+    ) -> None:
         async def null_func(*_args: Iterable[Any], **_kwargs: Dict[str, Any]) -> None:
             return None

monarch/bootstrap_main.py CHANGED Viewed

@@ -30,28 +30,9 @@ def invoke_main():
     # behavior of std out as if it were a terminal.
     sys.stdout.reconfigure(line_buffering=True)
     global bootstrap_main
-    from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension  # @manual=//monarch/monarch_extension:monarch_extension
-        forward_to_tracing,
-    )
     # TODO: figure out what from worker_main.py we should reproduce here.
-    class TracingForwarder(logging.Handler):
-        def emit(self, record: logging.LogRecord) -> None:
-            try:
-                forward_to_tracing(
-                    record.getMessage(),
-                    record.filename or "",
-                    record.lineno or 0,
-                    record.levelno,
-                )
-            except AttributeError:
-                forward_to_tracing(
-                    record.__str__(),
-                    record.filename or "",
-                    record.lineno or 0,
-                    record.levelno,
-                )
+    from monarch.telemetry import TracingForwarder
     if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
         raise RuntimeError("Error during bootstrap for testing")

monarch/builtins/random.py CHANGED Viewed

@@ -16,11 +16,6 @@ def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
     torch.manual_seed(seed ^ process_idx)
-@remote(propagate=lambda: 0)
-def initial_seed_remote() -> int:
-    return torch.initial_seed()
 @remote(propagate=lambda: torch.zeros(1))
 def get_rng_state_remote() -> torch.Tensor:
     return torch.get_rng_state()
@@ -67,3 +62,7 @@ def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
 @remote(propagate="inspect")
 def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
     torch.cuda.set_rng_state_all(states)
+# initial_seed may sometimes return a uint64 which currently can't be unwrapped by the framework
+# def initial_seed_remote() -> int: ...

monarch/common/client.py CHANGED Viewed

@@ -103,6 +103,13 @@ class Client:
         # workers.
         self.last_processed_seq = -1
+        # an error that we have received but know for certain has not
+        # been propagated to a future. This will be reported on shutdown
+        # to avoid hiding the error. This is best effort: we only keep
+        # the error until the point that a future is dependent on
+        # _any_ error, not particularly the tracked one.
+        self._pending_shutdown_error = None
         self.recorder = Recorder()
         self.pending_results: Dict[
@@ -174,6 +181,8 @@ class Client:
         destroy_pg: bool = True,
         error_reason: Optional[RemoteException | DeviceException | Exception] = None,
     ) -> None:
+        if self.has_shutdown:
+            return
         logger.info("shutting down the client gracefully")
         atexit.unregister(self._atexit)
@@ -302,7 +311,8 @@ class Client:
         self.last_processed_seq = max(self.last_processed_seq, seq)
         if error is not None:
-            logging.error("Received error for seq %s: %s", seq, error)
+            logging.info("Received error for seq %s: %s", seq, error)
+            self._pending_shutdown_error = error
             # We should not have set result if we have an error.
             assert result is None
             if not isinstance(error, RemoteException):
@@ -326,15 +336,17 @@ class Client:
         fut, _ = self.pending_results[seq]
         if fut is not None:
-            fut._set_result(result if error is None else error)
+            if error is None:
+                fut._set_result(result)
+            else:
+                fut._set_result(error)
+                self._pending_shutdown_error = None
         elif result is not None:
             logger.debug(f"{seq}: unused result {result}")
         elif error is not None:
             # errors get reported as results even if they
             # do not have futures attached.
-            logger.warning(
-                f"Error encountered for this instruction {seq}. Proceeding forward because error is unused and unhandled. Error details:\n{error}."
-            )
+            pass
         # We can safely delete the seq as tracebacks have been saved to the remote failure itself.
         del self.pending_results[seq]

monarch/common/stream.py CHANGED Viewed

@@ -82,6 +82,9 @@ class StreamRef(Referenceable):
             messages.CreateStream(self, self.default),
         )
+    def __repr__(self):
+        return f"<StreamRef {repr(self.name)} {self.ref}>"
     def delete_ref(self, ref):
         client = self.client()
         if client is not None and not client._shutdown: