torchmonarch-nightly 2025.6.12__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.14__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
monarch/_rust_bindings.so CHANGED
Binary file
monarch/_testing.py CHANGED
@@ -10,7 +10,7 @@ import logging
 import tempfile
 import time
 from contextlib import contextmanager, ExitStack
-from typing import Callable, Generator, Optional
+from typing import Any, Callable, Dict, Generator, Literal, Optional
 
 import monarch_supervisor
 from monarch.common.client import Client
@@ -18,6 +18,8 @@ from monarch.common.device_mesh import DeviceMesh
 from monarch.common.invocation import DeviceException, RemoteException
 from monarch.common.shape import NDSlice
 from monarch.controller.backend import ProcessBackend
+from monarch.mesh_controller import spawn_tensor_engine
+from monarch.proc_mesh import proc_mesh, ProcMesh
 from monarch.python_local_mesh import PythonLocalContext
 from monarch.rust_local_mesh import (
     local_mesh,
@@ -50,6 +52,7 @@ class TestingContext:
         self.cleanup = ExitStack()
         self._py_process_cache = {}
         self._rust_process_cache = None
+        self._proc_mesh_cache: Dict[Any, ProcMesh] = {}
 
     @contextmanager
     def _get_context(self, num_hosts, gpu_per_host):
@@ -75,16 +78,14 @@ class TestingContext:
 
     @contextmanager
     def local_py_device_mesh(
-        self, num_hosts, gpu_per_host, activate=True
+        self,
+        num_hosts,
+        gpu_per_host,
     ) -> Generator[DeviceMesh, None, None]:
         ctx, hosts, processes = self._processes(num_hosts, gpu_per_host)
         dm = world_mesh(ctx, hosts, gpu_per_host, _processes=processes)
         try:
-            if activate:
-                with dm.activate():
-                    yield dm
-            else:
-                yield dm
+            yield dm
             dm.client.shutdown(destroy_pg=False)
         except Exception:
             # abnormal exit, so we just make sure we do not try to communicate in destructors,
@@ -97,7 +98,6 @@ class TestingContext:
         self,
         num_hosts,
         gpu_per_host,
-        activate: bool = True,
         controller_params=None,
     ) -> Generator[DeviceMesh, None, None]:
         # Create a new system and mesh for test.
@@ -115,11 +115,7 @@ class TestingContext:
             controller_params=controller_params,
         ) as dm:
             try:
-                if activate:
-                    with dm.activate():
-                        yield dm
-                else:
-                    yield dm
+                yield dm
                 dm.exit()
             except Exception:
                 dm.client._shutdown = True
@@ -129,21 +125,57 @@ class TestingContext:
             # pyre-ignore: Undefined attribute
             dm.client.inner._actor.stop()
 
+    @contextmanager
+    def local_engine_on_proc_mesh(
+        self,
+        num_hosts,
+        gpu_per_host,
+    ) -> Generator[DeviceMesh, None, None]:
+        key = (num_hosts, gpu_per_host)
+        if key not in self._proc_mesh_cache:
+            self._proc_mesh_cache[key] = proc_mesh(
+                hosts=num_hosts, gpus=gpu_per_host
+            ).get()
+
+        dm = spawn_tensor_engine(self._proc_mesh_cache[key])
+        dm = dm.rename(hosts="host", gpus="gpu")
+        try:
+            yield dm
+            dm.exit()
+        except Exception as e:
+            # abnormal exit, so we just make sure we do not try to communicate in destructors,
+            # but we do not wait for workers to exit since we do not know what state they are in.
+            dm.client._shutdown = True
+            raise
+
     @contextmanager
     def local_device_mesh(
-        self, num_hosts, gpu_per_host, activate=True, rust=False, controller_params=None
+        self,
+        num_hosts,
+        gpu_per_host,
+        activate=True,
+        backend: Literal["py", "rs", "mesh"] = "py",
+        controller_params=None,
     ) -> Generator[DeviceMesh, None, None]:
         start = time.time()
-        if rust:
+        if backend == "rs":
             generator = self.local_rust_device_mesh(
-                num_hosts, gpu_per_host, activate, controller_params=controller_params
+                num_hosts, gpu_per_host, controller_params=controller_params
            )
+        elif backend == "py":
+            generator = self.local_py_device_mesh(num_hosts, gpu_per_host)
+        elif backend == "mesh":
+            generator = self.local_engine_on_proc_mesh(num_hosts, gpu_per_host)
         else:
-            generator = self.local_py_device_mesh(num_hosts, gpu_per_host, activate)
+            raise ValueError(f"invalid backend: {backend}")
         with generator as dm:
             end = time.time()
             logging.info("initialized mesh in {:.2f}s".format(end - start))
-            yield dm
+            if activate:
+                with dm.activate():
+                    yield dm
+            else:
+                yield dm
             start = time.time()
             end = time.time()
             logging.info("shutdown mesh in {:.2f}s".format(end - start))
monarch/actor_mesh.py CHANGED
@@ -15,6 +15,7 @@ import inspect
 import itertools
 import logging
 import random
+import sys
 import traceback
 
 from dataclasses import dataclass
@@ -37,6 +38,7 @@ from typing import (
     ParamSpec,
     Tuple,
     Type,
+    TYPE_CHECKING,
     TypeVar,
 )
 
@@ -57,6 +59,10 @@ from monarch._rust_bindings.monarch_hyperactor.shape import Point as HyPoint, Shape
 
 from monarch.common.pickle_flatten import flatten, unflatten
 from monarch.common.shape import MeshTrait, NDSlice
+from monarch.pdb_wrapper import remote_breakpointhook
+
+if TYPE_CHECKING:
+    from monarch.debugger import DebugClient
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -270,11 +276,11 @@ class Endpoint(Generic[P, R]):
         return self.choose(*args, **kwargs)
 
     def call(self, *args: P.args, **kwargs: P.kwargs) -> "Future[ValueMesh[R]]":
-        p: PortId
-        r: PortReceiver[R]
-        p, r = port(self)
+        p: Port[R]
+        r: RankedPortReceiver[R]
+        p, r = ranked_port(self)
         # pyre-ignore
-        send(self, args, kwargs, port=p, rank_in_response=True)
+        send(self, args, kwargs, port=p)
 
         async def process() -> ValueMesh[R]:
             results: List[R] = [None] * len(self._actor_mesh)  # pyre-fixme[9]
@@ -369,9 +375,8 @@ def send(
     endpoint: Endpoint[P, R],
     args: Tuple[Any, ...],
     kwargs: Dict[str, Any],
-    port: "Optional[PortId]" = None,
+    port: "Optional[Port]" = None,
     selection: Selection = "all",
-    rank_in_response: bool = False,
 ) -> None:
     """
     Fire-and-forget broadcast invocation of the endpoint across all actors in the mesh.
@@ -380,7 +385,10 @@ def send(
     """
     endpoint._signature.bind(None, *args, **kwargs)
     message = PythonMessage(
-        endpoint._name, _pickle((args, kwargs)), port, rank_in_response
+        endpoint._name,
+        _pickle((args, kwargs)),
+        None if port is None else port._port,
+        None,
     )
     endpoint._actor_mesh.cast(message, selection)
 
@@ -402,18 +410,16 @@ def endpoint(
     return EndpointProperty(method)
 
 
-class Port:
-    def __init__(self, port: PortId, mailbox: Mailbox, rank_in_response: bool) -> None:
+class Port(Generic[R]):
+    def __init__(self, port: PortId, mailbox: Mailbox, rank: Optional[int]) -> None:
         self._port = port
         self._mailbox = mailbox
-        self._rank_in_response = rank_in_response
+        self._rank = rank
 
-    def send(self, method: str, obj: object) -> None:
-        if self._rank_in_response:
-            obj = (MonarchContext.get().point.rank, obj)
+    def send(self, method: str, obj: R) -> None:
         self._mailbox.post(
             self._port,
-            PythonMessage(method, _pickle(obj), None),
+            PythonMessage(method, _pickle(obj), None, self._rank),
         )
 
 
@@ -422,12 +428,21 @@ class Port:
 # and handles concerns is different.
 def port(
     endpoint: Endpoint[P, R], once: bool = False
-) -> Tuple["PortId", "PortReceiver[R]"]:
+) -> Tuple["Port[R]", "PortReceiver[R]"]:
     handle, receiver = (
         endpoint._mailbox.open_once_port() if once else endpoint._mailbox.open_port()
     )
     port_id: PortId = handle.bind()
-    return port_id, PortReceiver(endpoint._mailbox, receiver)
+    return Port(port_id, endpoint._mailbox, rank=None), PortReceiver(
+        endpoint._mailbox, receiver
+    )
+
+
+def ranked_port(
+    endpoint: Endpoint[P, R], once: bool = False
+) -> Tuple["Port[R]", "RankedPortReceiver[R]"]:
+    p, receiver = port(endpoint, once)
+    return p, RankedPortReceiver[R](receiver._mailbox, receiver._receiver)
 
 
 class PortReceiver(Generic[R]):
@@ -452,18 +467,20 @@ class PortReceiver(Generic[R]):
             return payload
         else:
             assert msg.method == "exception"
-            if isinstance(payload, tuple):
-                # If the payload is a tuple, it's because we requested the rank
-                # to be included in the response; just ignore it.
-                raise payload[1]
-            else:
-                # pyre-ignore
-                raise payload
+            # pyre-ignore
+            raise payload
 
     def recv(self) -> "Future[R]":
         return Future(lambda: self._recv(), self._blocking_recv)
 
 
+class RankedPortReceiver(PortReceiver[Tuple[int, R]]):
+    def _process(self, msg: PythonMessage) -> Tuple[int, R]:
+        if msg.rank is None:
+            raise ValueError("RankedPort receiver got a message without a rank")
+        return msg.rank, super()._process(msg)
+
+
 singleton_shape = Shape([], NDSlice(offset=0, sizes=[], strides=[]))
 
 
@@ -487,7 +504,7 @@ class _Actor:
         panic_flag: PanicFlag,
     ) -> Optional[Coroutine[Any, Any, Any]]:
         port = (
-            Port(message.response_port, mailbox, message.rank_in_response)
+            Port(message.response_port, mailbox, rank)
             if message.response_port
             else None
         )
@@ -519,7 +536,14 @@ class _Actor:
                 enter_span(
                     the_method.__module__, message.method, str(ctx.mailbox.actor_id)
                 )
-                result = await the_method(self.instance, *args, **kwargs)
+                try:
+                    result = await the_method(self.instance, *args, **kwargs)
+                except Exception as e:
+                    logging.critical(
+                        "Unhandled exception in actor endpoint",
+                        exc_info=e,
+                    )
+                    raise e
                 exit_span()
                 return result
 
@@ -624,6 +648,19 @@ class Actor(MeshTrait):
         "actor implementations are not meshes, but we can't convince the typechecker of it..."
     )
 
+    @endpoint
+    async def _set_debug_client(self, client: "DebugClient") -> None:
+        point = MonarchContext.get().point
+        # For some reason, using a lambda instead of functools.partial
+        # confuses the pdb wrapper implementation.
+        sys.breakpointhook = functools.partial(  # pyre-ignore
+            remote_breakpointhook,
+            point.rank,
+            point.shape.coordinates(point.rank),
+            MonarchContext.get().mailbox.actor_id,
+            client,
+        )
+
 
 class ActorMeshRef(MeshTrait):
     def __init__(
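
The net effect of the `Port` refactor: the responding actor's rank no longer piggybacks on the payload tuple behind a `rank_in_response` flag; it travels in a dedicated slot on `PythonMessage`, and `RankedPortReceiver` decodes it. A rough sketch of the new reply path using the diff's own names; `my_endpoint` is a placeholder for any `Endpoint`, and a blocking `.get()` on `Future` is assumed:

```python
# Sketch of the ranked reply flow introduced above (illustrative).
p, r = ranked_port(my_endpoint)   # Port[R] plus RankedPortReceiver[R]
send(my_endpoint, args=(), kwargs={}, port=p)

# Each reply now arrives as a (rank, value) pair: the receiving _Actor
# constructs Port(message.response_port, mailbox, rank), so the rank is
# stamped into PythonMessage.rank rather than wrapped into the payload.
rank, value = r.recv().get()
```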
monarch/bootstrap_main.py CHANGED
@@ -30,28 +30,9 @@ def invoke_main():
     # behavior of std out as if it were a terminal.
     sys.stdout.reconfigure(line_buffering=True)
     global bootstrap_main
-    from monarch._rust_bindings.hyperactor_extension.telemetry import (  # @manual=//monarch/monarch_extension:monarch_extension
-        forward_to_tracing,
-    )
 
     # TODO: figure out what from worker_main.py we should reproduce here.
-
-    class TracingForwarder(logging.Handler):
-        def emit(self, record: logging.LogRecord) -> None:
-            try:
-                forward_to_tracing(
-                    record.getMessage(),
-                    record.filename or "",
-                    record.lineno or 0,
-                    record.levelno,
-                )
-            except AttributeError:
-                forward_to_tracing(
-                    record.__str__(),
-                    record.filename or "",
-                    record.lineno or 0,
-                    record.levelno,
-                )
+    from monarch.telemetry import TracingForwarder
 
     if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
         raise RuntimeError("Error during bootstrap for testing")
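
`invoke_main` now imports `TracingForwarder` from the new `monarch.telemetry` module instead of defining it inline. A minimal sketch of the wiring this enables, assuming the moved class keeps the old `logging.Handler` interface; the diff shows only the import, not where the handler is installed:

```python
import logging

from monarch.telemetry import TracingForwarder

# Forward Python log records into hyperactor's tracing, as the old
# inline handler did by calling forward_to_tracing(message, filename,
# lineno, levelno) for each record.
logging.getLogger().addHandler(TracingForwarder())
```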
@@ -16,11 +16,6 @@ def set_manual_seed_remote(seed: int, process_idx: int = 0) -> None:
     torch.manual_seed(seed ^ process_idx)
 
 
-@remote(propagate=lambda: 0)
-def initial_seed_remote() -> int:
-    return torch.initial_seed()
-
-
 @remote(propagate=lambda: torch.zeros(1))
 def get_rng_state_remote() -> torch.Tensor:
     return torch.get_rng_state()
@@ -67,3 +62,7 @@ def get_rng_state_all_cuda_remote() -> list[torch.Tensor]:
 @remote(propagate="inspect")
 def set_rng_state_all_cuda_remote(states: list[torch.Tensor]) -> None:
     torch.cuda.set_rng_state_all(states)
+
+
+# initial_seed may sometimes return a uint64 which currently can't be unwrapped by the framework
+# def initial_seed_remote() -> int: ...
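
For context on the seeding scheme these RNG helpers keep: XOR-ing the base seed with the process index gives every worker a distinct but reproducible stream. A quick illustration (not code from the package):

```python
import torch

# Per-process seeding as in set_manual_seed_remote above.
base_seed = 1234
for process_idx in range(4):
    torch.manual_seed(base_seed ^ process_idx)
    print(process_idx, torch.rand(1).item())  # distinct stream per process

# torch.initial_seed() can return a value that only fits in a uint64,
# which is why the remote wrapper for it stays commented out above.
```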
monarch/common/client.py CHANGED
@@ -103,6 +103,13 @@ class Client:
         # workers.
         self.last_processed_seq = -1
 
+        # an error that we have received but know for certain has not
+        # been propagated to a future. This will be reported on shutdown
+        # to avoid hiding the error. This is best effort: we only keep
+        # the error until the point that a future is dependent on
+        # _any_ error, not necessarily the tracked one.
+        self._pending_shutdown_error = None
+
         self.recorder = Recorder()
 
         self.pending_results: Dict[
@@ -174,6 +181,8 @@ class Client:
         destroy_pg: bool = True,
         error_reason: Optional[RemoteException | DeviceException | Exception] = None,
     ) -> None:
+        if self.has_shutdown:
+            return
         logger.info("shutting down the client gracefully")
 
         atexit.unregister(self._atexit)
@@ -303,6 +312,7 @@ class Client:
 
         if error is not None:
             logging.info("Received error for seq %s: %s", seq, error)
+            self._pending_shutdown_error = error
             # We should not have set result if we have an error.
             assert result is None
             if not isinstance(error, RemoteException):
@@ -326,7 +336,11 @@ class Client:
 
         fut, _ = self.pending_results[seq]
         if fut is not None:
-            fut._set_result(result if error is None else error)
+            if error is None:
+                fut._set_result(result)
+            else:
+                fut._set_result(error)
+                self._pending_shutdown_error = None
         elif result is not None:
             logger.debug(f"{seq}: unused result {result}")
         elif error is not None:
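
Taken together, the client changes make `shutdown()` idempotent and add best-effort tracking of errors that never reached a future. A schematic sketch using only attributes visible in the diff:

```python
from monarch.common.client import Client

def shutdown_once(client: Client) -> None:
    # shutdown() is now idempotent: the new has_shutdown guard makes a
    # repeated call return immediately.
    client.shutdown()
    client.shutdown()  # no-op

# Error bookkeeping in _handle_pending_result (schematic):
#   on receiving an error:            client._pending_shutdown_error = error
#   once any future consumes one:     client._pending_shutdown_error = None
# Whatever is still held at shutdown can then be surfaced there instead
# of being silently dropped.
```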