torchmonarch-nightly 2025.6.30__cp311-cp311-manylinux2014_x86_64.whl → 2025.7.25__cp311-cp311-manylinux2014_x86_64.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
- monarch/__init__.py +13 -9
- monarch/_rust_bindings.so +0 -0
- monarch/{_monarch/selection → _src/actor}/__init__.py +3 -7
- monarch/_src/actor/actor_mesh.py +874 -0
- monarch/{allocator.py → _src/actor/allocator.py} +26 -17
- monarch/_src/actor/bootstrap_main.py +73 -0
- monarch/{code_sync.py → _src/actor/code_sync/__init__.py} +3 -1
- monarch/_src/actor/code_sync/auto_reload.py +223 -0
- monarch/_src/actor/debugger.py +565 -0
- monarch/_src/actor/endpoint.py +270 -0
- monarch/_src/actor/event_loop.py +97 -0
- monarch/_src/actor/future.py +100 -0
- monarch/{pdb_wrapper.py → _src/actor/pdb_wrapper.py} +47 -46
- monarch/{common/pickle_flatten.py → _src/actor/pickle.py} +26 -2
- monarch/_src/actor/proc_mesh.py +500 -0
- monarch/_src/actor/sync_state.py +18 -0
- monarch/{telemetry.py → _src/actor/telemetry/__init__.py} +1 -1
- monarch/_src/actor/telemetry/rust_span_tracing.py +159 -0
- monarch/_src/actor/tensor_engine_shim.py +56 -0
- monarch/_src/tensor_engine/rdma.py +180 -0
- monarch/_testing.py +3 -2
- monarch/actor/__init__.py +51 -0
- monarch/actor_mesh.py +6 -752
- monarch/bootstrap_main.py +8 -47
- monarch/common/client.py +1 -1
- monarch/common/controller_api.py +2 -1
- monarch/common/device_mesh.py +12 -2
- monarch/common/messages.py +12 -1
- monarch/common/recording.py +4 -3
- monarch/common/remote.py +135 -52
- monarch/common/tensor.py +2 -1
- monarch/controller/backend.py +2 -2
- monarch/controller/controller.py +2 -1
- monarch/controller/rust_backend/controller.py +2 -1
- monarch/fetch.py +3 -5
- monarch/mesh_controller.py +201 -139
- monarch/monarch_controller +0 -0
- monarch/opaque_module.py +4 -6
- monarch/opaque_object.py +3 -3
- monarch/proc_mesh.py +6 -309
- monarch/python_local_mesh.py +1 -1
- monarch/rust_backend_mesh.py +2 -1
- monarch/rust_local_mesh.py +4 -2
- monarch/sim_mesh.py +10 -19
- monarch/simulator/command_history.py +1 -1
- monarch/simulator/interface.py +2 -1
- monarch/simulator/mock_controller.py +1 -1
- monarch/simulator/simulator.py +1 -1
- monarch/tensor_engine/__init__.py +23 -0
- monarch/tensor_worker_main.py +3 -1
- monarch/tools/cli.py +3 -1
- monarch/tools/commands.py +95 -35
- monarch/tools/mesh_spec.py +55 -0
- monarch/tools/utils.py +38 -0
- monarch/worker/worker.py +1 -1
- monarch/world_mesh.py +2 -1
- monarch_supervisor/python_executable.py +6 -3
- tests/error_test_binary.py +75 -9
- tests/test_actor_error.py +370 -21
- tests/test_alloc.py +1 -1
- tests/test_allocator.py +373 -17
- tests/test_controller.py +2 -0
- tests/test_debugger.py +416 -0
- tests/test_env_before_cuda.py +162 -0
- tests/test_python_actors.py +184 -332
- tests/test_rdma.py +198 -0
- tests/test_remote_functions.py +40 -12
- tests/test_rust_backend.py +7 -5
- tests/test_sim_backend.py +1 -4
- tests/test_tensor_engine.py +55 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/METADATA +6 -1
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/RECORD +80 -68
- torchmonarch_nightly-2025.7.25.dist-info/entry_points.txt +3 -0
- monarch/_monarch/hyperactor/__init__.py +0 -58
- monarch/_monarch/worker/debugger.py +0 -117
- monarch/_monarch/worker/logging.py +0 -107
- monarch/debugger.py +0 -379
- monarch/future.py +0 -76
- monarch/rdma.py +0 -162
- torchmonarch_nightly-2025.6.30.dist-info/entry_points.txt +0 -3
- /monarch/{_monarch/worker → _src}/__init__.py +0 -0
- /monarch/{common/_device_utils.py → _src/actor/device_utils.py} +0 -0
- /monarch/{common → _src/actor}/shape.py +0 -0
- /monarch/{_monarch → _src/tensor_engine}/__init__.py +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/WHEEL +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/licenses/LICENSE +0 -0
- {torchmonarch_nightly-2025.6.30.dist-info → torchmonarch_nightly-2025.7.25.dist-info}/top_level.txt +0 -0
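The dominant theme of this release is a repackaging: actor-runtime modules move from the package top level (actor_mesh.py, allocator.py, debugger.py, future.py, rdma.py, pdb_wrapper.py, telemetry.py, ...) into private monarch/_src/actor and monarch/_src/tensor_engine packages, with new public entry points in monarch/actor/__init__.py and monarch/tensor_engine/__init__.py. A minimal sketch of what that implies for imports; the re-exported names below are assumptions inferred from the file list, not taken from package documentation:

# Hedged sketch of the import migration implied by the renames above.
# Before (2025.6.30): actor APIs lived in top-level modules, e.g.
#   from monarch.actor_mesh import Actor
#   from monarch.rdma import RDMABuffer
# After (2025.7.25): implementations live under monarch/_src/...; user code
# imports the new public facades instead (assumed re-exports).
from monarch.actor import Actor, proc_mesh      # assumed re-exports
from monarch.tensor_engine import RDMABuffer    # assumed re-export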
monarch/mesh_controller.py
CHANGED
@@ -7,40 +7,63 @@
 import atexit
 import logging
 import os
-
+
+import pdb  # noqa
 import traceback
 from collections import deque
 from logging import Logger
-from typing import
+from typing import (
+    Any,
+    cast,
+    List,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+)

 import torch.utils._python_dispatch
-
-from monarch import NDSlice
-from monarch._rust_bindings.monarch_extension import client, debugger
+from monarch._rust_bindings.monarch_extension import client
 from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monarch/monarch_extension:monarch_extension
     WorldState,
 )
 from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
+from monarch._rust_bindings.monarch_hyperactor.actor import (
+    PythonMessage,
+    PythonMessageKind,
+    UnflattenArg,
+)
+from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
 from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
     ActorId,
 )
+from monarch._src.actor.actor_mesh import ActorEndpoint, Port, PortTuple
+from monarch._src.actor.endpoint import Selection
+from monarch._src.actor.shape import NDSlice
+from monarch.common import device_mesh, messages, stream
+from monarch.common.controller_api import TController
+from monarch.common.invocation import Seq
+from monarch.common.messages import Referenceable, SendResultOfActorCall
+from monarch.common.stream import StreamRef
+from monarch.common.tensor import InputChecker, Tensor
+from monarch.tensor_worker_main import _set_trace

 if TYPE_CHECKING:
     from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
         ProcMesh as HyProcMesh,
     )
-    from monarch.
+    from monarch.actor import ProcMesh

 from monarch._rust_bindings.monarch_hyperactor.shape import Point

-from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
 from monarch.common.client import Client
 from monarch.common.controller_api import LogMessage, MessageResult
-from monarch.common.device_mesh import DeviceMesh
+from monarch.common.device_mesh import DeviceMesh
+from monarch.common.future import Future as OldFuture
 from monarch.common.invocation import DeviceException, RemoteException
-from monarch.controller.debugger import read as debugger_read, write as debugger_write
 from monarch.rust_local_mesh import _get_worker_exec_info
-from pyre_extensions import none_throws

 logger: Logger = logging.getLogger(__name__)

@@ -48,6 +71,7 @@ logger: Logger = logging.getLogger(__name__)
 class Controller(_Controller):
     def __init__(self, workers: "HyProcMesh") -> None:
         super().__init__()
+        self._mailbox: Mailbox = workers.client
         # Buffer for messages unrelated to debugging that are received while a
         # debugger session is active.
         self._non_debugger_pending_messages: deque[
@@ -58,19 +82,9 @@ class Controller(_Controller):
     def next_message(
         self, timeout: Optional[float]
     ) -> Optional[LogMessage | MessageResult]:
-
-
-
-        msg = self._get_next_message(timeout_msec=int((timeout or 0.0) * 1000.0))
-        if msg is None:
-            return None
-
-        if isinstance(msg, client.WorkerResponse):
-            return _worker_response_to_result(msg)
-        elif isinstance(msg, client.LogMessage):
-            return LogMessage(msg.level, msg.message)
-        elif isinstance(msg, client.DebuggerMessage):
-            self._run_debugger_loop(msg)
+        raise RuntimeError(
+            "internal error: tensor engine does not produce futures that call next_message"
+        )

     def send(
         self,
@@ -86,56 +100,6 @@ class Controller(_Controller):
         self._drain_and_stop()
         return []

-    def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
-        if not isinstance(message.action, DebuggerAction.Paused):
-            raise RuntimeError(
-                f"Unexpected debugger message {message} when no debugger session is running"
-            )
-
-        self._pending_debugger_sessions.append(message.debugger_actor_id)
-        while self._pending_debugger_sessions:
-            debugger_actor_id = self._pending_debugger_sessions.popleft()
-            rank = debugger_actor_id.rank
-            proc_id = debugger_actor_id.proc_id
-            debugger_write(
-                f"pdb attached to proc {proc_id} with rank {rank}, debugger actor {debugger_actor_id} \n"
-            )
-
-            self._debugger_attach(debugger_actor_id)
-            while True:
-                # TODO: Add appropriate timeout.
-                msg = self._get_next_message(timeout_msec=None)
-
-                if not isinstance(msg, client.DebuggerMessage):
-                    self._non_debugger_pending_messages.append(msg)
-                    continue
-
-                if msg.debugger_actor_id != debugger_actor_id:
-                    if isinstance(msg.action, DebuggerAction.Paused):
-                        self._pending_debugger_sessions.append(msg.debugger_actor_id)
-                        continue
-                    else:
-                        raise RuntimeError(
-                            f"unexpected debugger message {msg} from rank {msg.debugger_actor_id.rank} "
-                            f"when debugging rank {debugger_actor_id.rank}"
-                        )
-
-                action = msg.action
-                if isinstance(action, DebuggerAction.Detach):
-                    break
-                elif isinstance(action, DebuggerAction.Read):
-                    self._debugger_write(
-                        debugger_actor_id, debugger_read(action.requested_size)
-                    )
-                elif isinstance(action, DebuggerAction.Write):
-                    debugger_write(
-                        debugger.get_bytes_from_write_action(action).decode()
-                    )
-                else:
-                    raise RuntimeError(
-                        f"unexpected debugger message {msg} when debugging rank {debugger_actor_id.rank}"
-                    )
-
     def worker_world_state(self) -> WorldState:
         raise NotImplementedError("worker world state")

@@ -145,54 +109,6 @@ class Controller(_Controller):
         pass


-# TODO: Handling conversion of the response can move to a separate module over time
-# especially as we have structured error messages.
-def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
-    if not result.is_exception():
-        # The result of the message needs to be unwrapped on a real device.
-        # Staying as a fake tensor will fail the tensor deserialization.
-        with no_mesh.activate():
-            return MessageResult(result.seq, result.result(), None)
-    exc = none_throws(result.exception())
-    if isinstance(exc, client.Error):
-        worker_frames = [
-            traceback.FrameSummary("<unknown>", None, frame)
-            for frame in exc.backtrace.split("\\n")
-        ]
-        return MessageResult(
-            seq=result.seq,
-            result=None,
-            error=RemoteException(
-                seq=exc.caused_by_seq,
-                exception=RuntimeError(exc.backtrace),
-                controller_frame_index=0,  # TODO: T225205291 fix this once we have recording support in rust
-                controller_frames=None,
-                worker_frames=worker_frames,
-                source_actor_id=exc.actor_id,
-                message=f"Remote function in {exc.actor_id} errored.",
-            ),
-        )
-    elif isinstance(exc, client.Failure):
-        frames = [
-            traceback.FrameSummary("<unknown>", None, frame)
-            for frame in exc.backtrace.split("\n")
-        ]
-        reason = f"Actor {exc.actor_id} crashed on {exc.address}, check the host log for details"
-        logger.error(reason)
-        return MessageResult(
-            seq=0,  # seq is not consumed for DeviceException; it will be directly thrown by the client
-            result=None,
-            error=DeviceException(
-                exception=RuntimeError(reason),
-                frames=frames,
-                source_actor_id=exc.actor_id,
-                message=reason,
-            ),
-        )
-    else:
-        raise RuntimeError(f"Unknown exception type: {type(exc)}")
-
-
 def _initialize_env(worker_point: Point, proc_id: str) -> None:
     worker_rank = worker_point.rank
     try:
@@ -213,12 +129,50 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
             "LOCAL_WORLD_SIZE": str(gpus_per_host),
         }
         os.environ.update(process_env)
+        pdb.set_trace = _set_trace
+        # workaround for set_manual_seed somehow not working if cuda is not initialized\
+        if torch.cuda.is_available():
+            torch.cuda.init()
     except Exception:
         traceback.print_exc()
         raise


 class MeshClient(Client):
+    def fetch(
+        self,
+        mesh: "DeviceMesh",
+        stream: "StreamRef",
+        shard,
+        preprocess_message,
+        args,
+        kwargs,
+        defs: Tuple["Tensor", ...],
+        uses: Tuple["Tensor", ...],
+    ) -> "OldFuture":  # the OldFuture is a lie
+        sender, receiver = PortTuple.create(self._mesh_controller._mailbox, once=True)
+
+        ident = self.new_node(defs, uses, cast("OldFuture", sender))
+        process = mesh._process(shard)
+        self.send(
+            process,
+            messages.SendValue(
+                ident,
+                None,
+                defs,
+                preprocess_message,
+                args,
+                kwargs,
+                stream,
+            ),
+        )
+        # we have to ask for status updates
+        # from workers to be sure they have finished
+        # enough work to count this future as finished,
+        # and all potential errors have been reported
+        self._request_status()
+        return cast("OldFuture", receiver.recv())
+
     def shutdown(
         self,
         destroy_pg: bool = True,
@@ -232,27 +186,43 @@ class MeshClient(Client):
         atexit.unregister(self._atexit)
         self._shutdown = True

-
-
-        self.
-
-
-        ttl = 60
-        start_time = time.time()
-        end_time = start_time + ttl
-        while ttl > 0 and self.last_assigned_seq > self.last_processed_seq:
-            ttl = end_time - time.time()
-            self.handle_next_message(ttl)
-            if self._pending_shutdown_error:
-                raise self._pending_shutdown_error
-
-        if ttl <= 0:
-            raise RuntimeError("shutdown timed out")
-
+        sender, receiver = PortTuple.create(self._mesh_controller._mailbox, once=True)
+        assert sender._port_ref is not None
+        self._mesh_controller.sync_at_exit(sender._port_ref.port_id)
+        receiver.recv().get(timeout=60)
         # we are not expecting anything more now, because we already
         # waited for the responses
         self.inner.drain_and_stop()

+    @property
+    def _mesh_controller(self) -> Controller:
+        return cast(Controller, self.inner)
+
+    def new_node_nocoalesce(
+        self,
+        defs: Sequence["Tensor"],
+        uses: Sequence["Tensor"],
+        future: Optional["OldFuture"],
+        tracebacks: List[List[traceback.FrameSummary]],
+    ) -> Seq:
+        seq = self._next_seq()
+        for d in defs:
+            d._seq = seq
+        response_port = None
+        if future is not None:
+            # method annotation is a lie to make Client happy
+            port, slice = cast("Tuple[Port[Any], NDSlice]", future)
+            assert port._port_ref is not None
+            response_port = (port._port_ref.port_id, slice)
+        self._mesh_controller.node(seq, defs, uses, response_port, tracebacks)
+        return seq
+
+    def handle_next_message(self, timeout: Optional[float]) -> bool:
+        """
+        Mesh controller message loop is handled by the tokio event loop.
+        """
+        return False
+

 def spawn_tensor_engine(proc_mesh: "ProcMesh") -> DeviceMesh:
     # This argument to Controller
@@ -260,7 +230,7 @@ def spawn_tensor_engine(proc_mesh: "ProcMesh") -> DeviceMesh:
     # report the proc ID instead of the rank it currently does.
     gpus = proc_mesh.sizes.get("gpus", 1)
     backend_ctrl = Controller(proc_mesh._proc_mesh)
-    client = MeshClient(backend_ctrl, proc_mesh.size(), gpus)
+    client = MeshClient(cast("TController", backend_ctrl), proc_mesh.size(), gpus)
     dm = DeviceMesh(
         client,
         NDSlice.new_row_major(list(proc_mesh.sizes.values())),
@@ -268,3 +238,95 @@ def spawn_tensor_engine(proc_mesh: "ProcMesh") -> DeviceMesh:
     )
     dm.exit = lambda: client.shutdown()
     return dm
+
+
+class RemoteException(Exception):
+    def __init__(
+        self,
+        worker_error_string: str,  # this should really be an exception + stacktrace but
+        # worker code needs major refactor to make this possible
+        controller_frames: List[traceback.FrameSummary],
+        rank: int,
+    ):
+        self.worker_error_string = worker_error_string
+        self.controller_frames = controller_frames
+        self.rank = rank
+
+    def __str__(self):
+        try:
+            controller_tb = "".join(traceback.format_list(self.controller_frames))
+            return (
+                f"A remote function has failed asynchronously on rank {self.rank}.\n"
+                f"Traceback of where the remote function was issued on controller (most recent call last):\n{controller_tb}"
+                f"Error as reported from worker:\n{self.worker_error_string}"
+            )
+        except Exception:
+            traceback.print_exc()
+            return "<exception formatting RemoteException>"
+
+
+def actor_send(
+    endpoint: ActorEndpoint,
+    args_kwargs_tuple: bytes,
+    refs: Sequence[Any],
+    port: Optional[Port[Any]],
+    selection: Selection,
+):
+    unflatten_args = [
+        UnflattenArg.PyObject if isinstance(ref, Tensor) else UnflattenArg.Mailbox
+        for ref in refs
+    ]
+    tensors = [ref for ref in refs if isinstance(ref, Tensor)]
+    # we have some monarch references, we need to ensure their
+    # proc_mesh matches that of the tensors we sent to it
+    chosen_stream = stream._active
+    for t in tensors:
+        if hasattr(t, "stream"):
+            chosen_stream = t.stream
+            break
+    with InputChecker(refs, lambda x: f"actor_call({x})") as checker:
+        checker.check_mesh_stream_local(device_mesh._active, chosen_stream)
+        # TODO: move propagators into Endpoint abstraction and run the propagator to get the
+        # mutates
+        checker.check_permission(())
+    selected_device_mesh = (
+        endpoint._actor_mesh._proc_mesh and endpoint._actor_mesh._proc_mesh._device_mesh
+    )
+    if selected_device_mesh is not checker.mesh:
+        raise ValueError(
+            f"monarch Tensors sent to an actor must be located on the same process as the actor. However {checker.mesh} is not {selected_device_mesh}."
+            "NYI: better serialization of mesh names to make the mismatch more clear."
+        )
+
+    client = cast(MeshClient, checker.mesh.client)
+
+    broker_id: Tuple[str, int] = client._mesh_controller.broker_id
+
+    stream_ref = chosen_stream._to_ref(client)
+
+    fut = (port, checker.mesh._ndslice) if port is not None else None
+
+    ident = client.new_node([], tensors, cast("OldFuture", fut))
+
+    # To ensure that both the actor and the stream execute in order, we send a message
+    # to each at this point. The message to the worker will be handled on the stream actor where
+    # it will send the 'tensor's to the broker actor locally, along with a response port with the
+    # computed value.
+
+    # The message to the generic actor tells it to first wait on the broker to get the local arguments
+    # from the stream, then it will run the actor method, and send the result to response port.
+
+    actor_msg = PythonMessage(
+        PythonMessageKind.CallMethodIndirect(
+            endpoint._name, broker_id, ident, unflatten_args
+        ),
+        args_kwargs_tuple,
+    )
+    endpoint._actor_mesh.cast(actor_msg, selection)
+    worker_msg = SendResultOfActorCall(ident, broker_id, tensors, [], stream_ref)
+    client.send(checker.mesh._ndslice, worker_msg)
+    # we have to ask for status updates
+    # from workers to be sure they have finished
+    # enough work to count this future as finished,
+    # and all potential errors have been reported
+    client._request_status()
monarch/monarch_controller
CHANGED
Binary file
monarch/opaque_module.py
CHANGED
@@ -9,7 +9,7 @@ from typing import List
 import torch
 from monarch.common.function_caching import TensorGroup, TensorGroupPattern
 from monarch.common.opaque_ref import OpaqueRef
-from monarch.common.remote import remote
+from monarch.common.remote import call_on_shard_and_fetch, remote
 from monarch.common.tensor_factory import TensorFactory
 from monarch.common.tree import flatten
 from monarch.opaque_object import _fresh_opaque_ref, OpaqueObject
@@ -144,11 +144,9 @@ class OpaqueModule:

     def parameters(self):
         if self._parameters is None:
-            tensor_group_pattern = (
-                remote(_get_parameters_shape)
-
-                .result()
-            )
+            tensor_group_pattern = call_on_shard_and_fetch(
+                remote(_get_parameters_shape), self._object
+            ).result()
             self._parameters = [
                 p.requires_grad_(True)
                 for p in remote(
monarch/opaque_object.py
CHANGED
@@ -14,7 +14,7 @@ from monarch.common.function import (
 )

 from monarch.common.opaque_ref import OpaqueRef
-from monarch.common.remote import remote
+from monarch.common.remote import call_on_shard_and_fetch, remote


 def _invoke_method(obj: OpaqueRef, method_name: str, *args, **kwargs):
@@ -83,6 +83,6 @@ class OpaqueObject(OpaqueRef):
         return endpoint(self, method_name, *args, **kwargs)

     def call_method_on_shard_and_fetch(self, method_name, *args, **kwargs):
-        return
-            self, method_name, *args, **kwargs
+        return call_on_shard_and_fetch(
+            remote(_invoke_method), self, method_name, *args, **kwargs
         )