PyPI - torchmonarch-nightly - Versions diffs - 2025.6.8__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.6.8__cp310-cp310-manylinux2014_x86_64.whl → 2025.6.10__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

monarch/_rust_bindings.so CHANGED Viewed

Binary file

monarch/actor_mesh.py CHANGED Viewed

@@ -83,7 +83,7 @@ class MonarchContext:
 _context: contextvars.ContextVar[MonarchContext] = contextvars.ContextVar(
-    "monarch.service._context"
+    "monarch.actor_mesh._context"
 )
@@ -677,7 +677,7 @@ class ActorError(Exception):
     def __init__(
         self,
         exception: Exception,
-        message: str = "A remote service call has failed asynchronously.",
+        message: str = "A remote actor call has failed asynchronously.",
     ) -> None:
         self.exception = exception
         self.actor_mesh_ref_frames: StackSummary = extract_tb(exception.__traceback__)
@@ -688,7 +688,7 @@ class ActorError(Exception):
         actor_mesh_ref_tb = "".join(traceback.format_list(self.actor_mesh_ref_frames))
         return (
             f"{self.message}\n"
-            f"Traceback of where the service call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
+            f"Traceback of where the remote call failed (most recent call last):\n{actor_mesh_ref_tb}{type(self.exception).__name__}: {exe}"
         )

monarch/mesh_controller.py ADDED Viewed

@@ -0,0 +1,209 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import traceback
+from collections import deque
+from logging import Logger
+from typing import List, NamedTuple, Optional, Union
+import torch.utils._python_dispatch
+from monarch import NDSlice
+from monarch._rust_bindings.monarch_extension import client, debugger
+from monarch._rust_bindings.monarch_extension.client import (  # @manual=//monarch/monarch_extension:monarch_extension
+    WorldState,
+)
+from monarch._rust_bindings.monarch_extension.mesh_controller import _Controller
+from monarch._rust_bindings.monarch_hyperactor.proc import (  # @manual=//monarch/monarch_extension:monarch_extension
+    ActorId,
+)
+from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
+from monarch._rust_bindings.monarch_messages.debugger import DebuggerAction
+from monarch.common.client import Client
+from monarch.common.controller_api import LogMessage, MessageResult
+from monarch.common.device_mesh import DeviceMesh, no_mesh
+from monarch.common.invocation import DeviceException, RemoteException
+from monarch.controller.debugger import read as debugger_read, write as debugger_write
+from monarch.proc_mesh import ProcMesh
+from pyre_extensions import none_throws
+logger: Logger = logging.getLogger(__name__)
+class Controller(_Controller):
+    """Python-side wrapper around the Rust-backed ``_Controller`` binding.
+
+    It converts the raw ``client.*`` messages produced by the base class into
+    the ``LogMessage`` / ``MessageResult`` types from
+    ``monarch.common.controller_api`` and drives interactive debugger sessions
+    when a ``client.DebuggerMessage`` arrives.
+    """
+
+    def __init__(self, workers: HyProcMesh) -> None:
+        """Initialise the base controller and the two message queues.
+
+        NOTE(review): ``workers`` is accepted but not used anywhere in this
+        method; the base class is constructed without arguments.
+        """
+        super().__init__()
+        # Buffer for messages unrelated to debugging that are received while a
+        # debugger session is active.
+        self._non_debugger_pending_messages: deque[
+            Optional[client.LogMessage | client.WorkerResponse]
+        ] = deque()
+        # Debugger actors that reported ``Paused`` and still await their turn.
+        self._pending_debugger_sessions: deque[ActorId] = deque()
+    def next_message(
+        self, timeout: Optional[float]
+    ) -> Optional[LogMessage | MessageResult]:
+        """Return the next converted message, or ``None`` if there is none.
+
+        Messages buffered during an earlier debugger session are served first;
+        otherwise the base class is polled. ``timeout`` is given in seconds and
+        converted to whole milliseconds; ``None`` is treated as ``0``.
+
+        A ``client.DebuggerMessage`` starts the debugger loop, after which this
+        method falls off the end and therefore returns ``None``. Any other
+        unrecognised message type also results in ``None``.
+        """
+        if self._non_debugger_pending_messages:
+            msg = self._non_debugger_pending_messages.popleft()
+        else:
+            msg = self._get_next_message(timeout_msec=int((timeout or 0.0) * 1000.0))
+        if msg is None:
+            return None
+        if isinstance(msg, client.WorkerResponse):
+            return _worker_response_to_result(msg)
+        elif isinstance(msg, client.LogMessage):
+            return LogMessage(msg.level, msg.message)
+        elif isinstance(msg, client.DebuggerMessage):
+            self._run_debugger_loop(msg)
+    def send(
+        self,
+        ranks: Union[NDSlice, List[NDSlice]],
+        msg: NamedTuple,
+    ) -> None:
+        """Forward ``msg`` for ``ranks`` to the base class ``send``.
+
+        The call runs with the current torch dispatch modes disabled.
+        NOTE(review): presumably this keeps active ``__torch_dispatch__`` modes
+        from intercepting tensor operations performed while the message is
+        converted -- confirm against the Rust binding.
+        """
+        with torch.utils._python_dispatch._disable_current_modes():
+            return super().send(ranks, msg)
+    def drain_and_stop(
+        self,
+    ) -> List[LogMessage | MessageResult | client.DebuggerMessage]:
+        """Collect and convert every message yielded by ``_drain_and_stop``.
+
+        Worker responses and log messages are converted exactly as in
+        ``next_message``; debugger messages are returned unconverted.
+
+        Raises:
+            RuntimeError: if a message of any other type is encountered.
+        """
+        logger.info("rust controller shutting down")
+        results = []
+        for msg in self._drain_and_stop():
+            if isinstance(msg, client.WorkerResponse):
+                results.append(_worker_response_to_result(msg))
+            elif isinstance(msg, client.LogMessage):
+                results.append(LogMessage(msg.level, msg.message))
+            elif isinstance(msg, client.DebuggerMessage):
+                results.append(msg)
+            else:
+                raise RuntimeError(f"Unexpected message type {type(msg)}")
+        return results
+    def _run_debugger_loop(self, message: client.DebuggerMessage) -> None:
+        """Serve debugger sessions until no paused debugger actor is left.
+
+        ``message`` must carry a ``Paused`` action. Sessions are handled one at
+        a time in arrival order; ``Paused`` messages from other actors are
+        queued, and non-debugger messages are buffered for ``next_message``.
+
+        Raises:
+            RuntimeError: if ``message`` is not ``Paused``, if another actor
+                sends a non-``Paused`` debugger message during a session, or
+                if the attached actor sends an unrecognised action.
+        """
+        if not isinstance(message.action, DebuggerAction.Paused):
+            raise RuntimeError(
+                f"Unexpected debugger message {message} when no debugger session is running"
+            )
+        self._pending_debugger_sessions.append(message.debugger_actor_id)
+        while self._pending_debugger_sessions:
+            debugger_actor_id = self._pending_debugger_sessions.popleft()
+            rank = debugger_actor_id.rank
+            proc_id = debugger_actor_id.proc_id
+            # Announce the session through the debugger output channel.
+            debugger_write(
+                f"pdb attached to proc {proc_id} with rank {rank}, debugger actor {debugger_actor_id} \n"
+            )
+            self._debugger_attach(debugger_actor_id)
+            while True:
+                # TODO: Add appropriate timeout.
+                msg = self._get_next_message(timeout_msec=None)
+                if not isinstance(msg, client.DebuggerMessage):
+                    # Not debugger traffic: keep it for a later next_message().
+                    self._non_debugger_pending_messages.append(msg)
+                    continue
+                if msg.debugger_actor_id != debugger_actor_id:
+                    if isinstance(msg.action, DebuggerAction.Paused):
+                        # Another actor hit a breakpoint; serve it afterwards.
+                        self._pending_debugger_sessions.append(msg.debugger_actor_id)
+                        continue
+                    else:
+                        raise RuntimeError(
+                            f"unexpected debugger message {msg} from rank {msg.debugger_actor_id.rank} "
+                            f"when debugging rank {debugger_actor_id.rank}"
+                        )
+                action = msg.action
+                if isinstance(action, DebuggerAction.Detach):
+                    # Session finished; move on to the next queued actor.
+                    break
+                elif isinstance(action, DebuggerAction.Read):
+                    # The actor asks for input: read it and send it back.
+                    self._debugger_write(
+                        debugger_actor_id, debugger_read(action.requested_size)
+                    )
+                elif isinstance(action, DebuggerAction.Write):
+                    # The actor produced output: decode the bytes and emit them.
+                    debugger_write(
+                        debugger.get_bytes_from_write_action(action).decode()
+                    )
+                else:
+                    raise RuntimeError(
+                        f"unexpected debugger message {msg} when debugging rank {debugger_actor_id.rank}"
+                    )
+    def worker_world_state(self) -> WorldState:
+        """Not supported by this controller; always raises ``NotImplementedError``."""
+        raise NotImplementedError("worker world state")
+    def stop_mesh(self) -> None:
+        """Do nothing; this controller has no mesh-stop action implemented."""
+        # I think this is a noop?
+        pass
+# TODO: Handling conversion of the response can move to a separate module over time
+# especially as we have structured error messages.
+def _worker_response_to_result(result: client.WorkerResponse) -> MessageResult:
+    if not result.is_exception():
+        # The result of the message needs to be unwrapped on a real device.
+        # Staying as a fake tensor will fail the tensor deserialization.
+        with no_mesh.activate():
+            return MessageResult(result.seq, result.result(), None)
+    exc = none_throws(result.exception())
+    if isinstance(exc, client.Error):
+        worker_frames = [
+            traceback.FrameSummary("<unknown>", None, frame)
+            for frame in exc.backtrace.split("\\n")
+        ]
+        logger.error(f"Worker {exc.actor_id} failed")
+        return MessageResult(
+            seq=result.seq,
+            result=None,
+            error=RemoteException(
+                seq=exc.caused_by_seq,
+                exception=RuntimeError(exc.backtrace),
+                controller_frame_index=0,  # TODO: T225205291 fix this once we have recording support in rust
+                controller_frames=None,
+                worker_frames=worker_frames,
+                source_actor_id=exc.actor_id,
+                message=f"Worker {exc.actor_id} failed",
+            ),
+        )
+    elif isinstance(exc, client.Failure):
+        frames = [
+            traceback.FrameSummary("<unknown>", None, frame)
+            for frame in exc.backtrace.split("\n")
+        ]
+        reason = f"Actor {exc.actor_id} crashed on {exc.address}, check the host log for details"
+        logger.error(reason)
+        return MessageResult(
+            seq=0,  # seq is not consumed for DeviceException; it will be directly thrown by the client
+            result=None,
+            error=DeviceException(
+                exception=RuntimeError(reason),
+                frames=frames,
+                source_actor_id=exc.actor_id,
+                message=reason,
+            ),
+        )
+    else:
+        raise RuntimeError(f"Unknown exception type: {type(exc)}")
+def spawn_tensor_engine(proc_mesh: ProcMesh) -> DeviceMesh:
+    # This argument to Controller
+    # is currently only used for debug printing. It should be fixed to
+    # report the proc ID instead of the rank it currently does.
+    gpus = proc_mesh.sizes.get("gpus", 1)
+    backend_ctrl = Controller(proc_mesh._proc_mesh)
+    client = Client(backend_ctrl, proc_mesh.size(), gpus)
+    dm = DeviceMesh(
+        client,
+        NDSlice.new_row_major(list(proc_mesh.sizes.values())),
+        tuple(proc_mesh.sizes.keys()),
+    )
+    dm.exit = lambda: client.shutdown()
+    return dm

monarch/monarch_controller CHANGED Viewed

Binary file

tests/test_python_actors.py CHANGED Viewed

@@ -7,7 +7,12 @@
 import operator
 from types import ModuleType
+import monarch
+import pytest
 import torch
 from monarch.actor_mesh import (
     Accumulator,
     Actor,
@@ -17,6 +22,8 @@ from monarch.actor_mesh import (
     endpoint,
 )
+from monarch.mesh_controller import spawn_tensor_engine
 from monarch.proc_mesh import local_proc_mesh, proc_mesh
 from monarch.rdma import RDMABuffer
@@ -375,3 +382,20 @@ def test_rust_binding_modules_correct() -> None:
                 assert value.__module__ == path
     check(bindings, "monarch._rust_bindings")
+def test_tensor_engine() -> None:
+    pm = proc_mesh(gpus=2).get()
+    dm = spawn_tensor_engine(pm)
+    with dm.activate():
+        r = monarch.inspect(2 * torch.zeros(3, 4))
+    fm = dm.flatten("all")
+    with fm.activate():
+        f = monarch.inspect(2 * torch.zeros(3, 4), all=1)
+    assert torch.allclose(torch.zeros(3, 4), r)
+    assert torch.allclose(torch.zeros(3, 4), f)
+    dm.exit()

tests/test_rust_backend.py CHANGED Viewed

@@ -14,6 +14,7 @@ import monarch
 import pytest
 import torch
+import torch.utils._python_dispatch
 from monarch import fetch_shard, no_mesh, remote, Stream
 from monarch.common.device_mesh import DeviceMesh
 from monarch.rust_local_mesh import local_meshes, LoggingLocation, SocketType
@@ -180,3 +181,37 @@ class TestRustBackend(TestCase):
             self.assertIsNotNone(mesh_info.mesh_labels)
             self.assertEqual(len(mesh_info.devices_labels), 2)
+    def test_ivalue_problems(self) -> None:
+        """Convert a ``CommandGroup`` holding a ``CallFunction`` to a Rust message.
+
+        The test has no assertions: it passes when ``to_rust_message()``
+        completes without raising while the current dispatch modes are
+        disabled.
+
+        NOTE(review): ``local_mesh`` is not among the imports visible in this
+        hunk -- presumably it is defined or imported elsewhere in the file.
+        """
+        with local_mesh(hosts=1, gpu_per_host=1):
+            from typing import cast
+            from monarch.common.messages import CallFunction, CommandGroup
+            a = cast(monarch.Tensor, torch.rand(3, 4))
+            # A second tensor handle built from the same fake/mesh/stream,
+            # passed as the second CallFunction argument below.
+            result = monarch.Tensor(a._fake, a.mesh, a.stream)
+            msg = CallFunction(
+                0,
+                result,
+                (),
+                monarch.common.function.ResolvableFunctionFromPath(
+                    "torch.ops.aten.mul.Tensor"
+                ),
+                (2, a),
+                {},
+                a.stream._to_ref(a.mesh.client),
+                a.mesh,
+                [],
+            )
+            # Internally, this will call CallFunction(...).to_rust_message().
+            # The 2 arg will be converted to an IValue tensor via rust + C++.
+            # Then when the CommandGroup message gets converted to rust, it
+            # will attempt to clone the rust CallFunction message, which will
+            # attempt to clone the IValue tensor, which will cause a crash.
+            # Upon attempting to clone the IValue tensor, our custom __torch_dispatch__
+            # intercepts the following two calls:
+            #   aten._to_copy.default () (2,) {'dtype': torch.float64, 'device': device(type='cpu')}
+            #   aten.clone.default () (2,) {}
+            # Disabling the current dispatch modes is the workaround under test.
+            with torch.utils._python_dispatch._disable_current_modes():
+                CommandGroup([msg]).to_rust_message()

{torchmonarch_nightly-2025.6.8.dist-info → torchmonarch_nightly-2025.6.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchmonarch-nightly
-Version: 2025.6.8
+Version: 2025.6.10
 Summary: Monarch: Single controller library
 Author: Meta
 Author-email: oncall+monarch@xmail.facebook.com

{torchmonarch_nightly-2025.6.8.dist-info → torchmonarch_nightly-2025.6.10.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 monarch/__init__.py,sha256=iUvWHc0-7Q2tovRoRxOIiA3TsefMXCbWl-jEfQ2djew,6897
-monarch/_rust_bindings.so,sha256=HiisXwHtZrYKATL6RdJxw2u_y7Wjgjtwt52V1LIR6ss,39151608
+monarch/_rust_bindings.so,sha256=0-svsKnUJboaOBd5i-LOfpHiRRAgVLX_1Hq_YYREQi8,39756680
 monarch/_testing.py,sha256=MN8DK1e-wzV0-R_nFW1b_7-O5oKfWvZ12BMGD4Z7PQk,6755
-monarch/actor_mesh.py,sha256=5DbU9OrmNk5I9yasmE-rkTgHyO07oiLlAG0jbJBOXgI,23000
+monarch/actor_mesh.py,sha256=AKdjPg3FM6Yt35uFPBnP7fNVEu6busu5BXVWLwjU2A4,23000
 monarch/allocator.py,sha256=_2DKFP9pSD33zDgH7xZJC8Tq7BQrCeQEUmMB7_xCT0Y,1784
 monarch/bootstrap_main.py,sha256=SYTOz-pTXiJNk78PPD5HAOJDSb8t2JfitRWdmWB3ogo,2559
 monarch/cached_remote_function.py,sha256=kYdB6r4OHx_T_uX4q3tCNcp1t2DJwF8tPTIahUiT2pU,8785
@@ -9,7 +9,8 @@ monarch/fetch.py,sha256=61jxo7sx4QNUTkc0_rF5NaJROen4tKbAaiIjrXWLOvg,1705
 monarch/future.py,sha256=lcdFEe7m1shYPPuvZ1RkS6JUIChEKGBWe3v7x_nu4Hg,731
 monarch/gradient_generator.py,sha256=Rl3dmXGceTdCc1mYBg2JciR88ywGPnW7TVkL86KwqEA,6366
 monarch/memory.py,sha256=ol86dBhFAJqg78iF25-BuK0wuwj1onR8FIioZ_B0gjw,1377
-monarch/monarch_controller,sha256=5TKjcz7U7K8OttrwYv-w7yYtPUm2aMOQV4gt0u_Vj5c,20385960
+monarch/mesh_controller.py,sha256=A3G8Z5S0w3mjCVI2r6YGM6K3BUs3ZHU8PFo6kCaYTU4,8615
+monarch/monarch_controller,sha256=Q1eR_EVJqDQLrJZ_6p1ldxVDAU1OmN5lSSuctDcaAFY,20396832
 monarch/notebook.py,sha256=zu9MKDFKf1-rCM2TqFSRJjMBeiWuKcJSyUFLvoZRQzs,25949
 monarch/opaque_module.py,sha256=oajOu_WD1hD4hxE8HDdO-tvWY7KDHWd7VaAhJEa5L2I,10446
 monarch/opaque_object.py,sha256=IVpll4pyuKZMo_EnPh4s0qnx8RlAcJrJ1yoLX6E75wQ,2782
@@ -139,9 +140,9 @@ tests/test_future.py,sha256=cXzaNi2YDwVyjR541ScXmgktX1YFsKzbl8wep0DMVbk,3032
 tests/test_grad_generator.py,sha256=p4Pm4kMEeGldt2jUVAkGKCB0mLccKI28pltH6OTGbQA,3412
 tests/test_mock_cuda.py,sha256=5hisElxeLJ5MHw3KM9gwxBiXiMaG-Rm382u3AsQcDOI,3068
 tests/test_pdb_actor.py,sha256=5KJhuhcZDPWMdjC6eAtDdwnz1W7jNFXvIrMSFaCWaPw,3858
-tests/test_python_actors.py,sha256=fDvHUIWNZeL3CWnTJMbdh98i1tnH1-LJEG1pIFkGYF8,10898
+tests/test_python_actors.py,sha256=gP6MDN2BL282qInUGP9untlpsqqB2uy1Iq5gUXnXcUo,11387
 tests/test_remote_functions.py,sha256=ExqYlRQWRabpGBuKvNIOa8Hwj-iXuP87Jfb9i5RhaGs,50066
-tests/test_rust_backend.py,sha256=nXSa0ZQ0NniZm4PzvKhrWvVLD-RKvIWYkPXm1BEBXq8,6235
+tests/test_rust_backend.py,sha256=94S3R995ZkyIhEiBsM5flcjf5X7bscEAHBtInbTRFe8,7776
 tests/test_signal_safe_block_on.py,sha256=bmal0XgzJowZXJV6T1Blow5a-vZluYWusCThLMGxyTE,3336
 tests/test_sim_backend.py,sha256=RckCkHO3DxKsAGdZMcIzRnd6YJXwDim1D5-xbBbgKio,1473
 tests/simulator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -149,9 +150,9 @@ tests/simulator/test_profiling.py,sha256=TGYCfzTLdkpIwnOuO6KApprmrgPIRQe60KRX3wk
 tests/simulator/test_simulator.py,sha256=LO8lA0ssY-OGEBL5ipEu74f97Y765TEwfUOv-DtIptM,14568
 tests/simulator/test_task.py,sha256=ipqBDuDAysuo1xOB9S5psaFvwe6VATD43IovCTSs0t4,2327
 tests/simulator/test_worker.py,sha256=QrWWIJ3HDgDLkBPRc2mwYPlOQoXQcj1qRfc0WUfKkFY,3507
-torchmonarch_nightly-2025.6.8.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
-torchmonarch_nightly-2025.6.8.dist-info/METADATA,sha256=AfGuuk6TyhejOLotJWjRt3Hsl80lkEWS4iOaZ61YHj4,2771
-torchmonarch_nightly-2025.6.8.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
-torchmonarch_nightly-2025.6.8.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
-torchmonarch_nightly-2025.6.8.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
-torchmonarch_nightly-2025.6.8.dist-info/RECORD,,
+torchmonarch_nightly-2025.6.10.dist-info/licenses/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchmonarch_nightly-2025.6.10.dist-info/METADATA,sha256=DR1GtSFqtqsjhKWi38uGcvhw2p3ycHYSOwDmsErwLj0,2772
+torchmonarch_nightly-2025.6.10.dist-info/WHEEL,sha256=_wZSFk0d90K9wOBp8Q-UGxshyiJ987JoPiyUBNC6VLk,104
+torchmonarch_nightly-2025.6.10.dist-info/entry_points.txt,sha256=sqfQ16oZqjEvttUI-uj9BBXIIE6jt05bYFSmy-2hyXI,106
+torchmonarch_nightly-2025.6.10.dist-info/top_level.txt,sha256=E-ZssZzyM17glpVrh-S9--qJ-w9p2EjuYOuNw9tQ4Eg,33
+torchmonarch_nightly-2025.6.10.dist-info/RECORD,,

{torchmonarch_nightly-2025.6.8.dist-info → torchmonarch_nightly-2025.6.10.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.8.dist-info → torchmonarch_nightly-2025.6.10.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.8.dist-info → torchmonarch_nightly-2025.6.10.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{torchmonarch_nightly-2025.6.8.dist-info → torchmonarch_nightly-2025.6.10.dist-info}/top_level.txt RENAMED Viewed

File without changes