PyPI - torchmonarch-nightly - Versions diffs - 2025.9.9__cp312-cp312-manylinux2014_x86_64.whl → 2025.9.11__cp312-cp312-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.9.9__cp312-cp312-manylinux2014_x86_64.whl → 2025.9.11__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

monarch/__init__.py +7 -0
monarch/_rust_bindings.so +0 -0
monarch/_src/actor/actor_mesh.py +1 -1
monarch/_src/actor/bootstrap_main.py +7 -2
monarch/_src/actor/debugger/breakpoint.py +30 -0
monarch/_src/actor/debugger/debug_command.py +183 -0
monarch/_src/actor/debugger/debug_controller.py +246 -0
monarch/_src/actor/debugger/debug_io.py +68 -0
monarch/_src/actor/debugger/debug_session.py +249 -0
monarch/_src/actor/debugger/pdb_wrapper.py +1 -1
monarch/_src/actor/host_mesh.py +10 -2
monarch/_src/actor/pickle.py +4 -10
monarch/_src/actor/proc_mesh.py +80 -19
monarch/_src/tensor_engine/rdma.py +2 -0
monarch/actor/__init__.py +1 -1
monarch/gradient/_gradient_generator.so +0 -0
monarch/monarch_controller +0 -0
monarch/tools/cli.py +26 -0
monarch/tools/commands.py +15 -0
monarch/tools/debug_env.py +34 -0
monarch/tools/mesh_spec.py +2 -0
tests/test_allocator.py +18 -9
tests/test_debugger.py +29 -25
tests/test_mock_cuda.py +11 -3
torchmonarch_nightly-2025.9.11.data/scripts/process_allocator +0 -0
{torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/METADATA +1 -1
{torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/RECORD +31 -29
monarch/_src/actor/debugger/debugger.py +0 -737
monarch/_src/debug_cli/__init__.py +0 -7
monarch/_src/debug_cli/debug_cli.py +0 -43
monarch/debug_cli/__init__.py +0 -7
monarch/debug_cli/__main__.py +0 -12
{torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/entry_points.txt +0 -0
{torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/top_level.txt +0 -0

monarch/__init__.py CHANGED Viewed

@@ -9,6 +9,13 @@
 from importlib import import_module as _import_module
 from typing import TYPE_CHECKING
+# Import before monarch to pre-load torch DSOs as, in exploded wheel flows,
+# our RPATHs won't correctly find them.
+try:
+    import torch  # noqa: F401
+except ImportError:
+    pass
 # submodules of monarch should not be imported in this
 # top-level file because it will cause them to get
 # loaded even if they are not actually being used.

monarch/_rust_bindings.so CHANGED Viewed

Binary file

monarch/_src/actor/actor_mesh.py CHANGED Viewed

@@ -953,7 +953,7 @@ class _Actor:
         DebugContext.set(DebugContext())
     def _post_mortem_debug(self, exc_tb) -> None:
-        from monarch._src.actor.debugger.debugger import debug_controller
+        from monarch._src.actor.debugger.debug_controller import debug_controller
         if (pdb_wrapper := DebugContext.get().pdb_wrapper) is not None:
             with fake_sync_state():

monarch/_src/actor/bootstrap_main.py CHANGED Viewed

@@ -17,6 +17,12 @@ import multiprocessing
 import os
 import sys
+# Import torch to avoid import-time races if a spawned actor tries to import torch.
+try:
+    import torch  # @manual  # noqa: F401
+except ImportError:
+    pass
 async def main():
     from monarch._rust_bindings.monarch_hyperactor.bootstrap import bootstrap_main
@@ -32,7 +38,6 @@ def invoke_main():
     global bootstrap_main
     # TODO: figure out what from worker_main.py we should reproduce here.
     from monarch._src.actor.telemetry import TracingForwarder  # noqa
     if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
@@ -56,7 +61,7 @@ def invoke_main():
     except Exception as e:
         logging.warning(f"Failed to set up py-spy: {e}")
-    from monarch._src.actor.debugger.debugger import remote_breakpointhook
+    from monarch._src.actor.debugger.breakpoint import remote_breakpointhook
     sys.breakpointhook = remote_breakpointhook

monarch/_src/actor/debugger/breakpoint.py ADDED Viewed

@@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import inspect
+from monarch._src.actor.actor_mesh import context, DebugContext
+from monarch._src.actor.debugger.debug_controller import debug_controller
+from monarch._src.actor.debugger.pdb_wrapper import PdbWrapper
+def remote_breakpointhook() -> None:
+    """
+    Breakpoint hook that opens a remote pdb session for the calling actor.
+    Builds a PdbWrapper from the current actor context (message rank,
+    per-dimension coordinates and actor id) together with the debug
+    controller, records it in the DebugContext, and starts tracing in the
+    frame that triggered the breakpoint.
+    """
+    frame = inspect.currentframe()
+    assert frame is not None
+    # Step out of this hook so that tracing begins in the caller's frame.
+    frame = frame.f_back
+    assert frame is not None
+    ctx = context()
+    rank = ctx.message_rank
+    pdb_wrapper = PdbWrapper(
+        rank.rank,
+        # Mapping from each key of the message rank to its value.
+        {k: rank[k] for k in rank},
+        ctx.actor_instance.actor_id,
+        debug_controller(),
+    )
+    DebugContext.set(DebugContext(pdb_wrapper))
+    pdb_wrapper.set_trace(frame)

monarch/_src/actor/debugger/debug_command.py ADDED Viewed

@@ -0,0 +1,183 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import sys
+from dataclasses import dataclass
+from typing import cast, Dict, List, Tuple, Union
+from monarch._src.actor.debugger.debug_io import DebugIO
+RanksType = Union[int, List[int], range, Dict[str, Union[range, List[int], int]]]
+_debug_input_parser = None
+# Wrap the parser in a function so that jobs don't have to import lark
+# unless they want to use the debugger.
+def _get_debug_input_parser():
+    """
+    Return the Lark parser for debugger input, building it on first use.
+    The parser is stored in the module-level _debug_input_parser so that
+    later calls reuse it. The grammar's entry rule is "command".
+    """
+    global _debug_input_parser
+    if _debug_input_parser is None:
+        from lark import Lark
+        _debug_input_parser = Lark(
+            """
+            rank_list: INT "," INT ("," INT)*
+            start: INT?
+            stop: INT?
+            step: INT?
+            rank_range: start ":" stop (":" step)?
+            dim: CNAME "=" (rank_range | "(" rank_list ")" | INT)
+            dims: dim ("," dim)*
+            ranks: "ranks(" (dims | rank_range | rank_list | INT) ")"
+            pdb_command: /\\w+.*/
+            actor_name: /[-_a-zA-Z0-9]+/
+            cast: "cast" _WS actor_name ranks pdb_command
+            help: "h" | "help"
+            attach: ("a" | "attach") _WS actor_name INT
+            cont: "c" | "continue"
+            quit: "q" | "quit"
+            list: "l" | "list"
+            command: attach | list | cast | help | cont | quit
+            _WS: WS+
+            %import common.INT
+            %import common.CNAME
+            %import common.WS
+            %ignore WS
+            """,
+            start="command",
+        )
+    return _debug_input_parser
+_debug_input_transformer = None
+# Wrap the transformer in a function so that jobs don't have to import lark
+# unless they want to use the debugger.
+def _get_debug_input_transformer():
+    """
+    Return the Lark transformer that turns a parse tree into a DebugCommand.
+    The transformer instance is stored in the module-level
+    _debug_input_transformer and created on first use.
+    """
+    global _debug_input_transformer
+    if _debug_input_transformer is None:
+        from lark import Transformer
+        from lark.lexer import Token
+        # Each method below is named after a rule of the grammar built in
+        # _get_debug_input_parser and receives that rule's children.
+        class _IntoDebugCommandTransformer(Transformer):
+            def rank_list(self, items: List[Token]) -> List[int]:
+                return [int(item.value) for item in items]
+            # start/stop/step are optional in the grammar; when omitted they
+            # default to 0, sys.maxsize and 1 respectively.
+            def start(self, items: List[Token]) -> int:
+                if len(items) == 0:
+                    return 0
+                return int(items[0].value)
+            def stop(self, items: List[Token]) -> int:
+                if len(items) == 0:
+                    return sys.maxsize
+                return int(items[0].value)
+            def step(self, items: List[Token]) -> int:
+                if len(items) == 0:
+                    return 1
+                return int(items[0].value)
+            def rank_range(self, items: List[int]) -> range:
+                return range(*items)
+            def dim(
+                self, items: Tuple[Token, Union[range, List[int], Token]]
+            ) -> Tuple[str, Union[range, List[int], int]]:
+                # items[0] is the dimension name; items[1] is an already
+                # transformed range or list, or a raw INT token.
+                if isinstance(items[1], range):
+                    return (items[0].value, cast(range, items[1]))
+                elif isinstance(items[1], list):
+                    return (items[0].value, cast(List[int], items[1]))
+                else:
+                    return (items[0].value, int(cast(Token, items[1]).value))
+            def dims(
+                self, items: List[Tuple[str, Union[range, List[int], int]]]
+            ) -> Dict[str, Union[range, List[int], int]]:
+                return {dim[0]: dim[1] for dim in items}
+            def ranks(self, items: List[Union[RanksType, Token]]) -> RanksType:
+                # A bare INT arrives as a Token; every other alternative has
+                # already been transformed into its Python value.
+                if isinstance(items[0], Token):
+                    return int(cast(Token, items[0]).value)
+                return cast(RanksType, items[0])
+            def pdb_command(self, items: List[Token]) -> str:
+                return items[0].value
+            def actor_name(self, items: List[Token]) -> str:
+                return items[0].value
+            def help(self, _items: List[Token]) -> "Help":
+                return Help()
+            def attach(self, items: Tuple[str, Token]) -> "Attach":
+                return Attach(items[0], int(items[1].value))
+            def cont(self, _items: List[Token]) -> "Continue":
+                return Continue()
+            def quit(self, _items: List[Token]) -> "Quit":
+                return Quit()
+            def cast(self, items: Tuple[str, RanksType, str]) -> "Cast":
+                return Cast(*items)
+            def list(self, items: List[Token]) -> "ListCommand":
+                return ListCommand()
+            def command(self, items: List["DebugCommand"]) -> "DebugCommand":
+                return items[0]
+        _debug_input_transformer = _IntoDebugCommandTransformer()
+    return _debug_input_transformer
+class DebugCommand:
+    """Base class for the commands accepted at the debugger prompt."""
+    @staticmethod
+    async def parse(debug_io: DebugIO, line: str) -> Union["DebugCommand", None]:
+        """
+        Parse one line of user input into a DebugCommand.
+        Returns None when parsing or transforming fails; in that case the
+        error is reported through debug_io instead of being raised.
+        """
+        try:
+            tree = _get_debug_input_parser().parse(line)
+            return _get_debug_input_transformer().transform(tree)
+        except Exception as e:
+            await debug_io.output(f"Error parsing input: {e}\n")
+            return None
+@dataclass
+class Attach(DebugCommand):
+    """Command to attach to the debug session of one rank of an actor."""
+    actor_name: str
+    rank: int
+@dataclass
+class ListCommand(DebugCommand):
+    """Command to list all debug sessions."""
+    pass
+@dataclass
+class Quit(DebugCommand):
+    """Command to exit the debugger prompt."""
+    pass
+@dataclass
+class Help(DebugCommand):
+    """Command to print the debugger help message."""
+    pass
+@dataclass
+class Continue(DebugCommand):
+    """Command produced by the "c" / "continue" input."""
+    pass
+@dataclass
+class Cast(DebugCommand):
+    """Command to send a pdb command to a selection of ranks of an actor."""
+    actor_name: str
+    # Rank selection as produced by the ranks(...) grammar rule.
+    ranks: RanksType
+    # The pdb command text to send.
+    command: str

monarch/_src/actor/debugger/debug_controller.py ADDED Viewed

@@ -0,0 +1,246 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import asyncio
+import functools
+from typing import Dict, List, Optional, Tuple
+from monarch._src.actor.actor_mesh import Actor
+from monarch._src.actor.debugger.debug_command import (
+    Attach,
+    Cast,
+    Continue,
+    DebugCommand,
+    Help,
+    ListCommand,
+    Quit,
+    RanksType,
+)
+from monarch._src.actor.debugger.debug_io import (
+    DebugCliIO,
+    DebugIO,
+    DebugIOError,
+    DebugStdIO,
+)
+from monarch._src.actor.debugger.debug_session import (
+    DebugSession,
+    DebugSessionInfo,
+    DebugSessions,
+)
+from monarch._src.actor.debugger.pdb_wrapper import DebuggerWrite
+from monarch._src.actor.endpoint import endpoint
+from monarch._src.actor.proc_mesh import get_or_spawn_controller
+from monarch._src.actor.sync_state import fake_sync_state
+from monarch.tools.debug_env import (
+    _get_debug_server_host,
+    _get_debug_server_port,
+    _get_debug_server_protocol,
+)
+from pyre_extensions import none_throws
+from tabulate import tabulate
+class DebugController(Actor):
+    """
+    Single actor for both remote debuggers and users to talk to.
+    Handles multiple sessions simultaneously.
+    """
+    def __init__(self) -> None:
+        # Active debug sessions, looked up by actor name and rank.
+        self.sessions = DebugSessions()
+        # Serializes the hand-over between client connections in _handle_client.
+        self._task_lock = asyncio.Lock()
+        # Task running the interactive loop (_enter) for the current client.
+        self._task: asyncio.Task | None = None
+        # Standard I/O until a client connects and _handle_client swaps in DebugCliIO.
+        self._debug_io: DebugIO = DebugStdIO()
+        # Completed by _serve with the server once it is listening, or with
+        # the exception that stopped it.
+        self._server = asyncio.Future()
+        self._server_task = asyncio.create_task(self._serve())
+    async def _serve(self) -> None:
+        """
+        Start the debug server and serve client connections.
+        Only the "tcp" protocol is supported. The outcome is published through
+        self._server, which debugger_session_start awaits.
+        """
+        try:
+            if (proto := _get_debug_server_protocol()) != "tcp":
+                raise NotImplementedError(
+                    f"Network protocol {proto} not yet supported."
+                )
+            server = await asyncio.start_server(
+                self._handle_client,
+                _get_debug_server_host(),
+                _get_debug_server_port(),
+            )
+            async with server:
+                self._server.set_result(server)
+                await server.serve_forever()
+        except Exception as e:
+            # A completed future cannot take an exception, so if the server
+            # had already started, replace the future before recording the error.
+            if self._server.done():
+                self._server = asyncio.Future()
+            self._server.set_exception(e)
+            raise
+    async def _handle_client(
+        self,
+        reader: asyncio.StreamReader,
+        writer: asyncio.StreamWriter,
+    ) -> None:
+        """Connection callback: start an interactive session on this connection."""
+        # Make sure only one external debug process can
+        # be attached at a time. If a new request is
+        # received, the current task is cancelled.
+        async with self._task_lock:
+            if self._task is not None:
+                self._task.cancel()
+                try:
+                    await none_throws(self._task)
+                except (DebugIOError, asyncio.CancelledError):
+                    pass
+            self._debug_io = DebugCliIO(reader, writer)
+            self._task = asyncio.create_task(self._enter())
+    @endpoint
+    async def wait_pending_session(self):
+        """Return once at least one debug session exists, polling every second."""
+        while len(self.sessions) == 0:
+            await asyncio.sleep(1)
+    @endpoint
+    async def list(self, print_output=True) -> List[DebugSessionInfo]:
+        """
+        Return the sorted info of all debug sessions.
+        When print_output is true, the same info is also written to the
+        current debug I/O as a grid-formatted table.
+        """
+        session_info = sorted(self.sessions.info())
+        if print_output:
+            await self._debug_io.output(
+                tabulate(
+                    (
+                        (
+                            info.actor_name,
+                            info.rank,
+                            info.coords,
+                            info.hostname,
+                            info.function,
+                            info.lineno,
+                        )
+                        for info in session_info
+                    ),
+                    headers=[
+                        "Actor Name",
+                        "Rank",
+                        "Coords",
+                        "Hostname",
+                        "Function",
+                        "Line No.",
+                    ],
+                    tablefmt="grid",
+                )
+                + "\n"
+            )
+        return session_info
+    async def _enter(self) -> None:
+        """
+        Run the interactive "monarch_dbg>" prompt on the current debug I/O.
+        Loops until the user quits. DebugIOError and cancellation propagate to
+        the caller; any other error is reported to the user and the loop continues.
+        """
+        # NOTE(review): the reason for this delay is not evident here — confirm.
+        await asyncio.sleep(0.5)
+        await self._debug_io.output(
+            "\n\n************************ MONARCH DEBUGGER ************************\n"
+        )
+        await self._debug_io.output("Enter 'help' for a list of commands.\n")
+        await self._debug_io.output("Enter 'list' to show all active breakpoints.\n\n")
+        while True:
+            try:
+                user_input = await self._debug_io.input("monarch_dbg> ")
+                if not user_input.strip():
+                    continue
+                # parse() returns None on invalid input (after reporting the
+                # error), in which case none of the branches below match.
+                command = await DebugCommand.parse(self._debug_io, user_input)
+                if isinstance(command, Help):
+                    await self._debug_io.output("monarch_dbg commands:\n")
+                    await self._debug_io.output(
+                        "\tattach <actor_name> <rank> - attach to a debug session\n"
+                    )
+                    await self._debug_io.output("\tlist - list all debug sessions\n")
+                    await self._debug_io.output(
+                        "\tquit - exit the debugger, leaving all sessions in place\n"
+                    )
+                    await self._debug_io.output(
+                        "\tcast <actor_name> ranks(...) <command> - send a command to a set of ranks on the specified actor mesh.\n"
+                        "\t\tThe value inside ranks(...) can be a single rank (ranks(1)),\n"
+                        "\t\ta list of ranks (ranks(1,4,6)), a range of ranks (ranks(start?:stop?:step?)),\n"
+                        "\t\tor a dict of dimensions (ranks(dim1=1:5:2,dim2=3, dim4=(3,6))).\n"
+                    )
+                    await self._debug_io.output(
+                        "\tcontinue - clear all breakpoints and tell all ranks to continue\n"
+                    )
+                    await self._debug_io.output("\thelp - print this help message\n")
+                elif isinstance(command, Attach):
+                    await self.sessions.get(command.actor_name, command.rank).attach(
+                        self._debug_io
+                    )
+                elif isinstance(command, ListCommand):
+                    # NOTE(review): presumably calls the coroutine wrapped by the
+                    # `list` endpoint directly — confirm against the endpoint API.
+                    # pyre-ignore
+                    await self.list._method(self)
+                elif isinstance(command, Continue):
+                    # Send "clear" and then "c" to every session.
+                    await self._cast_input_and_wait("clear")
+                    await self._cast_input_and_wait("c")
+                elif isinstance(command, Quit):
+                    await self._debug_io.quit()
+                    return
+                elif isinstance(command, Cast):
+                    await self._cast_input_and_wait(
+                        command.command, (command.actor_name, command.ranks)
+                    )
+            except (DebugIOError, asyncio.CancelledError):
+                raise
+            except Exception as e:
+                await self._debug_io.output(f"Error processing command: {e}\n")
+    async def _cast_input_and_wait(
+        self,
+        command: str,
+        selection: Optional[Tuple[str, Optional[RanksType]]] = None,
+    ) -> None:
+        """
+        Send `command` to every session yielded by self.sessions.iter(selection)
+        and wait for all of them to finish. Output is suppressed.
+        """
+        # NOTE(review): presumably selection=None selects all sessions — confirm
+        # against DebugSessions.iter.
+        tasks = []
+        for session in self.sessions.iter(selection):
+            tasks.append(session.attach(self._debug_io, command, suppress_output=True))
+        await asyncio.gather(*tasks)
+    ##########################################################################
+    # Debugger APIs
+    #
+    # These endpoints are called by the remote debuggers to establish sessions
+    # and communicate with them.
+    @endpoint
+    async def debugger_session_start(
+        self, rank: int, coords: Dict[str, int], hostname: str, actor_name: str
+    ) -> None:
+        """Register a debug session for (actor_name, rank) once the server is up."""
+        # Good enough for now to ensure that if the server for processing
+        # user interactions never starts, then the rank being debugged will
+        # fail instead of hanging indefinitely with no way to send it commands.
+        # Of course this isn't sufficient to handle the case where the server
+        # fails after the rank's debug session has successfully started.
+        # TODO: implement a heartbeat to prevent pdb sessions from hanging.
+        await self._server
+        # Create a session if it doesn't exist
+        if (actor_name, rank) not in self.sessions:
+            self.sessions.insert(DebugSession(rank, coords, hostname, actor_name))
+    @endpoint
+    async def debugger_session_end(self, actor_name: str, rank: int) -> None:
+        """Detach from the current debug session."""
+        await self.sessions.remove(actor_name, rank).detach()
+    @endpoint
+    async def debugger_read(
+        self, actor_name: str, rank: int, size: int
+    ) -> DebuggerWrite | str:
+        """Read from the debug session for the given rank."""
+        return await self.sessions.get(actor_name, rank).debugger_read(size)
+    @endpoint
+    async def debugger_write(
+        self, actor_name: str, rank: int, write: DebuggerWrite
+    ) -> None:
+        """Write to the debug session for the given rank."""
+        await self.sessions.get(actor_name, rank).debugger_write(write)
+# Cached so that we don't have to call out to the root client every time,
+# which may be on a different host.
+@functools.cache
+def debug_controller() -> DebugController:
+    """
+    Return the "debug_controller" DebugController, getting or spawning it
+    on the first call. The result is cached for subsequent calls.
+    """
+    with fake_sync_state():
+        return get_or_spawn_controller("debug_controller", DebugController).get()

monarch/_src/actor/debugger/debug_io.py ADDED Viewed

@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-unsafe
+import asyncio
+import sys
+from abc import abstractmethod
+class DebugIO:
+    """
+    Interface for the debugger's user-facing input and output.
+    Implemented in this file by DebugStdIO and DebugCliIO.
+    """
+    # NOTE(review): this class does not derive from abc.ABC, so the
+    # @abstractmethod markers below are not enforced at instantiation time.
+    @abstractmethod
+    async def input(self, prompt: str = "") -> str: ...
+    @abstractmethod
+    async def output(self, msg: str) -> None: ...
+    @abstractmethod
+    async def quit(self) -> None: ...
+class DebugStdIO(DebugIO):
+    """DebugIO implementation that uses the process's standard input and output."""
+    async def input(self, prompt: str = "") -> str:
+        # Run the blocking built-in input() in a worker thread so the event
+        # loop keeps running while waiting for the user.
+        return await asyncio.to_thread(input, prompt)
+    async def output(self, msg: str) -> None:
+        sys.stdout.write(msg)
+        sys.stdout.flush()
+    async def quit(self) -> None:
+        # Nothing to release for standard I/O.
+        pass
+class DebugIOError(RuntimeError):
+    """Raised when a debugger I/O operation fails; carries a fixed message."""
+    def __init__(self):
+        super().__init__("Error encountered during debugger I/O operation.")
+class DebugCliIO(DebugIO):
+    def __init__(self, reader: asyncio.StreamReader, writer: asyncio.StreamWriter):
+        self._reader = reader
+        self._writer = writer
+    async def input(self, prompt: str = "") -> str:
+        try:
+            await self.output(prompt)
+            msg = (await self._reader.readline()).decode()
+            # Incomplete read due to EOF
+            if not msg.endswith("\n"):
+                raise RuntimeError("Unexpected end of input.")
+            # Strip the newline to be consistent with the behavior of input()
+            return msg.strip("\n")
+        except Exception as e:
+            raise DebugIOError() from e
+    async def output(self, msg: str) -> None:
+        try:
+            self._writer.write(msg.encode())
+            await self._writer.drain()
+        except Exception as e:
+            raise DebugIOError() from e
+    async def quit(self) -> None:
+        await self.output("Quitting debug session...\n")
+        self._writer.close()
+        await self._writer.wait_closed()