torchmonarch-nightly 2025.9.9__cp313-cp313-manylinux2014_x86_64.whl → 2025.9.11__cp313-cp313-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. monarch/__init__.py +7 -0
  2. monarch/_rust_bindings.so +0 -0
  3. monarch/_src/actor/actor_mesh.py +1 -1
  4. monarch/_src/actor/bootstrap_main.py +7 -2
  5. monarch/_src/actor/debugger/breakpoint.py +30 -0
  6. monarch/_src/actor/debugger/debug_command.py +183 -0
  7. monarch/_src/actor/debugger/debug_controller.py +246 -0
  8. monarch/_src/actor/debugger/debug_io.py +68 -0
  9. monarch/_src/actor/debugger/debug_session.py +249 -0
  10. monarch/_src/actor/debugger/pdb_wrapper.py +1 -1
  11. monarch/_src/actor/host_mesh.py +10 -2
  12. monarch/_src/actor/pickle.py +4 -10
  13. monarch/_src/actor/proc_mesh.py +80 -19
  14. monarch/_src/tensor_engine/rdma.py +2 -0
  15. monarch/actor/__init__.py +1 -1
  16. monarch/gradient/_gradient_generator.so +0 -0
  17. monarch/monarch_controller +0 -0
  18. monarch/tools/cli.py +26 -0
  19. monarch/tools/commands.py +15 -0
  20. monarch/tools/debug_env.py +34 -0
  21. monarch/tools/mesh_spec.py +2 -0
  22. tests/test_allocator.py +18 -9
  23. tests/test_debugger.py +29 -25
  24. tests/test_mock_cuda.py +11 -3
  25. torchmonarch_nightly-2025.9.11.data/scripts/process_allocator +0 -0
  26. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/METADATA +1 -1
  27. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/RECORD +31 -29
  28. monarch/_src/actor/debugger/debugger.py +0 -737
  29. monarch/_src/debug_cli/__init__.py +0 -7
  30. monarch/_src/debug_cli/debug_cli.py +0 -43
  31. monarch/debug_cli/__init__.py +0 -7
  32. monarch/debug_cli/__main__.py +0 -12
  33. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/WHEEL +0 -0
  34. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/entry_points.txt +0 -0
  35. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/licenses/LICENSE +0 -0
  36. {torchmonarch_nightly-2025.9.9.dist-info → torchmonarch_nightly-2025.9.11.dist-info}/top_level.txt +0 -0
monarch/_src/actor/debugger/debug_session.py ADDED
@@ -0,0 +1,249 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import asyncio
+ from dataclasses import dataclass
+ from typing import Dict, Generator, List, Optional, Tuple
+
+ from monarch._src.actor.debugger.debug_command import RanksType
+ from monarch._src.actor.debugger.debug_io import DebugIO, DebugIOError
+
+ from monarch._src.actor.debugger.pdb_wrapper import DebuggerWrite
+
+
+ @dataclass
+ class DebugSessionInfo:
+     actor_name: str
+     rank: int
+     coords: Dict[str, int]
+     hostname: str
+     function: str | None
+     lineno: int | None
+
+     def __lt__(self, other):
+         if self.actor_name < other.actor_name:
+             return True
+         elif self.actor_name == other.actor_name:
+             return self.rank < other.rank
+         else:
+             return False
+
+
+ class DebugSession:
+     """Represents a single session with a remote debugger."""
+
+     def __init__(
+         self, rank: int, coords: Dict[str, int], hostname: str, actor_name: str
+     ):
+         self.rank = rank
+         self.coords = coords
+         self.hostname = hostname
+         self.actor_name = actor_name
+         self._active = False
+         self._message_queue = asyncio.Queue()
+         self._task = None
+         self._pending_send_to_actor = asyncio.Queue()
+         self._outputs_since_last_input = []
+         self._function_lineno = None
+         self._need_read = False
+
+     async def _event_loop(self, debug_io: DebugIO, line=None, suppress_output=False):
+         if not suppress_output:
+             # If the user had previously attached to this debug session,
+             # then it would have printed various messages from the
+             # message queue. When the user re-attaches, we want to
+             # print out all of the output that was printed since the
+             # last command sent to this session.
+             if len(self._outputs_since_last_input) > 0:
+                 await debug_io.output(
+                     f"<last pdb output for {self.actor_name} {self.rank} follows>\n"
+                 )
+                 for output in self._outputs_since_last_input:
+                     await debug_io.output(output.payload.decode())
+
+         while True:
+             # When the user inputs "detach", it uses up a "read" message
+             # without actually responding to the actor being debugged. We
+             # can't manually reinsert the "read" message into the message queue,
+             # so instead the self._need_read flag indicates there's an additional
+             # "read" that we need to respond to.
+             if self._need_read:
+                 self._need_read = False
+                 message = "read"
+             else:
+                 message = await self._message_queue.get()
+             if message == "detach":
+                 # Return to the main outer debug loop.
+                 break
+             elif message == "read":
+                 try:
+                     break_after = False
+                     if line is not None:
+                         break_after = True
+                     else:
+                         line = await debug_io.input()
+                     if line == "detach":
+                         self._need_read = True
+                         break
+                     else:
+                         await self._pending_send_to_actor.put((line + "\n").encode())
+                         # Cancel safety: don't clear the previous outputs until we know
+                         # the actor will receive the input.
+                         self._outputs_since_last_input = []
+                         line = None
+                         if break_after:
+                             break
+                 except (DebugIOError, asyncio.CancelledError):
+                     # See earlier comment about this flag. If either of the awaits inside
+                     # the try block is cancelled, we need to redo the read without actually
+                     # reinserting "read" into the message queue.
+                     self._need_read = True
+                     raise
+             elif message[0] == "write":
+                 output = message[1]
+                 # If the user sees this output but then detaches from the session,
+                 # it's useful to store all outputs since the last input so that
+                 # they can be printed again when the user re-attaches.
+                 self._outputs_since_last_input.append(output)
+                 if not suppress_output:
+                     await debug_io.output(output.payload.decode())
+
+         if not suppress_output:
+             await debug_io.output(
+                 f"Detaching from debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+             )
+
+     def get_info(self):
+         function = lineno = None
+         if self._function_lineno is not None:
+             function, lineno = self._function_lineno
+         return DebugSessionInfo(
+             self.actor_name, self.rank, self.coords, self.hostname, function, lineno
+         )
+
+     async def attach(self, debug_io: DebugIO, line=None, suppress_output=False):
+         self._active = True
+         if not suppress_output:
+             await debug_io.output(
+                 f"Attached to debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+             )
+         self._task = asyncio.create_task(
+             self._event_loop(debug_io, line, suppress_output)
+         )
+         await self._task
+         if not suppress_output:
+             await debug_io.output(
+                 f"Detached from debug session for {self.actor_name} {self.rank} ({self.hostname})\n"
+             )
+         self._active = False
+
+     async def detach(self):
+         if self._active:
+             await self._message_queue.put("detach")
+
+     async def debugger_read(self, size: int) -> DebuggerWrite:
+         await self._message_queue.put("read")
+         input_data = await self._pending_send_to_actor.get()
+         if len(input_data) > size:
+             input_data = input_data[:size]
+         return DebuggerWrite(input_data, None, None)
+
+     async def debugger_write(self, write: DebuggerWrite) -> None:
+         if write.function is not None and write.lineno is not None:
+             self._function_lineno = (write.function, write.lineno)
+         await self._message_queue.put(("write", write))
+
+
+ class DebugSessions:
+     def __init__(self):
+         self._sessions: Dict[str, Dict[int, DebugSession]] = {}
+
+     def insert(self, session: DebugSession) -> None:
+         if session.actor_name not in self._sessions:
+             self._sessions[session.actor_name] = {session.rank: session}
+         elif session.rank not in self._sessions[session.actor_name]:
+             self._sessions[session.actor_name][session.rank] = session
+         else:
+             raise ValueError(
+                 f"Debug session for rank {session.rank} already exists for actor {session.actor_name}"
+             )
+
+     def remove(self, actor_name: str, rank: int) -> DebugSession:
+         if actor_name not in self._sessions:
+             raise ValueError(f"No debug sessions for actor {actor_name}")
+         elif rank not in self._sessions[actor_name]:
+             raise ValueError(f"No debug session for rank {rank} for actor {actor_name}")
+         session = self._sessions[actor_name].pop(rank)
+         if len(self._sessions[actor_name]) == 0:
+             del self._sessions[actor_name]
+         return session
+
+     def get(self, actor_name: str, rank: int) -> DebugSession:
+         if actor_name not in self._sessions:
+             raise ValueError(f"No debug sessions for actor {actor_name}")
+         elif rank not in self._sessions[actor_name]:
+             raise ValueError(f"No debug session for rank {rank} for actor {actor_name}")
+         return self._sessions[actor_name][rank]
+
+     def iter(
+         self, selection: Optional[Tuple[str, Optional[RanksType]]]
+     ) -> Generator[DebugSession, None, None]:
+         if selection is None:
+             for sessions in self._sessions.values():
+                 for session in sessions.values():
+                     yield session
+             return
+         actor_name, ranks = selection
+         if actor_name not in self._sessions:
+             return
+         sessions = self._sessions[actor_name]
+         if ranks is None:
+             for session in sessions.values():
+                 yield session
+         elif isinstance(ranks, int):
+             if ranks in sessions:
+                 yield sessions[ranks]
+         elif isinstance(ranks, list):
+             for rank in ranks:
+                 if rank in sessions:
+                     yield sessions[rank]
+         elif isinstance(ranks, dict):
+             dims = ranks
+             for session in sessions.values():
+                 include_rank = True
+                 for dim, ranks in dims.items():
+                     if dim not in session.coords:
+                         include_rank = False
+                         break
+                     elif (
+                         isinstance(ranks, range) or isinstance(ranks, list)
+                     ) and session.coords[dim] not in ranks:
+                         include_rank = False
+                         break
+                     elif isinstance(ranks, int) and session.coords[dim] != ranks:
+                         include_rank = False
+                         break
+                 if include_rank:
+                     yield session
+         elif isinstance(ranks, range):
+             for rank, session in sessions.items():
+                 if rank in ranks:
+                     yield session
+
+     def info(self) -> List[DebugSessionInfo]:
+         session_info = []
+         for sessions in self._sessions.values():
+             for session in sessions.values():
+                 session_info.append(session.get_info())
+         return session_info
+
+     def __len__(self) -> int:
+         return sum(len(sessions) for sessions in self._sessions.values())
+
+     def __contains__(self, item: Tuple[str, int]) -> bool:
+         actor_name, rank = item
+         return actor_name in self._sessions and rank in self._sessions[actor_name]
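For orientation, a minimal usage sketch of the new container; the actor name, ranks, coordinates, and hostname below are invented for illustration, and only constructors and methods that appear in the file above are used:

    sessions = DebugSessions()
    sessions.insert(
        DebugSession(rank=0, coords={"hosts": 0, "gpus": 0}, hostname="h0", actor_name="trainer")
    )
    sessions.insert(
        DebugSession(rank=1, coords={"hosts": 0, "gpus": 1}, hostname="h0", actor_name="trainer")
    )
    assert ("trainer", 0) in sessions and len(sessions) == 2

    # Select by actor name plus per-dimension rank filters (dict form of RanksType):
    for s in sessions.iter(("trainer", {"gpus": range(0, 2)})):
        print(s.get_info())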
monarch/_src/actor/actor_mesh.py CHANGED
@@ -22,7 +22,7 @@ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
  from monarch._src.actor.sync_state import fake_sync_state

  if TYPE_CHECKING:
-     from monarch._src.actor.debugger.debugger import DebugController
+     from monarch._src.actor.debugger.debug_controller import DebugController


  @dataclass
monarch/_src/actor/host_mesh.py CHANGED
@@ -52,13 +52,21 @@ class HostMesh(MeshTrait):
      interfaces with the underlying resource allocator of your choice.
      """

-     def __init__(self, shape: Shape, allocator: AllocateMixin):
+     def __init__(
+         self,
+         shape: Shape,
+         allocator: AllocateMixin,
+         alloc_constraints: Optional[AllocConstraints] = None,
+     ):
          self._allocator = allocator
+         self._alloc_constraints = alloc_constraints
          self._shape = shape
          self._spawned = 0

      def _alloc(self, hosts: int, gpus: int) -> "AllocHandle":
-         spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
+         spec: AllocSpec = AllocSpec(
+             self._alloc_constraints or AllocConstraints(), hosts=hosts, gpus=gpus
+         )
          return self._allocator.allocate(spec)

      def spawn_procs(
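The net effect is that constraints supplied at mesh construction now flow into every allocation the mesh makes. A minimal sketch, assuming `shape`, `allocator`, and `AllocConstraints` are in scope as they are in host_mesh.py:

    # Before (still works; the default preserves the 2025.9.9 behavior):
    mesh = HostMesh(shape, allocator)

    # New in 2025.9.11: constraints set here are applied to every AllocSpec
    # that _alloc() builds.
    mesh = HostMesh(shape, allocator, alloc_constraints=AllocConstraints())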
monarch/_src/actor/pickle.py CHANGED
@@ -8,18 +8,15 @@

  import io
  import pickle
- import sys
  from contextlib import contextmanager, ExitStack
  from typing import Any, Callable, Iterable, List, Tuple

  import cloudpickle

-
- def maybe_torch():
-     """
-     We have to do some special pickling if torch is loaded but not if it isn't loaded?
-     """
-     return sys.modules.get("torch")
+ try:
+     import torch  # @manual
+ except ImportError:
+     torch = None


  _orig_function_getstate = cloudpickle.cloudpickle._function_getstate
@@ -79,7 +76,6 @@ def flatten(obj: Any, filter: Callable[[Any], bool]) -> Tuple[List[Any], bytes]:

  def unflatten(data: bytes, values: Iterable[Any]) -> Any:
      with ExitStack() as stack:
-         torch = maybe_torch()
          if torch is not None:
              stack.enter_context(load_tensors_on_cpu())
              stack.enter_context(torch.utils._python_dispatch._disable_current_modes())
@@ -91,8 +87,6 @@ def unflatten(data: bytes, values: Iterable[Any]) -> Any:
  def load_tensors_on_cpu():
      # Ensure that any tensors load from CPU via monkeypatching how Storages are
      # loaded.
-     import torch
-
      old = torch.storage._load_from_bytes
      try:
          torch.storage._load_from_bytes = lambda b: torch.load(
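Note the semantic shift in pickle.py: the removed maybe_torch() only saw torch if some other module had already imported it, whereas the new guarded import uses torch whenever it is installed. A minimal sketch of the guarded-import pattern in isolation (the summarize() helper is ours, for illustration):

    try:
        import torch  # optional dependency
    except ImportError:
        torch = None

    def summarize(obj):
        # Take the tensor-aware path only when torch is available.
        if torch is not None and isinstance(obj, torch.Tensor):
            return f"tensor{tuple(obj.shape)}"
        return type(obj).__name__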
monarch/_src/actor/proc_mesh.py CHANGED
@@ -7,6 +7,8 @@
  # pyre-strict

  import asyncio
+ import importlib.metadata
+ import json
  import logging
  import os
  import sys
@@ -31,6 +33,7 @@ from typing import (
      TYPE_CHECKING,
      TypeVar,
  )
+ from urllib.parse import urlparse
  from weakref import WeakValueDictionary

  from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
@@ -468,20 +471,19 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
          # The workspace shape (i.e. only perform one rsync per host).
          assert set(self._shape.labels).issubset({"gpus", "hosts"})

-         workspaces = []
+         workspaces = {}
          for src_dir, dst_dir in workspace.dirs.items():
-             workspaces.append(
-                 WorkspaceConfig(
-                     local=Path(src_dir),
-                     remote=RemoteWorkspace(
-                         location=WorkspaceLocation.FromEnvVar(
-                             env="WORKSPACE_DIR",
-                             relpath=dst_dir,
-                         ),
-                         shape=WorkspaceShape.shared("gpus"),
-                     ),
-                     method=CodeSyncMethod.Rsync,
-                 ),
-             )
+             local = Path(src_dir)
+             workspaces[local] = WorkspaceConfig(
+                 local=local,
+                 remote=RemoteWorkspace(
+                     location=WorkspaceLocation.FromEnvVar(
+                         env="WORKSPACE_DIR",
+                         relpath=dst_dir,
+                     ),
+                     shape=WorkspaceShape.shared("gpus"),
+                 ),
+                 method=CodeSyncMethod.Rsync(),
+             )

          # If `conda` is set, also sync the currently activated conda env.
@@ -496,23 +498,82 @@ class ProcMesh(MeshTrait, DeprecatedNotAFuture):
          while conda_prefix.is_symlink():
              conda_prefix = conda_prefix.parent / conda_prefix.readlink()

-         workspaces.append(
-             WorkspaceConfig(
-                 local=conda_prefix,
-                 remote=RemoteWorkspace(
-                     location=WorkspaceLocation.FromEnvVar(
-                         env="CONDA_PREFIX",
-                         relpath="",
-                     ),
-                     shape=WorkspaceShape.shared("gpus"),
-                 ),
-                 method=CodeSyncMethod.CondaSync,
-             ),
-         )
+         # Build a map of additional path prefixes to fix up when syncing
+         # the conda env.
+         conda_prefix_replacements = {}
+
+         # Auto-detect editable installs and implicitly add workspaces for
+         # them.
+         # NOTE(agallagher): There's sometimes a `python3.1` symlink to
+         # `python3.10`, so avoid it.
+         (lib_python,) = [
+             dirpath
+             for dirpath in conda_prefix.glob("lib/python*")
+             if not os.path.islink(dirpath)
+         ]
+         for direct_url in lib_python.glob(
+             "site-packages/*.dist-info/direct_url.json"
+         ):
+             # Parse the direct_url.json to see if it's an editable install
+             # (https://packaging.python.org/en/latest/specifications/direct-url/#example-pip-commands-and-their-effect-on-direct-url-json).
+             with open(direct_url) as f:
+                 info = json.load(f)
+             if not info.get("dir_info", {}).get("editable", False):
+                 continue
+
+             # Extract the workspace path from the URL (e.g. `file:///my/workspace/`).
+             url = urlparse(info["url"])
+             assert url.scheme == "file", f"expected file:// URL, got {url.scheme}"
+
+             # Get the project name, so we can use it below to create a unique-ish
+             # remote directory.
+             dist = importlib.metadata.PathDistribution(direct_url.parent)
+             name = dist.metadata["Name"]
+
+             local = Path(url.path)
+
+             # Check if we've already defined a workspace for this local path.
+             existing = workspaces.get(local)
+             if existing is not None:
+                 assert existing.method == CodeSyncMethod.Rsync()
+                 remote = existing.remote
+             else:
+                 # Otherwise, add a workspace for it.
+                 remote = RemoteWorkspace(
+                     location=WorkspaceLocation.FromEnvVar(
+                         env="WORKSPACE_DIR",
+                         relpath=f"__editable__.{name}",
+                     ),
+                     shape=WorkspaceShape.shared("gpus"),
+                 )
+                 workspaces[local] = WorkspaceConfig(
+                     local=local,
+                     remote=remote,
+                     method=CodeSyncMethod.Rsync(),
+                 )
+
+             logging.info(
+                 f"Syncing editable install of {name} from {local} (to {remote.location})"
+             )
+
+             # Make sure we fix up path prefixes to the editable install.
+             conda_prefix_replacements[local] = remote.location
+
+         workspaces[conda_prefix] = WorkspaceConfig(
+             local=conda_prefix,
+             remote=RemoteWorkspace(
+                 location=WorkspaceLocation.FromEnvVar(
+                     env="CONDA_PREFIX",
+                     relpath="",
+                 ),
+                 shape=WorkspaceShape.shared("gpus"),
+             ),
+             method=CodeSyncMethod.CondaSync(conda_prefix_replacements),
+         )

          assert self._code_sync_client is not None
          await self._code_sync_client.sync_workspaces(
-             workspaces=workspaces,
+             workspaces=list(workspaces.values()),
              auto_reload=auto_reload,
          )
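The auto-detection leans on the standard direct_url.json metadata (PEP 610) that pip writes for `pip install -e` installs; the spec is linked in the hunk above. A standalone sketch of the same probe, independent of Monarch (the editable_projects() helper is ours):

    import importlib.metadata
    import json
    from pathlib import Path
    from urllib.parse import urlparse

    def editable_projects(site_packages: Path):
        """Yield (project_name, local_source_path) for editable installs."""
        for direct_url in site_packages.glob("*.dist-info/direct_url.json"):
            with open(direct_url) as f:
                info = json.load(f)
            if not info.get("dir_info", {}).get("editable", False):
                continue
            url = urlparse(info["url"])
            if url.scheme != "file":
                continue
            dist = importlib.metadata.PathDistribution(direct_url.parent)
            yield dist.metadata["Name"], Path(url.path)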
monarch/_src/tensor_engine/rdma.py CHANGED
@@ -127,6 +127,8 @@ class RDMABuffer:
          storage = data.untyped_storage()
          addr: int = storage.data_ptr()
          size = storage.element_size() * data.numel()
+         if size == 0:
+             raise ValueError("Cannot create RDMABuffer with size 0.")
          ctx = context()
          self._buffer: _RdmaBuffer = _RdmaBuffer.create_rdma_buffer_blocking(
              addr=addr,
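The guard closes an edge case: a zero-element tensor has a zero-byte storage, which previously would have been registered as an empty RDMA region. A sketch of the rejected input, assuming the buffer is constructed from a tensor as the hunk above suggests (torch required):

    import torch

    empty = torch.empty(0)  # storage.element_size() * data.numel() == 0
    # Constructing an RDMABuffer over `empty` now raises:
    # ValueError: Cannot create RDMABuffer with size 0.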
monarch/actor/__init__.py CHANGED
@@ -27,7 +27,7 @@ from monarch._src.actor.actor_mesh import (
      send,
      ValueMesh,
  )
- from monarch._src.actor.debugger.debugger import debug_controller
+ from monarch._src.actor.debugger.debug_controller import debug_controller
  from monarch._src.actor.endpoint import endpoint
  from monarch._src.actor.future import Future

monarch/gradient/_gradient_generator.so CHANGED (binary file)
monarch/monarch_controller CHANGED (binary file)
monarch/tools/cli.py CHANGED
@@ -13,6 +13,7 @@ from monarch.tools.commands import (
      bounce,
      component_args_from_cli,
      create,
+     debug,
      info,
      kill,
      stop,
@@ -22,6 +23,8 @@ from monarch.tools.config import ( # @manual=//monarch/python/monarch/tools/con
      Config,
      defaults,
  )
+
+ from monarch.tools.debug_env import _get_debug_server_host, _get_debug_server_port
  from torchx.specs.finder import get_component


@@ -141,6 +144,25 @@ class StopCmd:
          stop(args.server_handle)


+ class DebugCmd:
+     def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+         subparser.add_argument(
+             "--host",
+             type=str,
+             default=_get_debug_server_host(),
+             help="Hostname where the debug server is running",
+         )
+         subparser.add_argument(
+             "--port",
+             type=int,
+             default=_get_debug_server_port(),
+             help="Port that the debug server is listening on",
+         )
+
+     def run(self, args: argparse.Namespace) -> None:
+         debug(args.host, args.port)
+
+
  def get_parser() -> argparse.ArgumentParser:
      parser = argparse.ArgumentParser(description="Monarch CLI")
      subparser = parser.add_subparsers(title="COMMANDS")
@@ -149,6 +171,7 @@ def get_parser() -> argparse.ArgumentParser:
          "create": CreateCmd(),
          "info": InfoCmd(),
          "kill": KillCmd(),
+         "debug": DebugCmd(),
          # --- placeholder subcommands (not yet implemented) ---
          "bounce": BounceCmd(),
          "stop": StopCmd(),
@@ -162,6 +185,9 @@ def get_parser() -> argparse.ArgumentParser:
  def main(argv: list[str] = sys.argv[1:]) -> None:
      parser = get_parser()
      args = parser.parse_args(argv)
+     if not hasattr(args, "func"):
+         parser.print_help()
+         sys.exit(1)
      args.func(args)

monarch/tools/commands.py CHANGED
@@ -11,6 +11,7 @@ import asyncio
  import inspect
  import logging
  import os
+ import subprocess
  import tempfile
  from datetime import datetime, timedelta
  from pathlib import Path
@@ -366,3 +367,17 @@ def bounce(server_handle: str) -> None:
  def stop(server_handle: str) -> None:
      """Stops the server's unix processes without tearing down the server's job."""
      raise NotImplementedError("`stop` is not yet implemented")
+
+
+ def debug(host: str, port: int) -> None:
+     """Connect to the debug server running on the provided host and port."""
+     for cmd in ["ncat", "nc", "netcat"]:
+         try:
+             subprocess.run([cmd, f"{host}", f"{port}"], check=True)
+             return
+         except FileNotFoundError:
+             pass
+
+     logging.error(
+         "Could not find a suitable netcat binary. Please install one and try again."
+     )
monarch/tools/debug_env.py ADDED
@@ -0,0 +1,34 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-unsafe
+ import os
+
+
+ _MONARCH_DEBUG_SERVER_HOST_ENV_VAR = "MONARCH_DEBUG_SERVER_HOST"
+ _MONARCH_DEBUG_SERVER_HOST_DEFAULT = "localhost"
+ _MONARCH_DEBUG_SERVER_PORT_ENV_VAR = "MONARCH_DEBUG_SERVER_PORT"
+ _MONARCH_DEBUG_SERVER_PORT_DEFAULT = "27000"
+ _MONARCH_DEBUG_SERVER_PROTOCOL_ENV_VAR = "MONARCH_DEBUG_SERVER_PROTOCOL"
+ _MONARCH_DEBUG_SERVER_PROTOCOL_DEFAULT = "tcp"
+
+
+ def _get_debug_server_host():
+     return os.environ.get(
+         _MONARCH_DEBUG_SERVER_HOST_ENV_VAR, _MONARCH_DEBUG_SERVER_HOST_DEFAULT
+     )
+
+
+ def _get_debug_server_port():
+     return os.environ.get(
+         _MONARCH_DEBUG_SERVER_PORT_ENV_VAR, _MONARCH_DEBUG_SERVER_PORT_DEFAULT
+     )
+
+
+ def _get_debug_server_protocol():
+     return os.environ.get(
+         _MONARCH_DEBUG_SERVER_PROTOCOL_ENV_VAR, _MONARCH_DEBUG_SERVER_PROTOCOL_DEFAULT
+     )
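These helpers are read when DebugCmd builds its argument defaults, so the CLI can be steered from the environment. A small sketch (values are illustrative, and the variables must be set before the parser is constructed):

    import os

    os.environ["MONARCH_DEBUG_SERVER_HOST"] = "trainer-host-0"
    os.environ["MONARCH_DEBUG_SERVER_PORT"] = "28000"
    # The debug subcommand (DebugCmd) now defaults --host/--port to these values.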
monarch/tools/mesh_spec.py CHANGED
@@ -40,6 +40,7 @@ class MeshSpec:
      port: int = DEFAULT_REMOTE_ALLOCATOR_PORT
      hostnames: list[str] = field(default_factory=list)
      state: specs.AppState = specs.AppState.UNSUBMITTED
+     image: str = _UNSET_STR

      def server_addrs(
          self, transport: Optional[str] = None, port: Optional[int] = None
@@ -81,6 +82,7 @@ def mesh_spec_from_metadata(appdef: specs.AppDef, mesh_name: str) -> Optional[MeshSpec]:
          if role.name == mesh_name:
              return MeshSpec(
                  name=mesh_name,
+                 image=role.image,
                  num_hosts=role.num_replicas,
                  host_type=appdef.metadata.get(
                      _tag(mesh_name, _TAG_HOST_TYPE), _UNSET_STR