PyPI - torchmonarch-nightly - Versions diffs - 2025.8.2__cp310-cp310-manylinux2014_x86_64.whl → 2025.9.3__cp310-cp310-manylinux2014_x86_64.whl - Mend

torchmonarch-nightly 2025.8.2__cp310-cp310-manylinux2014_x86_64.whl → 2025.9.3__cp310-cp310-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

monarch/_rust_bindings.so +0 -0
monarch/_src/actor/actor_mesh.py +414 -216
monarch/_src/actor/allocator.py +75 -6
monarch/_src/actor/bootstrap_main.py +7 -4
monarch/_src/actor/code_sync/__init__.py +2 -0
monarch/_src/actor/debugger/__init__.py +7 -0
monarch/_src/actor/{debugger.py → debugger/debugger.py} +246 -135
monarch/_src/actor/{pdb_wrapper.py → debugger/pdb_wrapper.py} +62 -23
monarch/_src/actor/endpoint.py +27 -45
monarch/_src/actor/future.py +86 -24
monarch/_src/actor/host_mesh.py +125 -0
monarch/_src/actor/logging.py +94 -0
monarch/_src/actor/pickle.py +25 -0
monarch/_src/actor/proc_mesh.py +423 -156
monarch/_src/actor/python_extension_methods.py +90 -0
monarch/_src/actor/shape.py +8 -1
monarch/_src/actor/source_loader.py +45 -0
monarch/_src/actor/telemetry/__init__.py +172 -0
monarch/_src/actor/telemetry/rust_span_tracing.py +6 -39
monarch/_src/debug_cli/__init__.py +7 -0
monarch/_src/debug_cli/debug_cli.py +43 -0
monarch/_src/tensor_engine/rdma.py +64 -9
monarch/_testing.py +1 -3
monarch/actor/__init__.py +24 -4
monarch/common/_C.so +0 -0
monarch/common/device_mesh.py +14 -0
monarch/common/future.py +10 -0
monarch/common/remote.py +14 -25
monarch/common/tensor.py +12 -0
monarch/debug_cli/__init__.py +7 -0
monarch/debug_cli/__main__.py +12 -0
monarch/fetch.py +2 -2
monarch/gradient/_gradient_generator.so +0 -0
monarch/gradient_generator.py +4 -2
monarch/mesh_controller.py +34 -14
monarch/monarch_controller +0 -0
monarch/tools/colors.py +25 -0
monarch/tools/commands.py +42 -7
monarch/tools/components/hyperactor.py +1 -1
monarch/tools/config/__init__.py +31 -4
monarch/tools/config/defaults.py +13 -3
monarch/tools/config/environment.py +45 -0
monarch/tools/config/workspace.py +165 -0
monarch/tools/mesh_spec.py +2 -0
monarch/utils/__init__.py +9 -0
monarch/utils/utils.py +78 -0
tests/error_test_binary.py +5 -3
tests/python_actor_test_binary.py +52 -0
tests/test_actor_error.py +142 -14
tests/test_alloc.py +1 -1
tests/test_allocator.py +59 -72
tests/test_debugger.py +639 -45
tests/test_env_before_cuda.py +4 -4
tests/test_mesh_trait.py +38 -0
tests/test_python_actors.py +965 -75
tests/test_rdma.py +7 -6
tests/test_tensor_engine.py +6 -6
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/METADATA +82 -4
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/RECORD +63 -47
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/WHEEL +0 -0
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/entry_points.txt +0 -0
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/licenses/LICENSE +0 -0
{torchmonarch_nightly-2025.8.2.dist-info → torchmonarch_nightly-2025.9.3.dist-info}/top_level.txt +0 -0

monarch/_src/actor/proc_mesh.py CHANGED Viewed

@@ -6,83 +6,91 @@
 # pyre-strict
+import asyncio
 import logging
 import os
 import sys
+import threading
 import warnings
 from contextlib import AbstractContextManager
+from functools import cache
+from pathlib import Path
 from typing import (
     Any,
     Callable,
     cast,
     Dict,
     List,
+    Literal,
     Optional,
     Sequence,
+    Tuple,
     Type,
     TYPE_CHECKING,
     TypeVar,
 )
+from weakref import WeakValueDictionary
-from monarch._rust_bindings.monarch_extension.logging import LoggingMeshClient
 from monarch._rust_bindings.monarch_hyperactor.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
     Alloc,
     AllocConstraints,
     AllocSpec,
 )
-from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
 from monarch._rust_bindings.monarch_hyperactor.proc_mesh import (
     ProcMesh as HyProcMesh,
     ProcMeshMonitor,
 )
+from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
 from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
-from monarch._src.actor.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
+from monarch._src.actor.actor_mesh import _Actor, Actor, ActorMesh, context
 from monarch._src.actor.allocator import (
     AllocateMixin,
+    AllocHandle,
     LocalAllocator,
     ProcessAllocator,
     SimAllocator,
 )
 from monarch._src.actor.code_sync import (
     CodeSyncMeshClient,
+    CodeSyncMethod,
     RemoteWorkspace,
+    WorkspaceConfig,
     WorkspaceLocation,
     WorkspaceShape,
 )
-from monarch._src.actor.debugger import (
-    _DEBUG_MANAGER_ACTOR_NAME,
-    DebugClient,
-    DebugManager,
-)
 from monarch._src.actor.device_utils import _local_device_count
 from monarch._src.actor.endpoint import endpoint
-from monarch._src.actor.future import Future
+from monarch._src.actor.future import DeprecatedNotAFuture, Future
+from monarch._src.actor.logging import LoggingManager
 from monarch._src.actor.shape import MeshTrait
+from monarch.tools.config.environment import CondaEnvironment
+from monarch.tools.config.workspace import Workspace
+from monarch.tools.utils import conda as conda_utils
-HAS_TENSOR_ENGINE = False
-try:
-    # Torch is needed for tensor engine
-    import torch  # @manual
-    # Confirm that rust bindings were built with tensor engine enabled
-    from monarch._rust_bindings.rdma import (  # type: ignore[import]
-        _RdmaBuffer,
-        _RdmaManager,
-    )
+@cache
+def _has_tensor_engine() -> bool:
+    try:
+        # Torch is needed for tensor engine
+        import torch  # @manual
-    # type: ignore[16]
-    HAS_TENSOR_ENGINE = torch.cuda.is_available()
-except ImportError:
-    logging.warning("Tensor engine is not available on this platform")
+        # Confirm that rust bindings were built with tensor engine enabled
+        from monarch._rust_bindings.rdma import _RdmaManager  # noqa
+        return True
+    except ImportError:
+        logging.warning("Tensor engine is not available on this platform")
+        return False
 if TYPE_CHECKING:
     Tensor = Any
     DeviceMesh = Any
+    from monarch._src.actor.host_mesh import HostMesh
 class SetupActor(Actor):
@@ -114,55 +122,108 @@ except ImportError:
     IN_PAR = False
-class ProcMesh(MeshTrait):
+# A temporary gate used by the PythonActorMesh/PythonActorMeshRef migration.
+# We can use this gate to quickly roll back to using _ActorMeshRefImpl, if we
+# encounter any issues with the migration.
+#
+# This should be removed once we confirm PythonActorMesh/PythonActorMeshRef is
+# working correctly in production.
+@cache
+def _use_standin_mesh() -> bool:
+    return os.getenv("USE_STANDIN_ACTOR_MESH", default="0") != "0"
+# Ultra-hack to allow actors to identify proc meshes but with no real functionality.
+class ProcMeshRef:
+    def __init__(self, proc_mesh_id: int) -> None:
+        self._proc_mesh_id = proc_mesh_id
+        self._host_mesh: Optional["HostMesh"] = None
+    @classmethod
+    def _fake_proc_mesh(cls, proc_mesh_id: int) -> "ProcMesh":
+        return cast(ProcMesh, cls(proc_mesh_id))
+    def __getattr__(self, attr: str) -> Any:
+        # AttributeError instead of NotImplementedError so that any hasattr calls
+        # will properly return False
+        raise AttributeError(
+            f"NYI: attempting to get ProcMesh attribute `{attr}` on object that's actually a ProcMeshRef"
+        )
+    def __hash__(self) -> int:
+        return hash(self._proc_mesh_id)
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ProcMeshRef):
+            return False
+        return self._proc_mesh_id == other._proc_mesh_id
+    @property
+    def _proc_mesh(self) -> Shared["HyProcMesh"]:
+        return _deref_proc_mesh(self)._proc_mesh
+_proc_mesh_lock: threading.Lock = threading.Lock()
+_proc_mesh_key: int = 0
+_proc_mesh_registry: WeakValueDictionary[ProcMeshRef, "ProcMesh"] = (
+    WeakValueDictionary()
+)
+def _deref_proc_mesh(proc_mesh: ProcMeshRef) -> "ProcMesh":
+    if proc_mesh not in _proc_mesh_registry:
+        raise ValueError(
+            f"ProcMesh with id {proc_mesh._proc_mesh_id} does not exist on host."
+        )
+    return _proc_mesh_registry[proc_mesh]
+class ProcMesh(MeshTrait, DeprecatedNotAFuture):
     def __init__(
         self,
-        hy_proc_mesh: HyProcMesh,
-        _mock_shape: Optional[Shape] = None,
+        hy_proc_mesh: "Shared[HyProcMesh]",
+        shape: Shape,
         _device_mesh: Optional["DeviceMesh"] = None,
     ) -> None:
         self._proc_mesh = hy_proc_mesh
-        self._mock_shape: Optional[Shape] = _mock_shape
-        # type: ignore[21]
-        self._rdma_manager: Optional["_RdmaManager"] = None
-        self._debug_manager: Optional[DebugManager] = None
-        self._mailbox: Mailbox = self._proc_mesh.client
+        global _proc_mesh_lock, _proc_mesh_key
+        with _proc_mesh_lock:
+            self._proc_mesh_id: int = _proc_mesh_key
+            _proc_mesh_key += 1
+        self._shape = shape
+        # until we have real slicing support keep track
+        # of whether this is a slice of a real proc_mesh
+        self._slice = False
         self._code_sync_client: Optional[CodeSyncMeshClient] = None
-        self._logging_mesh_client: Optional[LoggingMeshClient] = None
+        self._logging_manager: LoggingManager = LoggingManager()
         self._maybe_device_mesh: Optional["DeviceMesh"] = _device_mesh
         self._stopped = False
+        self._controller_controller: Optional["_ControllerController"] = None
+        # currently set only for context()'s proc_mesh to be a local host mesh.
+        self._host_mesh: Optional["HostMesh"] = None
-    async def _init_manager_actors(
-        self,
-        setup: Callable[[], None] | None = None,
-    ) -> "ProcMesh":
-        _rdma_manager = (
-            # type: ignore[16]
-            await _RdmaManager.create_rdma_manager_nonblocking(self._proc_mesh)
-            # type: ignore[16]
-            if HAS_TENSOR_ENGINE and _RdmaBuffer.rdma_supported()
-            else None
-        )
-        _debug_manager = await self._spawn_nonblocking(
-            _DEBUG_MANAGER_ACTOR_NAME, DebugManager, await _debug_client()
-        )
+    @property
+    def initialized(self) -> Future[Literal[True]]:
+        """
+        Future completes with 'True' when the ProcMesh has initialized.
+        Because ProcMesh are remote objects, there is no guarantee that the ProcMesh is
+        still usable after this completes, only that at some point in the past it was usable.
+        """
+        pm: Shared[HyProcMesh] = self._proc_mesh
-        self._debug_manager = _debug_manager
-        self._rdma_manager = _rdma_manager
+        async def task() -> Literal[True]:
+            await pm
+            return True
-        if setup is not None:
-            # If the user has passed the setup lambda, we need to call
-            # it here before any of the other actors are spawned so that
-            # the environment variables are set up before cuda init.
-            setup_actor = await self._spawn_nonblocking("setup", SetupActor, setup)
-            # pyre-ignore
-            await setup_actor.setup.call()._status.coro
-        return self
+        return Future(coro=task())
     @property
-    def _shape(self) -> Shape:
-        return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
+    def host_mesh(self) -> "HostMesh":
+        if self._host_mesh is None:
+            raise NotImplementedError(
+                "NYI complete for release 0.1 (ProcMeshRef knowing its host mesh)"
+            )
+        return self._host_mesh
     @property
     def _ndslice(self) -> Slice:
@@ -173,17 +234,34 @@ class ProcMesh(MeshTrait):
         return self._shape.labels
     def _new_with_shape(self, shape: Shape) -> "ProcMesh":
+        # make sure that if we slice something with unity,
+        # we do not lose the ability to spawn on it.
+        # remove when spawn on slices is implemented.
+        if shape == self._shape:
+            return self
         device_mesh = (
             None
             if self._maybe_device_mesh is None
             else self._device_mesh._new_with_shape(shape)
         )
-        return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
+        pm = ProcMesh(self._proc_mesh, shape, _device_mesh=device_mesh)
+        pm._slice = True
+        return pm
-    def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> Future[T]:
-        if self._mock_shape is not None:
+    def spawn(self, name: str, Class: Type[T], *args: Any, **kwargs: Any) -> T:
+        if self._slice:
             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
-        return Future(coro=self._spawn_nonblocking(name, Class, *args, **kwargs))
+        return self._spawn_nonblocking(name, Class, *args, **kwargs)
+    @property
+    async def _proc_mesh_for_asyncio_fixme(self) -> HyProcMesh:
+        """
+        Get ProcMesh on the asyncio event stream.
+        We should redo this functionality to work on the tokio stream.
+        This must be called on the asyncio stream.
+        """
+        assert asyncio.get_running_loop() is not None
+        return await Future(coro=self._proc_mesh.task())
     async def monitor(self) -> ProcMeshMonitor:
         """
@@ -201,12 +279,17 @@ class ProcMesh(MeshTrait):
         # Kick off in background
         asyncio.create_task(monitor_loop(monitor))
         """
-        return await self._proc_mesh.monitor()
+        # todo: move monitor to tokio loop
+        proc_mesh = await Future(coro=self._proc_mesh.task())
+        return await proc_mesh.monitor()
     @classmethod
     def from_alloc(
-        self, alloc: Alloc, setup: Callable[[], None] | None = None
-    ) -> Future["ProcMesh"]:
+        self,
+        alloc: AllocHandle,
+        setup: Callable[[], None] | None = None,
+        _attach_controller_controller: bool = True,
+    ) -> "ProcMesh":
         """
         Allocate a process mesh according to the provided alloc.
         Returns when the mesh is fully allocated.
@@ -225,37 +308,98 @@ class ProcMesh(MeshTrait):
             os.environ["LOCAL_RANK"] = str(rank["gpus"])
         ```
         """
-        return Future(
-            coro=_proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors=True)
+        async def task() -> HyProcMesh:
+            return await HyProcMesh.allocate_nonblocking(await alloc._hy_alloc)
+        shape = Shape(
+            list(alloc._extent.keys()),
+            Slice.new_row_major(list(alloc._extent.values())),
         )
+        hy_proc_mesh = PythonTask.from_coroutine(task()).spawn()
+        pm = ProcMesh(hy_proc_mesh, shape)
+        if _attach_controller_controller:
+            instance = context().actor_instance
+            pm._controller_controller = instance._controller_controller
+            instance._add_child(pm)
+        async def task(
+            pm: "ProcMesh",
+            hy_proc_mesh_task: "Shared[HyProcMesh]",
+            setup_actor: Optional[SetupActor],
+            stream_log_to_client: bool,
+        ) -> HyProcMesh:
+            hy_proc_mesh = await hy_proc_mesh_task
+            await pm._logging_manager.init(hy_proc_mesh, stream_log_to_client)
+            if setup_actor is not None:
+                await setup_actor.setup.call()
+            return hy_proc_mesh
+        setup_actor = None
+        if setup is not None:
+            # If the user has passed the setup lambda, we need to call
+            # it here before any of the other actors are spawned so that
+            # the environment variables are set up before cuda init.
+            setup_actor = pm._spawn_nonblocking_on(
+                hy_proc_mesh, "setup", SetupActor, setup
+            )
+        pm._proc_mesh = PythonTask.from_coroutine(
+            task(pm, hy_proc_mesh, setup_actor, alloc.stream_logs)
+        ).spawn()
+        return pm
     def __repr__(self) -> str:
         return repr(self._proc_mesh)
     def __str__(self) -> str:
         return str(self._proc_mesh)
-    async def _spawn_nonblocking(
+    def _spawn_nonblocking(
         self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+    ) -> T:
+        return self._spawn_nonblocking_on(self._proc_mesh, name, Class, *args, **kwargs)
+    def to_table(self) -> str:
+        return self._device_mesh.to_table()
+    def _spawn_nonblocking_on(
+        self,
+        pm: "Shared[HyProcMesh]",
+        name: str,
+        Class: Type[T],
+        *args: Any,
+        **kwargs: Any,
     ) -> T:
         if not issubclass(Class, Actor):
             raise ValueError(
                 f"{Class} must subclass monarch.service.Actor to spawn it."
             )
-        actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
-        service = ActorMeshRef(
+        actor_mesh = HyProcMesh.spawn_async(pm, name, _Actor, _use_standin_mesh())
+        instance = context().actor_instance
+        service = ActorMesh._create(
             Class,
-            _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh, self),
-            self._mailbox,
+            actor_mesh,
+            instance._mailbox,
+            self._shape,
+            self,
+            self._controller_controller,
+            *args,
+            **kwargs,
         )
-        # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
-        # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
-        service._create(args, kwargs)
+        instance._add_child(service)
         return cast(T, service)
     @property
     def _device_mesh(self) -> "DeviceMesh":
-        if not HAS_TENSOR_ENGINE:
+        if not _has_tensor_engine():
             raise RuntimeError(
                 "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
             )
@@ -264,7 +408,7 @@ class ProcMesh(MeshTrait):
         from monarch.mesh_controller import spawn_tensor_engine  # @manual
         if self._maybe_device_mesh is None:
-            if self._mock_shape is not None:
+            if self._slice:
                 raise NotImplementedError(
                     "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
                 )
@@ -282,52 +426,97 @@ class ProcMesh(MeshTrait):
     def rank_tensors(self) -> Dict[str, "Tensor"]:
         return self._device_mesh.ranks
-    async def sync_workspace(self, auto_reload: bool = False) -> None:
+    async def sync_workspace(
+        self,
+        workspace: Workspace,
+        conda: bool = False,
+        auto_reload: bool = False,
+    ) -> None:
         if self._code_sync_client is None:
             self._code_sync_client = CodeSyncMeshClient.spawn_blocking(
-                proc_mesh=self._proc_mesh,
+                proc_mesh=await self._proc_mesh_for_asyncio_fixme,
             )
         # TODO(agallagher): We need some way to configure and pass this
         # in -- right now we're assuming the `gpu` dimension, which isn't
         # correct.
         # The workspace shape (i.e. only perform one rsync per host).
-        assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+        assert set(self._shape.labels).issubset({"gpus", "hosts"})
+        workspaces = []
+        for src_dir, dst_dir in workspace.dirs.items():
+            workspaces.append(
+                WorkspaceConfig(
+                    local=Path(src_dir),
+                    remote=RemoteWorkspace(
+                        location=WorkspaceLocation.FromEnvVar(
+                            env="WORKSPACE_DIR",
+                            relpath=dst_dir,
+                        ),
+                        shape=WorkspaceShape.shared("gpus"),
+                    ),
+                    method=CodeSyncMethod.Rsync,
+                ),
+            )
+        # If `conda` is set, also sync the currently activated conda env.
+        conda_prefix = conda_utils.active_env_dir()
+        if isinstance(workspace.env, CondaEnvironment):
+            conda_prefix = workspace.env._conda_prefix
+        if conda and conda_prefix is not None:
+            conda_prefix = Path(conda_prefix)
+            # Resolve top-level symlinks for rsync/conda-sync.
+            while conda_prefix.is_symlink():
+                conda_prefix = conda_prefix.parent / conda_prefix.readlink()
+            workspaces.append(
+                WorkspaceConfig(
+                    local=conda_prefix,
+                    remote=RemoteWorkspace(
+                        location=WorkspaceLocation.FromEnvVar(
+                            env="CONDA_PREFIX",
+                            relpath="",
+                        ),
+                        shape=WorkspaceShape.shared("gpus"),
+                    ),
+                    method=CodeSyncMethod.CondaSync,
+                ),
+            )
         assert self._code_sync_client is not None
-        await self._code_sync_client.sync_workspace(
-            # TODO(agallagher): Is there a better way to infer/set the local
-            # workspace dir, rather than use PWD?
-            local=os.getcwd(),
-            remote=RemoteWorkspace(
-                location=WorkspaceLocation.FromEnvVar("WORKSPACE_DIR"),
-                shape=WorkspaceShape.shared("gpus"),
-            ),
+        await self._code_sync_client.sync_workspaces(
+            workspaces=workspaces,
             auto_reload=auto_reload,
         )
     async def logging_option(
         self,
-        stream_to_client: bool = False,
-        aggregate_window_sec: int | None = None,
+        stream_to_client: bool = True,
+        aggregate_window_sec: int | None = 3,
+        level: int = logging.INFO,
     ) -> None:
         """
         Set the logging options for the remote processes
         Args:
             stream_to_client (bool): If True, logs from the remote processes will be streamed to the client.
-            Defaults to False.
+            Defaults to True.
             aggregate_window_sec (Optional[int]): If not None, logs from the remote processes will be aggregated
-            and sent to the client every aggregate_window_sec seconds. Defaults to None, meaning no aggregation.
-            aggregate_window_sec will be ignored if stream_to_client is False.
+            and sent to the client every aggregate_window_sec seconds. Defaults to 3 seconds; None means no aggregation.
+            Error will be thrown if aggregate_window_sec is set and stream_to_client is False.
+            level (int): The logging level of the logger. Defaults to logging.INFO.
         Returns:
             None
         """
-        if self._logging_mesh_client is None:
-            self._logging_mesh_client = await LoggingMeshClient.spawn(
-                proc_mesh=self._proc_mesh
-            )
-        self._logging_mesh_client.set_mode(
-            stream_to_client, aggregate_window_sec=aggregate_window_sec
+        await self.initialized
+        await self._logging_manager.logging_option(
+            stream_to_client=stream_to_client,
+            aggregate_window_sec=aggregate_window_sec,
+            level=level,
         )
     async def __aenter__(self) -> "ProcMesh":
@@ -336,8 +525,10 @@ class ProcMesh(MeshTrait):
         return self
     def stop(self) -> Future[None]:
+        self._logging_manager.stop()
         async def _stop_nonblocking() -> None:
-            await self._proc_mesh.stop_nonblocking()
+            await (await self._proc_mesh).stop_nonblocking()
             self._stopped = True
         return Future(coro=_stop_nonblocking())
@@ -353,6 +544,8 @@ class ProcMesh(MeshTrait):
     # Finalizer to check if the proc mesh was closed properly.
     def __del__(self) -> None:
         if not self._stopped:
+            self._logging_manager.stop()
             warnings.warn(
                 f"unstopped ProcMesh {self!r}",
                 ResourceWarning,
@@ -361,17 +554,59 @@ class ProcMesh(MeshTrait):
             )
             # Cannot call stop here because it is async.
+    def __reduce_ex__(self, protocol: ...) -> Tuple[Any, Tuple[Any, ...]]:
+        # Ultra-hack. Remote python actors can get a reference to this proc mesh that
+        # doesn't have any real functionality, but if they send a request back to the client
+        # where the real proc mesh exists, the client can look it up in the proc mesh registry
+        # and do something with it.
+        global _proc_mesh_registry
+        _proc_mesh_registry[ProcMeshRef(self._proc_mesh_id)] = self
+        return (ProcMeshRef._fake_proc_mesh, (self._proc_mesh_id,))
+    @staticmethod
+    def _from_ref(proc_mesh_ref: ProcMeshRef) -> "ProcMesh":
+        maybe_proc_mesh = _proc_mesh_registry.get(proc_mesh_ref, None)
+        if maybe_proc_mesh is None:
+            raise RuntimeError(
+                f"ProcMesh with id {proc_mesh_ref._proc_mesh_id} does not exist"
+            )
+        return maybe_proc_mesh
+def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
+    warnings.warn(
+        "Use monarch._src.actor.host_mesh.fake_in_process_host().spawn_procs for testing. For launching an actor in the current process use this_proc().spawn_procs()",
+        DeprecationWarning,
+        stacklevel=2,
+    )
-def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
-    return Future(
-        coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=LocalAllocator())
+    return _proc_mesh_from_allocator(
+        allocator=LocalAllocator(),
+        gpus=gpus,
+        hosts=hosts,
     )
-def sim_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
-    return Future(
-        coro=_proc_mesh_coro(gpus=gpus, hosts=hosts, allocator=SimAllocator())
+def sim_proc_mesh(
+    *,
+    gpus: int = 1,
+    hosts: int = 1,
+    racks: int = 1,
+    zones: int = 1,
+    dcs: int = 1,
+    regions: int = 1,
+) -> ProcMesh:
+    spec: AllocSpec = AllocSpec(
+        AllocConstraints(),
+        hosts=hosts,
+        gpus=gpus,
+        racks=racks,
+        zones=zones,
+        dcs=dcs,
+        regions=regions,
     )
+    alloc = SimAllocator().allocate(spec)
+    return ProcMesh.from_alloc(alloc, None, True)
 _BOOTSTRAP_MAIN = "monarch._src.actor.bootstrap_main"
@@ -392,25 +627,19 @@ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
     return cmd, args, env
-async def _proc_mesh_from_alloc_coro(
-    alloc: Alloc,
-    setup: Callable[[], None] | None,
-    init_manager_actors: bool,
-) -> ProcMesh:
-    _hy_proc_mesh = await HyProcMesh.allocate_nonblocking(alloc)
-    proc_mesh = ProcMesh(_hy_proc_mesh)
-    if init_manager_actors:
-        await proc_mesh._init_manager_actors(setup)
-    return proc_mesh
+async def _hy_proc_mesh_from_alloc_coro(
+    alloc: "Shared[Alloc] | PythonTask[Alloc]",
+) -> HyProcMesh:
+    return await HyProcMesh.allocate_nonblocking(await alloc)
-async def _proc_mesh_coro(
+def _proc_mesh_from_allocator(
     *,
     allocator: AllocateMixin,
-    gpus: Optional[int] = None,
-    hosts: int = 1,
+    gpus: Optional[int],
+    hosts: int,
     setup: Callable[[], None] | None = None,
-    init_manager_actors: bool = True,
+    _attach_controller_controller: bool = True,
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
@@ -418,9 +647,8 @@ async def _proc_mesh_coro(
     # test_remote_function_all_gather expects that hosts comes before gpus
     # in the order of the dimensions.
     spec: AllocSpec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
-    alloc = await allocator.allocate_nonblocking(spec)
-    return await _proc_mesh_from_alloc_coro(alloc, setup, init_manager_actors)
+    alloc = allocator.allocate(spec)
+    return ProcMesh.from_alloc(alloc, setup, _attach_controller_controller)
 def proc_mesh(
@@ -429,53 +657,92 @@ def proc_mesh(
     hosts: int = 1,
     env: dict[str, str] | None = None,
     setup: Callable[[], None] | None = None,
-) -> Future[ProcMesh]:
-    env = env or {}
+) -> ProcMesh:
+    warnings.warn(
+        "use this_host().spawn_procs(per_host = {'hosts': 2, 'gpus': 3}) instead of monarch.actor.proc_mesh(hosts=2, gpus=3)",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    env = env or {}
     # Todo: Deprecate the env field from the ProcessAllocator
     # The PAR_MAIN_OVERRIDE needs to be passed as an env
     # to the proc mesh construction in rust, so can not be moved to the
     # SetupActor yet
     cmd, args, bootstrap_env = _get_bootstrap_args()
     env.update(bootstrap_env)
-    task = _proc_mesh_coro(
-        gpus=gpus,
+    return _proc_mesh_from_allocator(
+        allocator=ProcessAllocator(cmd, args, env),
         hosts=hosts,
+        gpus=gpus,
         setup=setup,
-        allocator=ProcessAllocator(cmd, args, env),
-        init_manager_actors=True,
+        _attach_controller_controller=True,
     )
-    return Future(coro=task)
-_debug_proc_mesh: Optional["ProcMesh"] = None
+_ActorType = TypeVar("_ActorType", bound=Actor)
-# Lazy init of the debug proc mesh so that importing monarch.proc_mesh
-# doesn't trigger the debug client to spawn, which could cause confusing
-# logs. This is defined in proc_mesh.py instead of debugger.py for
-# circular import reasons.
-async def _get_debug_proc_mesh() -> "ProcMesh":
-    global _debug_proc_mesh
-    if _debug_proc_mesh is None:
-        _debug_proc_mesh = await _proc_mesh_coro(
-            gpus=1, hosts=1, allocator=LocalAllocator(), init_manager_actors=False
-        )
-    return _debug_proc_mesh
+class _ControllerController(Actor):
+    def __init__(self) -> None:
+        self._controllers: Dict[str, Actor] = {}
-_debug_client_mesh: Optional[DebugClient] = None
+    # pyre-ignore
+    @endpoint
+    def get_or_spawn(
+        self, name: str, Class: Type[_ActorType], *args: Any, **kwargs: Any
+    ) -> _ActorType:
+        if name not in self._controllers:
+            proc_mesh = _proc_mesh_from_allocator(
+                gpus=1,
+                hosts=1,
+                allocator=LocalAllocator(),
+            )
+            self._controllers[name] = proc_mesh.spawn(name, Class, *args, **kwargs)
+        return cast(_ActorType, self._controllers[name])
+_cc_init = threading.Lock()
+_cc_proc_mesh: Optional["ProcMesh"] = None
+_controller_controller: Optional["_ControllerController"] = None
+# Lazy init so that the controller_controller and proc do not produce logs when they aren't used.
+# Checking for the controller (when it does not already exist in the MonarchContext) needs a lock,
+# otherwise two initializing procs will both try to init resulting in duplicates. The critical
+# region is not blocking: it spawns a separate task to do the init, assigns the
+# Shared[_ControllerController] from that task to the global and releases the lock.
+def _get_controller_controller() -> "Tuple[ProcMesh, _ControllerController]":
+    global _controller_controller, _cc_proc_mesh
+    with _cc_init:
+        if _controller_controller is None:
+            alloc = LocalAllocator().allocate(AllocSpec(AllocConstraints()))
+            _cc_proc_mesh = ProcMesh.from_alloc(
+                alloc, _attach_controller_controller=False
+            )
+            _controller_controller = _cc_proc_mesh.spawn(
+                "controller_controller", _ControllerController
+            )
+    assert _cc_proc_mesh is not None
+    return _cc_proc_mesh, _controller_controller
-# Lazy init for the same reason as above. This is defined in proc_mesh.py
-# instead of debugger.py for circular import reasons.
-async def _debug_client() -> DebugClient:
-    global _debug_client_mesh
-    if _debug_client_mesh is None:
-        mesh = await _get_debug_proc_mesh()
-        _debug_client_mesh = await mesh._spawn_nonblocking("debug_client", DebugClient)
-    return _debug_client_mesh
+def get_or_spawn_controller(
+    name: str, Class: Type["_ActorType"], *args: Any, **kwargs: Any
+) -> Future["_ActorType"]:
+    """
+    Creates a singleton actor (controller) indexed by name, or if it already exists, returns the
+    existing actor.
+    Args:
+        name (str): The unique name of the actor, used as a key for retrieval.
+        Class (Type): The class of the actor to spawn. Must be a subclass of Actor.
+        *args (Any): Positional arguments to pass to the actor constructor.
+        **kwargs (Any): Keyword arguments to pass to the actor constructor.
-def debug_client() -> DebugClient:
-    return Future(coro=_debug_client()).get()
+    Returns:
+        A Future that resolves to a reference to the actor.
+    """
+    return context().actor_instance._controller_controller.get_or_spawn.call_one(
+        name, Class, *args, **kwargs
+    )