torchmonarch-nightly 2025.6.27__cp312-cp312-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. monarch/__init__.py +189 -0
  2. monarch/_monarch/__init__.py +5 -0
  3. monarch/_monarch/hyperactor/__init__.py +58 -0
  4. monarch/_monarch/selection/__init__.py +13 -0
  5. monarch/_monarch/worker/__init__.py +0 -0
  6. monarch/_monarch/worker/debugger.py +117 -0
  7. monarch/_monarch/worker/logging.py +107 -0
  8. monarch/_rust_bindings.so +0 -0
  9. monarch/_testing.py +230 -0
  10. monarch/actor_mesh.py +761 -0
  11. monarch/allocator.py +220 -0
  12. monarch/bootstrap_main.py +59 -0
  13. monarch/builtins/__init__.py +14 -0
  14. monarch/builtins/log.py +22 -0
  15. monarch/builtins/random.py +68 -0
  16. monarch/cached_remote_function.py +257 -0
  17. monarch/code_sync.py +10 -0
  18. monarch/common/_C.pyi +11 -0
  19. monarch/common/_C.so +0 -0
  20. monarch/common/__init__.py +0 -0
  21. monarch/common/_coalescing.py +308 -0
  22. monarch/common/_device_utils.py +18 -0
  23. monarch/common/_tensor_to_table.py +172 -0
  24. monarch/common/base_tensor.py +28 -0
  25. monarch/common/borrows.py +143 -0
  26. monarch/common/client.py +690 -0
  27. monarch/common/constants.py +10 -0
  28. monarch/common/context_manager.py +40 -0
  29. monarch/common/controller_api.py +104 -0
  30. monarch/common/device_mesh.py +417 -0
  31. monarch/common/fake.py +55 -0
  32. monarch/common/function.py +160 -0
  33. monarch/common/function_caching.py +164 -0
  34. monarch/common/future.py +168 -0
  35. monarch/common/invocation.py +125 -0
  36. monarch/common/mast.py +221 -0
  37. monarch/common/messages.py +573 -0
  38. monarch/common/mock_cuda.py +41 -0
  39. monarch/common/opaque_ref.py +98 -0
  40. monarch/common/pickle_flatten.py +48 -0
  41. monarch/common/pipe.py +152 -0
  42. monarch/common/process_group.py +55 -0
  43. monarch/common/recording.py +127 -0
  44. monarch/common/reference.py +33 -0
  45. monarch/common/remote.py +297 -0
  46. monarch/common/selection.py +9 -0
  47. monarch/common/shape.py +229 -0
  48. monarch/common/stream.py +114 -0
  49. monarch/common/tensor.py +814 -0
  50. monarch/common/tensor_factory.py +31 -0
  51. monarch/common/tree.py +73 -0
  52. monarch/controller/__init__.py +7 -0
  53. monarch/controller/backend.py +223 -0
  54. monarch/controller/controller.py +223 -0
  55. monarch/controller/debugger.py +47 -0
  56. monarch/controller/history.py +90 -0
  57. monarch/controller/rust_backend/__init__.py +7 -0
  58. monarch/controller/rust_backend/controller.py +245 -0
  59. monarch/debugger.py +379 -0
  60. monarch/fetch.py +55 -0
  61. monarch/future.py +76 -0
  62. monarch/gradient/__init__.py +11 -0
  63. monarch/gradient/_gradient_generator.pyi +22 -0
  64. monarch/gradient/_gradient_generator.so +0 -0
  65. monarch/gradient_generator.py +185 -0
  66. monarch/memory.py +43 -0
  67. monarch/mesh_controller.py +271 -0
  68. monarch/monarch_controller +0 -0
  69. monarch/notebook.py +761 -0
  70. monarch/opaque_module.py +235 -0
  71. monarch/opaque_object.py +88 -0
  72. monarch/parallel/__init__.py +9 -0
  73. monarch/parallel/pipelining/__init__.py +7 -0
  74. monarch/parallel/pipelining/runtime.py +847 -0
  75. monarch/parallel/pipelining/schedule_ir.py +692 -0
  76. monarch/parallel/pipelining/scheduler.py +249 -0
  77. monarch/pdb_wrapper.py +135 -0
  78. monarch/proc_mesh.py +299 -0
  79. monarch/profiler.py +160 -0
  80. monarch/python_local_mesh.py +107 -0
  81. monarch/random.py +61 -0
  82. monarch/rdma.py +162 -0
  83. monarch/remote_class.py +114 -0
  84. monarch/rust_backend_mesh.py +280 -0
  85. monarch/rust_local_mesh.py +1402 -0
  86. monarch/sim_mesh.py +359 -0
  87. monarch/simulator/__init__.py +7 -0
  88. monarch/simulator/command_history.py +424 -0
  89. monarch/simulator/config.py +21 -0
  90. monarch/simulator/interface.py +59 -0
  91. monarch/simulator/ir.py +770 -0
  92. monarch/simulator/mock_controller.py +214 -0
  93. monarch/simulator/profiling.py +424 -0
  94. monarch/simulator/simulator.py +1052 -0
  95. monarch/simulator/task.py +255 -0
  96. monarch/simulator/tensor.py +373 -0
  97. monarch/simulator/trace.py +395 -0
  98. monarch/simulator/utils.py +41 -0
  99. monarch/simulator/worker.py +389 -0
  100. monarch/telemetry.py +19 -0
  101. monarch/tensor_worker_main.py +260 -0
  102. monarch/tensorboard.py +84 -0
  103. monarch/timer/__init__.py +21 -0
  104. monarch/timer/example_monarch.py +78 -0
  105. monarch/timer/example_spmd.py +55 -0
  106. monarch/timer/execution_timer.py +199 -0
  107. monarch/timer/execution_timer_test.py +131 -0
  108. monarch/tools/__init__.py +7 -0
  109. monarch/tools/cli.py +167 -0
  110. monarch/tools/commands.py +251 -0
  111. monarch/tools/components/__init__.py +7 -0
  112. monarch/tools/components/hyperactor.py +58 -0
  113. monarch/tools/config/__init__.py +20 -0
  114. monarch/tools/config/defaults.py +54 -0
  115. monarch/tools/mesh_spec.py +165 -0
  116. monarch/tools/network.py +69 -0
  117. monarch/worker/__init__.py +7 -0
  118. monarch/worker/_testing_function.py +481 -0
  119. monarch/worker/compiled_block.py +270 -0
  120. monarch/worker/debugger.py +125 -0
  121. monarch/worker/lines.py +47 -0
  122. monarch/worker/monitor.py +53 -0
  123. monarch/worker/worker.py +1191 -0
  124. monarch/world_mesh.py +34 -0
  125. monarch_supervisor/__init__.py +1044 -0
  126. monarch_supervisor/_testing.py +44 -0
  127. monarch_supervisor/function_call.py +30 -0
  128. monarch_supervisor/host.py +386 -0
  129. monarch_supervisor/launchers.py +145 -0
  130. monarch_supervisor/log_pstree.py +48 -0
  131. monarch_supervisor/logging.py +103 -0
  132. monarch_supervisor/python_executable.py +42 -0
  133. tests/__init__.py +0 -0
  134. tests/dispatch_bench.py +124 -0
  135. tests/dispatch_bench_helper.py +25 -0
  136. tests/error_test_binary.py +180 -0
  137. tests/simulator/__init__.py +0 -0
  138. tests/simulator/test_profiling.py +136 -0
  139. tests/simulator/test_simulator.py +411 -0
  140. tests/simulator/test_task.py +64 -0
  141. tests/simulator/test_worker.py +102 -0
  142. tests/sleep_binary.py +35 -0
  143. tests/test_actor_error.py +240 -0
  144. tests/test_alloc.py +25 -0
  145. tests/test_allocator.py +365 -0
  146. tests/test_coalescing.py +492 -0
  147. tests/test_controller.py +845 -0
  148. tests/test_device_mesh.py +132 -0
  149. tests/test_fault_tolerance.py +398 -0
  150. tests/test_future.py +94 -0
  151. tests/test_grad_generator.py +121 -0
  152. tests/test_mock_cuda.py +74 -0
  153. tests/test_pdb_actor.py +110 -0
  154. tests/test_python_actors.py +736 -0
  155. tests/test_remote_functions.py +1271 -0
  156. tests/test_rust_backend.py +217 -0
  157. tests/test_signal_safe_block_on.py +103 -0
  158. tests/test_sim_backend.py +54 -0
  159. tests/test_tensor_engine.py +52 -0
  160. torchmonarch_nightly-2025.6.27.dist-info/METADATA +94 -0
  161. torchmonarch_nightly-2025.6.27.dist-info/RECORD +165 -0
  162. torchmonarch_nightly-2025.6.27.dist-info/WHEEL +5 -0
  163. torchmonarch_nightly-2025.6.27.dist-info/entry_points.txt +3 -0
  164. torchmonarch_nightly-2025.6.27.dist-info/licenses/LICENSE +29 -0
  165. torchmonarch_nightly-2025.6.27.dist-info/top_level.txt +3 -0
monarch/parallel/pipelining/scheduler.py ADDED
@@ -0,0 +1,249 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import math
+ from functools import cache
+ from logging import getLogger
+ from timeit import default_timer as timer
+
+ from .schedule_ir import (
+     _Action,
+     _add_send_recv,
+     _ComputationType,
+     _dump_csv,
+     _format_pipeline_order,
+     _merge_bw,
+     BACKWARD,
+     FORWARD,
+     FULL_BACKWARD,
+ )
+
+ logger = getLogger()
+
+
+ def get_stage_str(model_chunk_index, training_stage, mb_index):
+     ctype = _ComputationType.from_str(training_stage)
+     return str(_Action(model_chunk_index, ctype, mb_index))
+
+
+ def get_dora_schedule(
+     num_model_chunks,
+     pipeline_parallel_size,
+     num_round,
+     num_microbatch_per_round,
+     zero_bubble,
+     total_num_microbatches,
+     num_microbatches,
+     dfs=False,
+     prefetch_weight_latency=1.0,
+     enable_weight_sharding_in_pp=False,
+     enable_wgrad_sharding_in_pp=False,
+ ):
+     start_time = timer()
+     num_warmup_microbatches_list = []
+     num_1f1b_microbatches_list = []
+     num_additional_1b1w_list = []
+     for pipeline_parallel_rank in range(pipeline_parallel_size):
+         num_warmup_microbatches = 0
+         # The number of microbatches that last pipeline stage run before 1f1b.
+         num_warmup_microbatches += (num_model_chunks - 1) * num_microbatch_per_round
+         # From last PP stage up, each rank will be 2 more than the previous one.
+         num_warmup_microbatches += (
+             pipeline_parallel_size - pipeline_parallel_rank - 1
+         ) * 2
+         num_warmup_microbatches = min(num_warmup_microbatches, total_num_microbatches)
+         # The number of 1f1b for zero bubble schedule
+         if num_microbatches == pipeline_parallel_size:
+             num_1f1b_microbatches = pipeline_parallel_rank
+         else:
+             num_1f1b_microbatches = 2 * pipeline_parallel_rank
+         num_additional_1b1w = max(
+             int(math.ceil((pipeline_parallel_size - 4) / 2)) - pipeline_parallel_rank,
+             0,
+         )
+         if dfs:
+             num_1f1b_microbatches = 0
+             num_additional_1b1w = 0
+
+         num_warmup_microbatches_list.append(num_warmup_microbatches)
+         num_1f1b_microbatches_list.append(num_1f1b_microbatches)
+         num_additional_1b1w_list.append(num_additional_1b1w)
+     schedules = []
+
+     def get_last_pp_rank(i):
+         return (i - 1) % pipeline_parallel_size, i - 1 < 0
+
+     def get_next_pp_rank(i):
+         return (i + 1) % pipeline_parallel_size, i + 1 >= pipeline_parallel_size
+
+     for pipeline_parallel_rank in range(pipeline_parallel_size):
+         s = []
+         fwd_mb_index_list = [0 for i in range(num_model_chunks)]
+         bwd_mb_index_list = [0 for i in range(num_model_chunks)]
+         fwd_model_chunk_index = 0
+         bwd_model_chunk_index = num_model_chunks - 1
+         weight_store = []
+         num_warmup_microbatches = num_warmup_microbatches_list[pipeline_parallel_rank]
+         num_1f1b_microbatches = num_1f1b_microbatches_list[pipeline_parallel_rank]
+         num_additional_1b1w = num_additional_1b1w_list[pipeline_parallel_rank]
+         fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+         bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+         fill_1b1w = False
+         for _ in range(num_warmup_microbatches):  # warm up fwd
+             fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+             bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+             tmp = get_stage_str(fwd_model_chunk_index, "F", fwd_mb_index)
+             s.append(tmp)
+             fwd_mb_index_list[fwd_model_chunk_index] += 1
+             if fwd_mb_index_list[fwd_model_chunk_index] % num_microbatch_per_round == 0:
+                 if fwd_model_chunk_index < num_model_chunks - 1:
+                     fwd_model_chunk_index += 1
+                 else:
+                     fwd_model_chunk_index = 0
+         for i in range(
+             total_num_microbatches - num_warmup_microbatches
+         ):  # 1f1b and 1f1b1w
+             if (
+                 fwd_model_chunk_index == 1 and not fill_1b1w
+             ):  # additional 1b1w to fill before fwd
+                 fill_1b1w = True
+                 for _ in range(num_additional_1b1w):
+                     bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+                     tmp = get_stage_str(bwd_model_chunk_index, "B", bwd_mb_index)
+                     s.append(tmp)
+                     tmp = get_stage_str(bwd_model_chunk_index, "W", bwd_mb_index)
+                     s.append(tmp)
+                     bwd_mb_index_list[bwd_model_chunk_index] += 1
+                     if (
+                         bwd_mb_index_list[bwd_model_chunk_index]
+                         % num_microbatch_per_round
+                         == 0
+                     ):
+                         if bwd_model_chunk_index > 0:
+                             bwd_model_chunk_index -= 1
+                         else:
+                             bwd_model_chunk_index = num_model_chunks - 1
+             fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+             bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+             tmp = get_stage_str(fwd_model_chunk_index, "F", fwd_mb_index)
+             s.append(tmp)
+             fwd_mb_index_list[fwd_model_chunk_index] += 1
+             if fwd_mb_index_list[fwd_model_chunk_index] % num_microbatch_per_round == 0:
+                 if fwd_model_chunk_index < num_model_chunks - 1:
+                     fwd_model_chunk_index += 1
+                 else:
+                     fwd_model_chunk_index = 0
+             tmp = get_stage_str(
+                 bwd_model_chunk_index, "B" if zero_bubble else "BW", bwd_mb_index
+             )
+             s.append(tmp)
+             tmp = get_stage_str(bwd_model_chunk_index, "W", bwd_mb_index)
+             if zero_bubble and i < num_1f1b_microbatches:
+                 weight_store.append(tmp)
+             else:
+                 s.append(tmp)
+             bwd_mb_index_list[bwd_model_chunk_index] += 1
+             if bwd_mb_index_list[bwd_model_chunk_index] % num_microbatch_per_round == 0:
+                 if bwd_model_chunk_index > 0:
+                     bwd_model_chunk_index -= 1
+                 else:
+                     bwd_model_chunk_index = num_model_chunks - 1
+         num_cooldown = (
+             num_warmup_microbatches - num_additional_1b1w
+             if fill_1b1w
+             else num_warmup_microbatches
+         )
+         for _ in range(num_cooldown):  # cooldown bwd
+             fwd_mb_index = fwd_mb_index_list[fwd_model_chunk_index]
+             bwd_mb_index = bwd_mb_index_list[bwd_model_chunk_index]
+             tmp = get_stage_str(bwd_model_chunk_index, "B", bwd_mb_index)
+             s.append(tmp)
+             tmp = get_stage_str(bwd_model_chunk_index, "W", bwd_mb_index)
+             s.append(tmp)
+             bwd_mb_index_list[bwd_model_chunk_index] += 1
+             if bwd_mb_index_list[bwd_model_chunk_index] % num_microbatch_per_round == 0:
+                 if bwd_model_chunk_index > 0:
+                     bwd_model_chunk_index -= 1
+                 else:
+                     bwd_model_chunk_index = num_model_chunks - 1
+         if len(weight_store) > 0:
+             s += weight_store
+         schedules.append(s)
+
+     compute_schedules = {}
+     for rank in range(pipeline_parallel_size):
+         compute_schedules[rank] = []
+         for action_str in schedules[rank]:
+             action = _Action.from_str(action_str)
+             stage_index = action.stage_index * pipeline_parallel_size + rank
+             action = _Action(
+                 stage_index, action.computation_type, action.microbatch_index
+             )
+             compute_schedules[rank].append(action)
+
+     lowered_comm_schedule = compute_schedules
+     for rank in lowered_comm_schedule:
+         lowered_comm_schedule[rank] = _merge_bw(lowered_comm_schedule[rank])
+
+     dump_scheduler_ir = True
+     if dump_scheduler_ir:
+         compute_str = _format_pipeline_order(lowered_comm_schedule)
+         with open("lowered_compute.log", "w") as logf:
+             logf.write(compute_str)
+         _dump_csv(compute_schedules, "lowered_compute.csv")
+
+     lowered_comm_schedule = _add_send_recv(
+         lowered_comm_schedule,
+         stage_to_rank=lambda chunk_index: chunk_index % pipeline_parallel_size,
+         num_stages=num_model_chunks * pipeline_parallel_size,
+     )
+
+     comms_str = _format_pipeline_order(lowered_comm_schedule)
+     if dump_scheduler_ir:
+         with open("lowered_comms.log", "w") as logf:
+             logf.write(comms_str)
+         _dump_csv(lowered_comm_schedule, "lowered_compute_with_send_recv.csv")
+     logger.debug("---------- lowered IR\n%s----------", comms_str)
+
+     if not enable_weight_sharding_in_pp and not enable_wgrad_sharding_in_pp:
+         return lowered_comm_schedule
+
+     generation_time = timer() - start_time
+     logger.info(f"schedule generation took {generation_time:.6f} seconds")
+
+     return lowered_comm_schedule
+
+
+ # TODO - replace bfs / dfs functions below with new IR generators
+ ir_schedules = {
+     # "dora": get_dora_schedule,
+     "dora-dfs": lambda *args, **kwargs: get_dora_schedule(*args, **kwargs, dfs=True),
+     # "zbv": get_zbv_schedule,
+     # "zbw": get_zbw_schedule,
+ }
+
+ is_zero_bubble = {
+     # "dora": True,
+     "dora-dfs": True,
+     # "zbv": True,
+     # "zbw": True,
+ }
+
+
+ @cache
+ def generate_schedule(name: str, *args, **kwargs):
+     assert name in ir_schedules, f"{name} is not a supported schedule type"
+     schedules = ir_schedules[name](*args, **kwargs)
+     stage_to_rank = {}
+     for rank, schedule_actions_rank in schedules.items():
+         for action in schedule_actions_rank:
+             comp_type = action.computation_type
+             stage_idx = action.stage_index
+             if comp_type == FORWARD:
+                 stage_to_rank[stage_idx] = rank
+             if comp_type in (BACKWARD, FULL_BACKWARD):
+                 stage_to_rank[stage_idx] = rank
+     return schedules, stage_to_rank
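
Note: `generate_schedule` above is the cached entry point. It looks up an IR generator by name ("dora-dfs" is the only one enabled in this file), runs it, and derives a stage-to-rank map from the FORWARD/BACKWARD actions. The call below is a minimal invocation sketch and not part of the wheel; the positional values are illustrative assumptions that must be mutually consistent with your pipeline setup, and `get_dora_schedule` will also write `lowered_compute.log`/`.csv` and `lowered_comms.log` because `dump_scheduler_ir` is hard-coded to True.

from monarch.parallel.pipelining.scheduler import generate_schedule

# Positional args mirror get_dora_schedule(num_model_chunks, pipeline_parallel_size,
# num_round, num_microbatch_per_round, zero_bubble, total_num_microbatches,
# num_microbatches); the concrete numbers here are illustrative only.
schedules, stage_to_rank = generate_schedule(
    "dora-dfs",
    2,     # num_model_chunks
    4,     # pipeline_parallel_size
    1,     # num_round
    8,     # num_microbatch_per_round
    True,  # zero_bubble
    16,    # total_num_microbatches
    8,     # num_microbatches
)
for rank, actions in schedules.items():
    print(rank, [str(a) for a in actions])  # per-rank compute + send/recv actions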
monarch/pdb_wrapper.py ADDED
@@ -0,0 +1,135 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import bdb
+ import inspect
+ import io
+ import pdb  # noqa
+ import socket
+ import sys
+ from dataclasses import dataclass
+
+ from typing import Dict, TYPE_CHECKING
+
+ from monarch._rust_bindings.monarch_hyperactor.proc import ActorId
+
+ if TYPE_CHECKING:
+     from monarch.debugger import DebugClient
+
+
+ @dataclass
+ class DebuggerWrite:
+     payload: bytes
+     function: str | None
+     lineno: int | None
+
+
+ class PdbWrapper(pdb.Pdb):
+     def __init__(
+         self,
+         rank: int,
+         coords: Dict[str, int],
+         actor_id: ActorId,
+         client_ref: "DebugClient",
+         header: str | None = None,
+     ):
+         self.rank = rank
+         self.coords = coords
+         self.header = header
+         self.actor_id = actor_id
+         self.client_ref = client_ref
+         # pyre-ignore
+         super().__init__(stdout=WriteWrapper(self), stdin=ReadWrapper.create(self))
+         self._first = True
+
+     def setup(self, *args, **kwargs):
+         r = super().setup(*args, **kwargs)
+         if self._first:
+             self._first = False
+             # when we enter the debugger, we want to present the user's stack frame
+             # not the nested one inside session.run. This means that the local
+             # variables are what gets printed, etc. To do this
+             # we first execute up 2 to get to that frame.
+             self.do_up(2)
+         return r
+
+     def set_continue(self) -> None:
+         r = super().set_continue()
+         if not self.breaks:
+             # no more breakpoints so this debugger will not
+             # be used again, and we detach from the controller io.
+             self.client_ref.debugger_session_end.call_one(self.rank).get()
+             # break cycle with itself before we exit
+             self.stdin = sys.stdin
+             self.stdout = sys.stdout
+         return r
+
+     def set_trace(self):
+         self.client_ref.debugger_session_start.call_one(
+             self.rank, self.coords, socket.getfqdn(socket.gethostname()), self.actor_id
+         ).get()
+         if self.header:
+             self.message(self.header)
+         super().set_trace()
+
+
+ class ReadWrapper(io.RawIOBase):
+     def __init__(self, session: "PdbWrapper"):
+         self.session = session
+
+     def readinto(self, b):
+         response = self.session.client_ref.debugger_read.call_one(
+             self.session.rank, len(b)
+         ).get()
+         if response == "detach":
+             # this gets injected by the worker event loop to
+             # get the worker thread to exit on an Exit command.
+             raise bdb.BdbQuit
+         assert isinstance(response, DebuggerWrite) and len(response.payload) <= len(b)
+         b[: len(response.payload)] = response.payload
+         return len(response.payload)
+
+     def readable(self) -> bool:
+         return True
+
+     @classmethod
+     def create(cls, session: "PdbWrapper"):
+         return io.TextIOWrapper(io.BufferedReader(cls(session)))
+
+
+ class WriteWrapper:
+     def __init__(self, session: "PdbWrapper"):
+         self.session = session
+
+     def writable(self) -> bool:
+         return True
+
+     def write(self, s: str):
+         function = None
+         lineno = None
+         if self.session.curframe is not None:
+             # pyre-ignore
+             function = f"{inspect.getmodulename(self.session.curframe.f_code.co_filename)}.{self.session.curframe.f_code.co_name}"
+             # pyre-ignore
+             lineno = self.session.curframe.f_lineno
+         self.session.client_ref.debugger_write.call_one(
+             self.session.rank,
+             DebuggerWrite(
+                 s.encode(),
+                 function,
+                 lineno,
+             ),
+         ).get()
+
+     def flush(self):
+         pass
+
+
+ def remote_breakpointhook(
+     rank: int, coords: Dict[str, int], actor_id: ActorId, client_ref: "DebugClient"
+ ):
+     ds = PdbWrapper(rank, coords, actor_id, client_ref)
+     ds.set_trace()
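
Note: `PdbWrapper` proxies pdb's stdin/stdout through endpoint calls on a `DebugClient` actor, so an interactive session can be driven from the controller. The snippet below is a minimal sketch (not code from this module) of how a worker might route Python's built-in `breakpoint()` through `remote_breakpointhook`; the `rank`, `coords`, `actor_id`, and `debug_client` values are assumed to come from the worker's actor context.

import sys

from monarch.pdb_wrapper import remote_breakpointhook

def install_remote_breakpoint(rank, coords, actor_id, debug_client) -> None:
    # breakpoint() consults sys.breakpointhook; route it to the DebugClient
    # so pdb commands are read from and written to the controller session.
    sys.breakpointhook = lambda *args, **kwargs: remote_breakpointhook(
        rank, coords, actor_id, debug_client
    )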
monarch/proc_mesh.py ADDED
@@ -0,0 +1,299 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import os
+ import sys
+ from contextlib import AbstractContextManager
+
+ from typing import (
+     Any,
+     cast,
+     Dict,
+     List,
+     Optional,
+     Sequence,
+     Type,
+     TYPE_CHECKING,
+     TypeVar,
+ )
+
+ if TYPE_CHECKING:
+     import torch
+
+ import monarch
+ from monarch import ActorFuture as Future
+
+ # Conditionally import DeviceMesh and spawn_tensor_engine only if tensor_engine is available
+ # pyre-ignore[21]
+ from monarch._rust_bindings import has_tensor_engine
+
+ from monarch._rust_bindings.hyperactor_extension.alloc import (  # @manual=//monarch/monarch_extension:monarch_extension
+     Alloc,
+     AllocConstraints,
+     AllocSpec,
+ )
+ from monarch._rust_bindings.monarch_hyperactor.mailbox import Mailbox
+ from monarch._rust_bindings.monarch_hyperactor.proc_mesh import ProcMesh as HyProcMesh
+ from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
+ from monarch.actor_mesh import _Actor, _ActorMeshRefImpl, Actor, ActorMeshRef
+
+ from monarch.code_sync import RemoteWorkspace, RsyncMeshClient
+ from monarch.common._device_utils import _local_device_count
+ from monarch.common.shape import MeshTrait
+ from monarch.rdma import RDMAManager
+
+ if has_tensor_engine():
+     from monarch.common.device_mesh import DeviceMesh
+     from monarch.mesh_controller import spawn_tensor_engine
+ else:
+     DeviceMesh = None
+     spawn_tensor_engine = None
+
+ T = TypeVar("T")
+ try:
+     from __manifest__ import fbmake  # noqa
+
+     IN_PAR = True
+ except ImportError:
+     IN_PAR = False
+
+
+ async def _allocate_nonblocking(alloc: Alloc) -> "ProcMesh":
+     return ProcMesh(await HyProcMesh.allocate_nonblocking(alloc))
+
+
+ def _allocate_blocking(alloc: Alloc) -> "ProcMesh":
+     return ProcMesh(HyProcMesh.allocate_blocking(alloc))
+
+
+ class ProcMesh(MeshTrait):
+     def __init__(
+         self,
+         hy_proc_mesh: HyProcMesh,
+         _mock_shape: Optional[Shape] = None,
+         _device_mesh: Optional[DeviceMesh] = None,
+     ) -> None:
+         self._proc_mesh = hy_proc_mesh
+         self._mock_shape: Optional[Shape] = _mock_shape
+         self._mailbox: Mailbox = self._proc_mesh.client
+         self._rdma_manager: Optional[RDMAManager] = None
+         self._rsync_mesh_client: Optional[RsyncMeshClient] = None
+         self._maybe_device_mesh: Optional[DeviceMesh] = _device_mesh
+         if _mock_shape is None:
+             self._rdma_manager = self._spawn_blocking("rdma_manager", RDMAManager)
+
+     @property
+     def _shape(self) -> Shape:
+         return self._proc_mesh.shape if self._mock_shape is None else self._mock_shape
+
+     @property
+     def _ndslice(self) -> Slice:
+         return self._shape.ndslice
+
+     @property
+     def _labels(self) -> List[str]:
+         return self._shape.labels
+
+     def _new_with_shape(self, shape: Shape) -> "ProcMesh":
+         device_mesh = (
+             None
+             if self._device_mesh is None
+             else self._device_mesh._new_with_shape(shape)
+         )
+         return ProcMesh(self._proc_mesh, _mock_shape=shape, _device_mesh=device_mesh)
+
+     def spawn(
+         self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+     ) -> Future[ActorMeshRef[T]]:
+         if self._mock_shape is not None:
+             raise NotImplementedError("NYI: spawn on slice of a proc mesh.")
+         return Future(
+             lambda: self._spawn_nonblocking(name, Class, *args, **kwargs),
+             lambda: self._spawn_blocking(name, Class, *args, **kwargs),
+         )
+
+     @classmethod
+     def from_alloc(self, alloc: Alloc) -> Future["ProcMesh"]:
+         return Future(
+             lambda: _allocate_nonblocking(alloc),
+             lambda: _allocate_blocking(alloc),
+         )
+
+     def _spawn_blocking(
+         self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+     ) -> T:
+         if not issubclass(Class, Actor):
+             raise ValueError(
+                 f"{Class} must subclass monarch.service.Actor to spawn it."
+             )
+
+         actor_mesh = self._proc_mesh.spawn_blocking(name, _Actor)
+         service = ActorMeshRef(
+             Class,
+             _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
+             self._mailbox,
+         )
+         # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+         # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+         service._create(args, kwargs)
+         return cast(T, service)
+
+     def __repr__(self) -> str:
+         return repr(self._proc_mesh)
+
+     def __str__(self) -> str:
+         return str(self._proc_mesh)
+
+     async def _spawn_nonblocking(
+         self, name: str, Class: Type[T], *args: Any, **kwargs: Any
+     ) -> T:
+         if not issubclass(Class, Actor):
+             raise ValueError(
+                 f"{Class} must subclass monarch.service.Actor to spawn it."
+             )
+
+         actor_mesh = await self._proc_mesh.spawn_nonblocking(name, _Actor)
+         service = ActorMeshRef(
+             Class,
+             _ActorMeshRefImpl.from_hyperactor_mesh(self._mailbox, actor_mesh),
+             self._mailbox,
+         )
+         # useful to have this separate, because eventually we can reconstitute ActorMeshRef objects across pickling by
+         # doing `ActorMeshRef(Class, actor_handle)` but not calling _create.
+         service._create(args, kwargs)
+         return cast(T, service)
+
+     @property
+     def _device_mesh(self) -> "DeviceMesh":
+         if spawn_tensor_engine is None:
+             raise RuntimeError(
+                 "DeviceMesh is not available because tensor_engine was not compiled (USE_TENSOR_ENGINE=0)"
+             )
+         if self._maybe_device_mesh is None:
+             if self._mock_shape is not None:
+                 raise NotImplementedError(
+                     "NYI: activating a proc mesh must first happen on the root proc_mesh until we fix spawning on submeshes."
+                 )
+             self._maybe_device_mesh = spawn_tensor_engine(self)
+         return self._maybe_device_mesh
+
+     # pyre-ignore
+     def activate(self) -> AbstractContextManager:
+         return self._device_mesh.activate()
+
+     def rank_tensor(self, dim: str | Sequence[str]) -> "torch.Tensor":
+         return self._device_mesh.rank(dim)
+
+     def rank_tensors(self) -> Dict[str, "torch.Tensor"]:
+         return self._device_mesh.ranks
+
+     async def sync_workspace(self) -> None:
+         if self._rsync_mesh_client is None:
+             # TODO(agallagher): We need some way to configure and pass this
+             # in -- right now we're assuming the `gpu` dimension, which isn't
+             # correct.
+             assert set(self._proc_mesh.shape.labels).issubset({"gpus", "hosts"})
+             # The workspace shape (i.e. only perform one rsync per host).
+             workspace_shape = self.slice(gpus=slice(0, 1, 1))._mock_shape
+             assert workspace_shape is not None
+             # TODO(agallagher): We should probably hide this behind something
+             # like a `Workspace` class and support abstracting/configuring
+             # different sync methods.
+             self._rsync_mesh_client = RsyncMeshClient.spawn_blocking(
+                 proc_mesh=self._proc_mesh,
+                 shape=workspace_shape,
+                 # TODO(agallagher): Is there a better way to infer/set the local
+                 # workspace dir, rather than use PWD?
+                 local_workspace=os.getcwd(),
+                 remote_workspace=RemoteWorkspace.FromEnvVar("WORKSPACE_DIR"),
+             )
+         await self._rsync_mesh_client.sync_workspace()
+
+
+ async def local_proc_mesh_nonblocking(
+     *, gpus: Optional[int] = None, hosts: int = 1
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     allocator = monarch.LocalAllocator()
+     alloc = await allocator.allocate(spec)
+     return await ProcMesh.from_alloc(alloc)
+
+
+ def local_proc_mesh_blocking(*, gpus: Optional[int] = None, hosts: int = 1) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     allocator = monarch.LocalAllocator()
+     alloc = allocator.allocate(spec).get()
+     return ProcMesh.from_alloc(alloc).get()
+
+
+ def local_proc_mesh(*, gpus: Optional[int] = None, hosts: int = 1) -> Future[ProcMesh]:
+     return Future(
+         lambda: local_proc_mesh_nonblocking(gpus=gpus, hosts=hosts),
+         lambda: local_proc_mesh_blocking(gpus=gpus, hosts=hosts),
+     )
+
+
+ _BOOTSTRAP_MAIN = "monarch.bootstrap_main"
+
+
+ def _get_bootstrap_args() -> tuple[str, Optional[list[str]], dict[str, str]]:
+     if IN_PAR:
+         cmd = sys.argv[0]
+         args = None
+         env = {
+             "PAR_MAIN_OVERRIDE": _BOOTSTRAP_MAIN,
+         }
+     else:
+         cmd = sys.executable
+         args = ["-m", _BOOTSTRAP_MAIN]
+         env = {}
+
+     return cmd, args, env
+
+
+ async def proc_mesh_nonblocking(
+     *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     env = env or {}
+     cmd, args, base_env = _get_bootstrap_args()
+     env.update(base_env)
+     env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
+     allocator = monarch.ProcessAllocator(cmd, args, env)
+     alloc = await allocator.allocate(spec)
+     return await ProcMesh.from_alloc(alloc)
+
+
+ def proc_mesh_blocking(
+     *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
+ ) -> ProcMesh:
+     if gpus is None:
+         gpus = _local_device_count()
+     spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+     env = env or {}
+     cmd, args, base_env = _get_bootstrap_args()
+     env.update(base_env)
+     env["HYPERACTOR_MANAGED_SUBPROCESS"] = "1"
+     allocator = monarch.ProcessAllocator(cmd, args, env)
+     alloc = allocator.allocate(spec).get()
+     return ProcMesh.from_alloc(alloc).get()
+
+
+ def proc_mesh(
+     *, gpus: Optional[int] = None, hosts: int = 1, env: Optional[dict[str, str]] = None
+ ) -> Future[ProcMesh]:
+     return Future(
+         lambda: proc_mesh_nonblocking(gpus=gpus, hosts=hosts, env=env),
+         lambda: proc_mesh_blocking(gpus=gpus, hosts=hosts, env=env),
+     )
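
Note: the helpers above come in nonblocking/blocking pairs wrapped in a `Future`, so callers can either `await` the result or block with `.get()`. The snippet below is a short usage sketch and not part of the wheel: the `Counter` actor, its `value` endpoint, and `gpus=2` are assumptions for illustration, and the `endpoint` decorator is assumed to come from `monarch.actor_mesh` (which also provides `Actor`, imported by this module).

from monarch.actor_mesh import Actor, endpoint
from monarch.proc_mesh import proc_mesh

class Counter(Actor):
    def __init__(self, start: int) -> None:
        self._value = start

    @endpoint
    async def value(self) -> int:
        return self._value

mesh = proc_mesh(gpus=2).get()                       # block on Future[ProcMesh]
counters = mesh.spawn("counter", Counter, 0).get()   # Future[ActorMeshRef[Counter]]
print(counters.value.call_one().get())               # ask one actor for its value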